From 0b08dee5d72e4b08075f5f565e9dd9137f86785e Mon Sep 17 00:00:00 2001 From: Karan Jariwala Date: Wed, 3 Apr 2019 14:28:08 -0700 Subject: [PATCH] [MXNet-1375][Fit API]Added RNN integration test for fit() API (#14547) * Added RNN integration test for fit() API * Addressed review comments: change in JenkinFile, tmp directory, ctx with condense if/else, renamed imports * CPU test doesn't require nvidiadocker container * Modified the structure by removing the redundant code --- ci/docker/runtime_functions.sh | 14 + tests/nightly/Jenkinsfile | 16 + tests/nightly/estimator/test_sentiment_rnn.py | 276 ++++++++++++++++++ 3 files changed, 306 insertions(+) create mode 100644 tests/nightly/estimator/test_sentiment_rnn.py diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh index e1da222ca298..301d45be119f 100755 --- a/ci/docker/runtime_functions.sh +++ b/ci/docker/runtime_functions.sh @@ -1350,6 +1350,20 @@ nightly_scala_demo_test_cpu() { bash bin/run_im.sh } +nightly_estimator_rnn_gpu() { + set -ex + cd /work/mxnet/tests/nightly/estimator + export PYTHONPATH=/work/mxnet/python/ + python test_sentiment_rnn.py --type gpu +} + +nightly_estimator_rnn_cpu() { + set -ex + cd /work/mxnet/tests/nightly/estimator + export PYTHONPATH=/work/mxnet/python/ + python test_sentiment_rnn.py --type cpu +} + # Deploy deploy_docs() { diff --git a/tests/nightly/Jenkinsfile b/tests/nightly/Jenkinsfile index 758c86476f75..a65da2d0b87e 100755 --- a/tests/nightly/Jenkinsfile +++ b/tests/nightly/Jenkinsfile @@ -136,6 +136,22 @@ core_logic: { utils.docker_run('ubuntu_nightly_cpu', 'nightly_test_javascript', false) } } + }, + 'estimator: RNN GPU': { + node(NODE_LINUX_GPU) { + ws('workspace/estimator-test-rnn-gpu') { + utils.unpack_and_init('gpu', mx_lib) + utils.docker_run('ubuntu_nightly_gpu', 'nightly_estimator_test_rnn_gpu', true) + } + } + }, + 'estimator: RNN CPU': { + node(NODE_LINUX_CPU) { + ws('workspace/estimator-test-rnn-cpu') { + utils.unpack_and_init('cpu', mx_lib) + utils.docker_run('ubuntu_nightly_cpu', 'nightly_estimator_test_rnn_cpu', false) + } + } } } } diff --git a/tests/nightly/estimator/test_sentiment_rnn.py b/tests/nightly/estimator/test_sentiment_rnn.py new file mode 100644 index 000000000000..7e42831786ce --- /dev/null +++ b/tests/nightly/estimator/test_sentiment_rnn.py @@ -0,0 +1,276 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Gluon Text Sentiment Classification Example using RNN/CNN +Example modified from below link: +https://github.com/d2l-ai/d2l-en/blob/master/chapter_natural-language-processing/sentiment-analysis-rnn.md +https://github.com/d2l-ai/d2l-en/blob/master/chapter_natural-language-processing/sentiment-analysis-cnn.md""" + +import argparse +import os +import tarfile +import random +import collections +import mxnet as mx +from mxnet import nd, gluon +from mxnet.contrib import text +from mxnet.gluon import nn, rnn +from mxnet.gluon.estimator import estimator + + +class TextCNN(nn.Block): + def __init__(self, vocab, embed_size, kernel_sizes, num_channels, + **kwargs): + super(TextCNN, self).__init__(**kwargs) + self.embedding = nn.Embedding(len(vocab), embed_size) + # The embedding layer does not participate in training + self.constant_embedding = nn.Embedding(len(vocab), embed_size) + self.dropout = nn.Dropout(0.5) + self.decoder = nn.Dense(2) + # The max-over-time pooling layer has no weight, so it can share an + # instance + self.pool = nn.GlobalMaxPool1D() + # Create multiple one-dimensional convolutional layers + self.convs = nn.Sequential() + for c, k in zip(num_channels, kernel_sizes): + self.convs.add(nn.Conv1D(c, k, activation='relu')) + + def forward(self, inputs): + # Concatenate the output of two embedding layers with shape of + # (batch size, number of words, word vector dimension) by word vector + embeddings = nd.concat( + self.embedding(inputs), self.constant_embedding(inputs), dim=2) + # According to the input format required by Conv1D, the word vector + # dimension, that is, the channel dimension of the one-dimensional + # convolutional layer, is transformed into the previous dimension + embeddings = embeddings.transpose((0, 2, 1)) + # For each one-dimensional convolutional layer, after max-over-time + # pooling, an NDArray with the shape of (batch size, channel size, 1) + # can be obtained. Use the flatten function to remove the last + # dimension and then concatenate on the channel dimension + encoding = nd.concat(*[nd.flatten( + self.pool(conv(embeddings))) for conv in self.convs], dim=1) + # After applying the dropout method, use a fully connected layer to + # obtain the output + outputs = self.decoder(self.dropout(encoding)) + return outputs + + +class BiRNN(nn.Block): + def __init__(self, vocab, embed_size, num_hiddens, num_layers, **kwargs): + super(BiRNN, self).__init__(**kwargs) + self.embedding = nn.Embedding(len(vocab), embed_size) + # Set Bidirectional to True to get a bidirectional recurrent neural + # network + self.encoder = rnn.LSTM(num_hiddens, num_layers=num_layers, + bidirectional=True, input_size=embed_size) + self.decoder = nn.Dense(2) + + def forward(self, inputs): + # The shape of inputs is (batch size, number of words). Because LSTM + # needs to use sequence as the first dimension, the input is + # transformed and the word feature is then extracted. The output shape + # is (number of words, batch size, word vector dimension). + embeddings = self.embedding(inputs.T) + # The shape of states is (number of words, batch size, 2 * number of + # hidden units). + states = self.encoder(embeddings) + # Concatenate the hidden states of the initial time step and final + # time step to use as the input of the fully connected layer. Its + # shape is (batch size, 4 * number of hidden units) + encoding = nd.concat(states[0], states[-1]) + outputs = self.decoder(encoding) + return outputs + + +def download_imdb(data_dir='/tmp/data'): + ''' + Download and extract the IMDB dataset + ''' + url = ('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz') + sha1 = '01ada507287d82875905620988597833ad4e0903' + if not os.path.exists(data_dir): + os.makedirs(data_dir) + file_path = os.path.join(data_dir, 'aclImdb_v1.tar.gz') + if not os.path.isfile(file_path): + file_path = gluon.utils.download(url, data_dir, sha1_hash=sha1) + with tarfile.open(file_path, 'r') as f: + f.extractall(data_dir) + + +def read_imdb(folder='train'): + ''' + Read the IMDB dataset + ''' + data = [] + for label in ['pos', 'neg']: + folder_name = os.path.join('/tmp/data/aclImdb/', folder, label) + for file in os.listdir(folder_name): + with open(os.path.join(folder_name, file), 'rb') as f: + review = f.read().decode('utf-8').replace('\n', '').lower() + data.append([review, 1 if label == 'pos' else 0]) + random.shuffle(data) + return data + + +def get_tokenized_imdb(data): + ''' + Tokenized the words + ''' + + def tokenizer(text): + return [tok.lower() for tok in text.split(' ')] + + return [tokenizer(review) for review, _ in data] + + +def get_vocab_imdb(data): + ''' + Get the indexed tokens + ''' + tokenized_data = get_tokenized_imdb(data) + counter = collections.Counter([tk for st in tokenized_data for tk in st]) + return text.vocab.Vocabulary(counter, min_freq=5) + + +def preprocess_imdb(data, vocab): + ''' + Make the length of each comment 500 by truncating or adding 0s + ''' + max_l = 500 + + def pad(x): + return x[:max_l] if len(x) > max_l else x + [0] * (max_l - len(x)) + + tokenized_data = get_tokenized_imdb(data) + features = nd.array([pad(vocab.to_indices(x)) for x in tokenized_data]) + labels = nd.array([score for _, score in data]) + return features, labels + + +def run(net, train_dataloader, test_dataloader, **kwargs): + ''' + Train a test sentiment model + ''' + num_epochs = kwargs['epochs'] + ctx = kwargs['ctx'] + batch_size = kwargs['batch_size'] + lr = kwargs['lr'] + + # Define trainer + trainer = mx.gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr}) + # Define loss and evaluation metrics + loss = gluon.loss.SoftmaxCrossEntropyLoss() + acc = mx.metric.Accuracy() + + # Define estimator + est = estimator.Estimator(net=net, loss=loss, metrics=acc, + trainers=trainer, context=ctx) + # Begin training + est.fit(train_data=train_dataloader, val_data=test_dataloader, + epochs=num_epochs, batch_size=batch_size) + return est + + +def test_estimator_cpu(**kwargs): + ''' + Test estimator by doing one pass over each model with synthetic data + ''' + models = ['TextCNN', 'BiRNN'] + ctx = kwargs['ctx'] + batch_size = kwargs['batch_size'] + embed_size = kwargs['embed_size'] + + train_data = mx.nd.random.randint(low=0, high=100, shape=(2 * batch_size, 500)) + train_label = mx.nd.random.randint(low=0, high=2, shape=(2 * batch_size,)) + val_data = mx.nd.random.randint(low=0, high=100, shape=(batch_size, 500)) + val_label = mx.nd.random.randint(low=0, high=2, shape=(batch_size,)) + + train_dataloader = gluon.data.DataLoader(dataset=gluon.data.ArrayDataset(train_data, train_label), + batch_size=batch_size, shuffle=True) + val_dataloader = gluon.data.DataLoader(dataset=gluon.data.ArrayDataset(val_data, val_label), + batch_size=batch_size) + vocab_list = mx.nd.zeros(shape=(100,)) + + # Get the model + for model in models: + if model == 'TextCNN': + kernel_sizes, nums_channels = [3, 4, 5], [100, 100, 100] + net = TextCNN(vocab_list, embed_size, kernel_sizes, nums_channels) + else: + num_hiddens, num_layers = 100, 2 + net = BiRNN(vocab_list, embed_size, num_hiddens, num_layers) + net.initialize(mx.init.Xavier(), ctx=ctx) + + run(net, train_dataloader, val_dataloader, **kwargs) + + +def test_estimator_gpu(**kwargs): + ''' + Test estimator by training Bidirectional RNN for 5 epochs on the IMDB dataset + and verify accuracy + ''' + ctx = kwargs['ctx'] + batch_size = kwargs['batch_size'] + num_epochs = kwargs['epochs'] + embed_size = kwargs['embed_size'] + + # data + download_imdb() + train_data, test_data = read_imdb('train'), read_imdb('test') + vocab = get_vocab_imdb(train_data) + + train_set = gluon.data.ArrayDataset(*preprocess_imdb(train_data, vocab)) + test_set = gluon.data.ArrayDataset(*preprocess_imdb(test_data, vocab)) + train_dataloader = gluon.data.DataLoader(train_set, batch_size, shuffle=True) + test_dataloader = gluon.data.DataLoader(test_set, batch_size) + + # Model + num_hiddens, num_layers = 100, 2 + net = BiRNN(vocab, embed_size, num_hiddens, num_layers) + net.initialize(mx.init.Xavier(), ctx=ctx) + + glove_embedding = text.embedding.create( + 'glove', pretrained_file_name='glove.6B.100d.txt', vocabulary=vocab) + + net.embedding.weight.set_data(glove_embedding.idx_to_vec) + net.embedding.collect_params().setattr('grad_req', 'null') + + est = run(net, train_dataloader, test_dataloader, **kwargs) + + assert est.train_stats['train_accuracy'][num_epochs - 1] > 0.70 + + +parser = argparse.ArgumentParser(description='test gluon estimator') +parser.add_argument('--type', type=str, default='cpu') +opt = parser.parse_args() +kwargs = { + 'batch_size': 64, + 'lr': 0.01, + 'embed_size': 100 +} + +if opt.type == 'cpu': + kwargs['ctx'] = mx.cpu() + kwargs['epochs'] = 1 + test_estimator_cpu(**kwargs) +elif opt.type == 'gpu': + kwargs['ctx'] = mx.gpu() + kwargs['epochs'] = 5 + test_estimator_gpu(**kwargs) +else: + raise RuntimeError("Unknown test type")