Adding Benchmark for TA ops #1801

Merged: 3 commits, Jun 23, 2022
56 changes: 56 additions & 0 deletions benchmark/benchmark_torcharrow_ops.py
@@ -0,0 +1,56 @@
import os
import sys

import torcharrow as ta
import torchtext.transforms as T
from benchmark.utils import Timer
from torcharrow import functional as ta_F
from torchtext._download_hooks import load_state_dict_from_url
from torchtext.datasets import SST2

sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), "../examples"))
from data_pipeline.roberta_dataframe import init_ta_gpt2bpe_encoder, init_ta_gpt2bpe_vocab
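
# init_ta_gpt2bpe_encoder / init_ta_gpt2bpe_vocab (defined in
# examples/data_pipeline/roberta_dataframe.py) build the torcharrow-native
# GPT2 BPE encoder and vocabulary consumed by ta_F.bpe_tokenize and
# ta_F.lookup_indices below.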


def run_torchtext_ops():
    # tokenizer converting text into tokens
    encoder_json_path = "https://download.pytorch.org/models/text/gpt2_bpe_encoder.json"
    vocab_bpe_path = "https://download.pytorch.org/models/text/gpt2_bpe_vocab.bpe"
    tokenizer = T.GPT2BPETokenizer(encoder_json_path, vocab_bpe_path)

    # vocabulary converting tokens to IDs
    vocab_path = "https://download.pytorch.org/models/text/roberta.vocab.pt"
    vocab = T.VocabTransform(load_state_dict_from_url(vocab_path))

    # dataset
    train_dp = SST2(split="train")
    text_list = list(train_dp.map(lambda x: x[0]))

    with Timer("Running torchtext's GPT2BPE tokenizer"):
        tokenized_text = tokenizer(text_list)

    with Timer("Running torchtext's vocab query"):
        _ = vocab(tokenized_text)


def run_torcharrow_ops():
    # tokenizer converting text into tokens
    tokenizer = init_ta_gpt2bpe_encoder()

    # vocabulary converting tokens to IDs
    vocab = init_ta_gpt2bpe_vocab()

    # dataset
    train_dp = SST2(split="train")
    text_list = list(train_dp.map(lambda x: x[0]))
    data_frame = ta.dataframe({"text": text_list})
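
    # Unlike the eager torchtext transforms above, these torcharrow functional
    # ops are expected to execute column-wise over the Arrow-backed dataframe
    # rather than looping over rows in Python.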

with Timer("Running torcharrow's GPT2BPE tokenizer"):
data_frame["tokens"] = ta_F.bpe_tokenize(tokenizer, data_frame["text"])

with Timer("Running torcharrow's vocab query"):
data_frame["token_ids"] = ta_F.lookup_indices(vocab, data_frame["tokens"])


if __name__ == "__main__":
    run_torchtext_ops()
    run_torcharrow_ops()
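
The Timer helper imported from benchmark.utils is not shown in this diff. A minimal sketch of a compatible context manager, assuming it simply prints the wall-clock time of its block (the repository's actual implementation may differ), could look like:

import time


class Timer:
    def __init__(self, text: str) -> None:
        self.text = text

    def __enter__(self) -> "Timer":
        # record the start time when the block is entered
        self.start = time.perf_counter()
        return self

    def __exit__(self, *exc) -> None:
        # print the elapsed wall-clock time when the block exits
        elapsed = time.perf_counter() - self.start
        print(f"{self.text}: {elapsed:.3f} s")

Assuming the script is run from the repository root (so that both the benchmark package and the examples helpers resolve on sys.path), it can be invoked with:

python -m benchmark.benchmark_torcharrow_ops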