diff --git a/src/tokenizers.js b/src/tokenizers.js index 5b4e0170c..48a26b636 100644 --- a/src/tokenizers.js +++ b/src/tokenizers.js @@ -1518,6 +1518,8 @@ class SplitPreTokenizer extends PreTokenizer { if (this.config.invert) { return text.match(this.pattern) || []; + } else if (this.config.behavior?.toLowerCase() === 'removed') { + return text.split(this.pattern).filter(x => x); } else { return regexSplit(text, this.pattern); } diff --git a/tests/models/roberta/tokenization.js b/tests/models/roberta/tokenization.js index 05030999b..458430878 100644 --- a/tests/models/roberta/tokenization.js +++ b/tests/models/roberta/tokenization.js @@ -691,4 +691,14 @@ export const TEST_CONFIG = { decoded: " \tH\u00e4LLo!how \n Are yoU? ", }, }, + + // Split tokenizer with behavior="Removed" and invert=false + "onnx-community/camembertv2-base": { + SIMPLE: { + text: BASE_TEST_STRINGS.SIMPLE, + tokens: ['How', 'are', 'you', 'doi', '##ng', '?'], + ids: [1, 14473, 9556, 10577, 6471, 9274, 38, 2], + decoded: "[CLS] How are you doing? [SEP]", + } + }, };