Skip to content

Commit

Permalink
Add support for Split pretokenizer w/ behavior=removed & `invert=false`
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova authored Nov 16, 2024
1 parent 7b1ce3c commit 7c1bde9
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/tokenizers.js
Original file line number Diff line number Diff line change
Expand Up @@ -1518,6 +1518,8 @@ class SplitPreTokenizer extends PreTokenizer {

// Pick the splitting strategy from the pre-tokenizer config.
if (this.config.invert) {
// Inverted mode: return the matches themselves. match() yields null when
// nothing matches, hence the `|| []` fallback.
// NOTE(review): assumes this.pattern is a global RegExp so that match()
// returns every match rather than a single match object — confirm.
return text.match(this.pattern) || [];
} else if (this.config.behavior?.toLowerCase() === 'removed') {
// behavior "removed" (compared case-insensitively; `?.` tolerates a missing
// behavior): split on the pattern so the matched text is dropped, then
// filter out the empty strings split() produces around adjacent or
// leading/trailing matches.
// NOTE(review): String.prototype.split also emits captured-group text when
// the pattern contains capturing groups — confirm patterns here are
// non-capturing.
return text.split(this.pattern).filter(x => x);
} else {
// Any other (or absent) behavior is delegated to regexSplit, which is
// defined outside this view — presumably it keeps the delimiters; verify.
return regexSplit(text, this.pattern);
}
Expand Down
10 changes: 10 additions & 0 deletions tests/models/roberta/tokenization.js
Original file line number Diff line number Diff line change
Expand Up @@ -691,4 +691,14 @@ export const TEST_CONFIG = {
decoded: "<s> \tH\u00e4LLo!how \n Are yoU? </s>",
},
},

// Split tokenizer with behavior="Removed" and invert=false
// Fixture for a model whose tokenizer config exercises that Split
// pre-tokenizer combination.
"onnx-community/camembertv2-base": {
SIMPLE: {
text: BASE_TEST_STRINGS.SIMPLE,
// Expected token strings (no special tokens listed here).
tokens: ['How', 'are', 'you', 'doi', '##ng', '?'],
// Two more entries than `tokens`; presumably the leading 1 and trailing 2
// are the [CLS]/[SEP] ids seen in `decoded` — confirm against the vocab.
ids: [1, 14473, 9556, 10577, 6471, 9274, 38, 2],
decoded: "[CLS] How are you doing? [SEP]",
}
},
};

0 comments on commit 7c1bde9

Please sign in to comment.