from transformers import GPT2Tokenizer


class FLMTokenizer(GPT2Tokenizer):
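    """GPT-2 byte-level BPE tokenizer with a custom pre-tokenization regex."""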
    model_input_names = ["input_ids", "attention_mask"]

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        unk_token="<|endoftext|>",
        bos_token="<|endoftext|>",
        eos_token="<|endoftext|>",
        pad_token=None,
        add_prefix_space=False,
        add_bos_token=False,
        **kwargs,
    ):
        super().__init__(
            vocab_file,
            merges_file,
            errors=errors,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            add_prefix_space=add_prefix_space,
            add_bos_token=add_bos_token,
            **kwargs,
        )
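        # Override the pre-tokenization regex inherited from GPT-2 with a
        # cl100k-style split: case-insensitive contraction suffixes, letter runs
        # (optionally preceded by one non-letter, non-digit character), digit
        # groups of at most three, punctuation runs that absorb trailing
        # newlines, newline runs, and residual whitespace. GPT2Tokenizer applies
        # self.pat via regex.findall, which accepts a raw pattern string.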
        self.pat = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"""
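

# A minimal usage sketch, not part of the tokenizer itself. The "vocab.json"
# and "merges.txt" paths are hypothetical stand-ins for the files an FLM
# checkpoint would ship; a published checkpoint would normally be loaded via
# AutoTokenizer.from_pretrained(..., trust_remote_code=True) instead.
if __name__ == "__main__":
    tokenizer = FLMTokenizer("vocab.json", "merges.txt")
    encoding = tokenizer("Hello world, it's 2024!")
    print(encoding["input_ids"])                    # BPE token ids
    print(tokenizer.decode(encoding["input_ids"]))  # round-trip back to text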