danielhanchen committed
Commit 8fbdec9 · verified · 1 Parent(s): e55d904

Add files using upload-large-folder tool

Files changed (50)
  1. chat_template.jinja +96 -0
  2. config.json +99 -0
  3. configuration_deepseek.py +212 -0
  4. generation_config.json +5 -0
  5. model-00004-of-00527.safetensors +3 -0
  6. model-00011-of-00527.safetensors +3 -0
  7. model-00013-of-00527.safetensors +3 -0
  8. model-00017-of-00527.safetensors +3 -0
  9. model-00022-of-00527.safetensors +3 -0
  10. model-00034-of-00527.safetensors +3 -0
  11. model-00035-of-00527.safetensors +3 -0
  12. model-00037-of-00527.safetensors +3 -0
  13. model-00056-of-00527.safetensors +3 -0
  14. model-00299-of-00527.safetensors +3 -0
  15. model-00316-of-00527.safetensors +3 -0
  16. model-00325-of-00527.safetensors +3 -0
  17. model-00401-of-00527.safetensors +3 -0
  18. model-00415-of-00527.safetensors +3 -0
  19. model-00418-of-00527.safetensors +3 -0
  20. model-00424-of-00527.safetensors +3 -0
  21. model-00430-of-00527.safetensors +3 -0
  22. model-00431-of-00527.safetensors +3 -0
  23. model-00433-of-00527.safetensors +3 -0
  24. model-00434-of-00527.safetensors +3 -0
  25. model-00435-of-00527.safetensors +3 -0
  26. model-00447-of-00527.safetensors +3 -0
  27. model-00453-of-00527.safetensors +3 -0
  28. model-00454-of-00527.safetensors +3 -0
  29. model-00456-of-00527.safetensors +3 -0
  30. model-00463-of-00527.safetensors +3 -0
  31. model-00468-of-00527.safetensors +3 -0
  32. model-00471-of-00527.safetensors +3 -0
  33. model-00478-of-00527.safetensors +3 -0
  34. model-00482-of-00527.safetensors +3 -0
  35. model-00483-of-00527.safetensors +3 -0
  36. model-00484-of-00527.safetensors +3 -0
  37. model-00487-of-00527.safetensors +3 -0
  38. model-00491-of-00527.safetensors +3 -0
  39. model-00497-of-00527.safetensors +3 -0
  40. model-00498-of-00527.safetensors +3 -0
  41. model-00499-of-00527.safetensors +3 -0
  42. model-00518-of-00527.safetensors +3 -0
  43. model-00519-of-00527.safetensors +3 -0
  44. model-00520-of-00527.safetensors +3 -0
  45. model-00521-of-00527.safetensors +3 -0
  46. model-00522-of-00527.safetensors +3 -0
  47. model-00527-of-00527.safetensors +3 -0
  48. special_tokens_map.json +40 -0
  49. tokenization_kimi.py +349 -0
  50. tokenizer_config.json +180 -0
chat_template.jinja ADDED
@@ -0,0 +1,96 @@
+ {%- macro render_content(msg) -%}
+ {%- set c = msg.get('content') -%}
+ {%- if c is string -%}
+ {{ c }}
+ {%- elif c is not none -%}
+ {% for content in c -%}
+ {% if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
+ <|media_start|>image<|media_content|><|media_pad|><|media_end|>
+ {% else -%}
+ {{ content['text'] }}
+ {%- endif -%}
+ {%- endfor -%}
+ {%- endif -%}
+ {%- endmacro -%}
+
+ {% macro set_roles(message) -%}
+ {%- set role_name = message.get('name') or message['role'] -%}
+ {%- if message['role'] == 'user' -%}
+ <|im_user|>{{role_name}}<|im_middle|>
+ {%- elif message['role'] == 'assistant' -%}
+ <|im_assistant|>{{role_name}}<|im_middle|>
+ {%- else -%}
+ <|im_system|>{{role_name}}<|im_middle|>
+ {%- endif -%}
+ {%- endmacro -%}
+
+
+ {%- macro render_toolcalls(message) -%}
+ <|tool_calls_section_begin|>
+ {%- for tool_call in message['tool_calls'] -%}
+ {%- set formatted_id = tool_call['id'] -%}
+ <|tool_call_begin|>{{ formatted_id }}<|tool_call_argument_begin|>{% if tool_call['function']['arguments'] is string %}{{ tool_call['function']['arguments'] }}{% else %}{{ tool_call['function']['arguments'] | tojson }}{% endif %}<|tool_call_end|>
+ {%- endfor -%}
+ <|tool_calls_section_end|>
+ {%- endmacro -%}
+
+
+ {# Find the last non-tool-call assistant message #}
+ {%- set ns = namespace(last_non_tool_call_assistant_msg=-1) -%}
+ {%- for idx in range(messages|length-1, -1, -1) -%}
+ {%- if messages[idx]['role'] == 'assistant' and not messages[idx].get('tool_calls') -%}
+ {%- set ns.last_non_tool_call_assistant_msg = idx -%}
+ {%- break -%}
+ {%- endif -%}
+ {%- endfor -%}
+
+ {# Split all messages into history & suffix; reasoning_content in the suffix should be preserved. #}
+ {%- set hist_msgs = messages[:ns.last_non_tool_call_assistant_msg+1] -%}
+ {%- set suffix_msgs = messages[ns.last_non_tool_call_assistant_msg+1:] -%}
+
+ {%- if tools -%}
+ <|im_system|>tool_declare<|im_middle|>{{ tools | tojson(separators=(',', ':')) }}<|im_end|>
+ {%- endif -%}
+
+ {%- for message in hist_msgs -%}
+ {%- if loop.first and messages[0]['role'] != 'system' -%}
+ <|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>
+ {%- endif -%}
+ {{set_roles(message)}}
+ {%- if message['role'] == 'assistant' -%}
+ <think></think>{{render_content(message)}}
+ {%- if message.get('tool_calls') -%}
+ {{render_toolcalls(message)}}
+ {%- endif -%}
+ {%- elif message['role'] == 'tool' -%}
+ {%- set tool_call_id = message.tool_call_id -%}
+ ## Return of {{ tool_call_id }}
+ {{render_content(message)}}
+ {%- elif message['content'] is not none -%}
+ {{render_content(message)}}
+ {%- endif -%}
+ <|im_end|>
+ {%- endfor -%}
+
+ {%- for message in suffix_msgs -%}
+ {{set_roles(message)}}
+ {%- if message['role'] == 'assistant' -%}
+ {%- set rc = message.get('reasoning_content', '') -%}
+ <think>{{rc}}</think>{{render_content(message)}}
+ {%- if message.get('tool_calls') -%}
+ {{render_toolcalls(message)}}
+ {%- endif -%}
+ {%- elif message['role'] == 'tool' -%}
+ {%- set tool_call_id = message.tool_call_id -%}
+ ## Return of {{ tool_call_id }}
+ {{render_content(message)}}
+ {%- elif message['content'] is not none -%}
+ {{render_content(message)}}
+ {%- endif -%}
+ <|im_end|>
+ {%- endfor -%}
+
+
+ {%- if add_generation_prompt -%}
+ <|im_assistant|>assistant<|im_middle|>
+ {%- endif -%}
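The template above is what `apply_chat_template` renders. A minimal sketch of a single user turn; the local path is a placeholder, not the published model id:

```python
# Minimal sketch; "./kimi-checkpoint" is a placeholder for this repo's files.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./kimi-checkpoint", trust_remote_code=True)

messages = [{"role": "user", "content": "What is 2 + 2?"}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
# Expected shape (the template trims surrounding whitespace):
# <|im_system|>system<|im_middle|>You are Kimi, an AI assistant created by Moonshot AI.<|im_end|>
# <|im_user|>user<|im_middle|>What is 2 + 2?<|im_end|><|im_assistant|>assistant<|im_middle|>
print(prompt)
```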
config.json ADDED
@@ -0,0 +1,99 @@
+ {
+   "_attn_implementation_autoset": false,
+   "architectures": [
+     "DeepseekV3ForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "configuration_deepseek.DeepseekV3Config",
+     "AutoModel": "modeling_deepseek.DeepseekV3Model",
+     "AutoModelForCausalLM": "modeling_deepseek.DeepseekV3ForCausalLM"
+   },
+   "aux_loss_alpha": 0.001,
+   "bos_token_id": 163584,
+   "dtype": "bfloat16",
+   "eos_token_id": 163586,
+   "ep_size": 1,
+   "first_k_dense_replace": 1,
+   "hidden_act": "silu",
+   "hidden_size": 7168,
+   "initializer_range": 0.02,
+   "intermediate_size": 18432,
+   "kv_lora_rank": 512,
+   "max_position_embeddings": 262144,
+   "model_type": "deepseek_v3",
+   "moe_intermediate_size": 2048,
+   "moe_layer_freq": 1,
+   "n_group": 1,
+   "n_routed_experts": 384,
+   "n_shared_experts": 1,
+   "norm_topk_prob": true,
+   "num_attention_heads": 64,
+   "num_experts_per_tok": 8,
+   "num_hidden_layers": 61,
+   "num_key_value_heads": 64,
+   "num_nextn_predict_layers": 0,
+   "pad_token_id": 163839,
+   "pretraining_tp": 1,
+   "q_lora_rank": 1536,
+   "qk_nope_head_dim": 128,
+   "qk_rope_head_dim": 64,
+   "quantization_config": {
+     "config_groups": {
+       "group_0": {
+         "format": null,
+         "input_activations": null,
+         "output_activations": null,
+         "targets": [
+           "Linear"
+         ],
+         "weights": {
+           "actorder": null,
+           "block_structure": null,
+           "dynamic": false,
+           "group_size": 32,
+           "num_bits": 4,
+           "observer": "minmax",
+           "observer_kwargs": {},
+           "strategy": "group",
+           "symmetric": true,
+           "type": "int"
+         }
+       }
+     },
+     "format": "pack-quantized",
+     "global_compression_ratio": null,
+     "ignore": [
+       "lm_head",
+       "re:.*self_attn.*",
+       "re:.*shared_experts.*",
+       "re:.*mlp\\.(gate|up|gate_up|down)_proj.*"
+     ],
+     "kv_cache_scheme": null,
+     "quant_method": "compressed-tensors",
+     "quantization_status": "compressed",
+     "sparsity_config": {}
+   },
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": {
+     "beta_fast": 1.0,
+     "beta_slow": 1.0,
+     "factor": 64.0,
+     "mscale": 1.0,
+     "mscale_all_dim": 1.0,
+     "original_max_position_embeddings": 4096,
+     "type": "yarn"
+   },
+   "rope_theta": 50000.0,
+   "routed_scaling_factor": 2.827,
+   "scoring_func": "sigmoid",
+   "seq_aux": true,
+   "tie_word_embeddings": false,
+   "topk_group": 1,
+   "topk_method": "noaux_tc",
+   "transformers_version": "4.57.1",
+   "use_cache": true,
+   "v_head_dim": 128,
+   "vocab_size": 163840
+ }
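A detail worth flagging in this config: the YaRN `rope_scaling` block extends the original 4096-token context by a factor of 64, which matches `max_position_embeddings` exactly (4096 × 64 = 262144). A minimal check, assuming the checkpoint sits at a placeholder local path:

```python
# Minimal sketch; "./kimi-checkpoint" is a placeholder path.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("./kimi-checkpoint", trust_remote_code=True)
rs = config.rope_scaling
# YaRN: scaled context = factor * original context length
assert int(rs["factor"] * rs["original_max_position_embeddings"]) == config.max_position_embeddings  # 64 * 4096 == 262144
```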
configuration_deepseek.py ADDED
@@ -0,0 +1,212 @@
+ # Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py
+
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
+
+ DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
+
+ class DeepseekV3Config(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+     defaults will yield a configuration similar to that of DeepSeek-V3.
+
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+
+
+     Args:
+         vocab_size (`int`, *optional*, defaults to 129280):
+             Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`DeepseekV3Model`].
+         hidden_size (`int`, *optional*, defaults to 4096):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 11008):
+             Dimension of the MLP representations.
+         moe_intermediate_size (`int`, *optional*, defaults to 1407):
+             Dimension of the MoE representations.
+         num_hidden_layers (`int`, *optional*, defaults to 32):
+             Number of hidden layers in the Transformer decoder.
+         num_nextn_predict_layers (`int`, *optional*, defaults to 1):
+             Number of next-n predict layers in the DeepSeekV3 model.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         n_shared_experts (`int`, *optional*, defaults to None):
+             Number of shared experts; None means a dense model.
+         n_routed_experts (`int`, *optional*, defaults to None):
+             Number of routed experts; None means a dense model.
+         routed_scaling_factor (`float`, *optional*, defaults to 1.0):
+             Scaling factor for routed experts.
+         topk_method (`str`, *optional*, defaults to `greedy`):
+             Top-k method used in the routed gate.
+         n_group (`int`, *optional*, defaults to None):
+             Number of groups for routed experts.
+         topk_group (`int`, *optional*, defaults to None):
+             Number of selected groups for each token (ensuring the selected experts for each token are only within `topk_group` groups).
+         num_experts_per_tok (`int`, *optional*, defaults to None):
+             Number of selected experts; None means a dense model.
+         moe_layer_freq (`int`, *optional*, defaults to 1):
+             The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
+         first_k_dense_replace (`int`, *optional*, defaults to 0):
+             Number of dense layers at the start of the model (embed -> k dense layers -> moe -> moe -> ... -> lm_head).
+         norm_topk_prob (`bool`, *optional*, defaults to False):
+             Whether to normalize the weights of the routed experts.
+         scoring_func (`str`, *optional*, defaults to 'softmax'):
+             Method of computing expert weights.
+         aux_loss_alpha (`float`, *optional*, defaults to 0.001):
+             Auxiliary loss weight coefficient.
+         seq_aux (`bool`, *optional*, defaults to True):
+             Whether to compute the auxiliary loss for each individual sample.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
+             `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, it will default to
+             `num_attention_heads`.
+         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         pad_token_id (`int`, *optional*):
+             Padding token id.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             Beginning of stream token id.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             End of stream token id.
+         pretraining_tp (`int`, *optional*, defaults to 1):
+             Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+             document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+             necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+             issue](https://github.com/pytorch/pytorch/issues/76232).
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie the word embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+             `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+             `max_position_embeddings` to the expected new maximum.
+         attention_bias (`bool`, *optional*, defaults to `False`):
+             Whether to use a bias in the query, key, value and output projection layers during self-attention.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the attention probabilities.
+
+     ```python
+     >>> from transformers import DeepseekV3Model, DeepseekV3Config
+
+     >>> # Initializing a Deepseek-V3 style configuration
+     >>> configuration = DeepseekV3Config()
+
+     >>> # Initializing a model from that configuration
+     >>> model = DeepseekV3Model(configuration)
+
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+
+     model_type = "deepseek_v3"
+     keys_to_ignore_at_inference = ["past_key_values"]
+
+     def __init__(
+         self,
+         vocab_size=129280,
+         hidden_size=7168,
+         intermediate_size=18432,
+         moe_intermediate_size=2048,
+         num_hidden_layers=61,
+         num_nextn_predict_layers=1,
+         num_attention_heads=128,
+         num_key_value_heads=128,
+         n_shared_experts=1,
+         n_routed_experts=256,
+         ep_size=1,
+         routed_scaling_factor=2.5,
+         kv_lora_rank=512,
+         q_lora_rank=1536,
+         qk_rope_head_dim=64,
+         v_head_dim=128,
+         qk_nope_head_dim=128,
+         topk_method='noaux_tc',
+         n_group=8,
+         topk_group=4,
+         num_experts_per_tok=8,
+         moe_layer_freq=1,
+         first_k_dense_replace=3,
+         norm_topk_prob=True,
+         scoring_func='sigmoid',
+         aux_loss_alpha=0.001,
+         seq_aux=True,
+         hidden_act="silu",
+         max_position_embeddings=4096,
+         initializer_range=0.02,
+         rms_norm_eps=1e-6,
+         use_cache=True,
+         pad_token_id=None,
+         bos_token_id=0,
+         eos_token_id=1,
+         pretraining_tp=1,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         attention_bias=False,
+         attention_dropout=0.0,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.max_position_embeddings = max_position_embeddings
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.moe_intermediate_size = moe_intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_nextn_predict_layers = num_nextn_predict_layers
+         self.num_attention_heads = num_attention_heads
+         self.n_shared_experts = n_shared_experts
+         self.n_routed_experts = n_routed_experts
+         self.ep_size = ep_size
+         self.routed_scaling_factor = routed_scaling_factor
+         self.kv_lora_rank = kv_lora_rank
+         self.q_lora_rank = q_lora_rank
+         self.qk_rope_head_dim = qk_rope_head_dim
+         self.v_head_dim = v_head_dim
+         self.qk_nope_head_dim = qk_nope_head_dim
+         self.topk_method = topk_method
+         self.n_group = n_group
+         self.topk_group = topk_group
+         self.num_experts_per_tok = num_experts_per_tok
+         self.moe_layer_freq = moe_layer_freq
+         self.first_k_dense_replace = first_k_dense_replace
+         self.norm_topk_prob = norm_topk_prob
+         self.scoring_func = scoring_func
+         self.aux_loss_alpha = aux_loss_alpha
+         self.seq_aux = seq_aux
+         # for backward compatibility
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+
+         self.num_key_value_heads = num_key_value_heads
+         self.hidden_act = hidden_act
+         self.initializer_range = initializer_range
+         self.rms_norm_eps = rms_norm_eps
+         self.pretraining_tp = pretraining_tp
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.attention_bias = attention_bias
+         self.attention_dropout = attention_dropout
+
+         super().__init__(
+             pad_token_id=pad_token_id,
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
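For reference, this checkpoint's config.json (above) amounts to constructing this class with the values it overrides; a minimal sketch using only fields taken from that file:

```python
# Minimal sketch: reproduce the checkpoint's architecture hyperparameters
# (values copied from config.json above).
from configuration_deepseek import DeepseekV3Config

config = DeepseekV3Config(
    vocab_size=163840,
    num_attention_heads=64,
    num_key_value_heads=64,
    n_routed_experts=384,
    n_group=1,
    topk_group=1,
    first_k_dense_replace=1,
    num_nextn_predict_layers=0,
    routed_scaling_factor=2.827,
    max_position_embeddings=262144,
    rope_theta=50000.0,
)
print(config.model_type)  # "deepseek_v3"
```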
generation_config.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token_id": 163586,
+   "max_length": 262144,
+   "transformers_version": "4.57.1"
+ }
model-00004-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2a2ccbb19c35396788185385012a1a79e57edb81166ed42401cbaede98832b1
+ size 4994959584
model-00011-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9199c784a8263238f7e1552f6c8ddc48113cb344bc2958b571eb3eb42d3b24f
+ size 4994959064
model-00013-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:72adee2fb15d766998bea53f5865b1bf194eceb7e215ee3e9f01f31f1f07b3cc
+ size 4994959584
model-00017-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:97d4589a8ae3a687b6eee383438833fc5b906ba334b68da0cb39a4d2e4d515f6
+ size 4994959592
model-00022-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fa57599eb57c58a077b155fb7c48f13f8530f6f8641529a696b84e017b617c96
+ size 4994959584
model-00034-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:26e3233c646fe4818ef9b0af6528e17b3676eec26c8d9b2eaf6c506d4005caa3
+ size 4973857008
model-00035-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:769b2640f536857a97ab5f88378697e374a6579c9d28a58351b33ad83e14ad5b
+ size 4994959592
model-00037-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6a13d0cba926ca6ecd797c1c72750e6d56061a4ae0c931bdd84afa3b15fecffe
+ size 4973856480
model-00056-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:22268c3f70f7c11d3f2773b0d7c41119cf4175f5c4a3fbaaae6467a7aa564d01
+ size 4994959336
model-00299-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:04da0f9f667c0fdab07172c21ba829a4b180f42e8e0e15739800cde268efbf89
+ size 4998136688
model-00316-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:59c02bf2103a9a1e7449f5a4fdf4b1b37203aae533e069c8fc3015a6feb895f3
+ size 4998136960
model-00325-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e222ccf03a4c35b3f1ef6b4e1e7a66eaf2d45d5e58a8a131bcdaffef8cdb91ac
+ size 4998136824
model-00401-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:92a585b6f12b09c74e4e61fbd6e5792bd23279ce768969bfff512581715cde21
+ size 4994960120
model-00415-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b883b30c3277edaa13ddd57ab2b87ecb292594da45ae40f91ef52c63f4c1e14b
+ size 4973857384
model-00418-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:75598b1a2768da67a56d8fdd97aa297cb8203980ab8e3e302519667a0577bcea
+ size 4994960120
model-00424-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5817b674961c123b05261ae415bf96b57f5e3b9a24d61d579d5cfe2e6877884f
+ size 4973857512
model-00430-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5c2c0bf7c02a9d0acc8ccb7b4d5989afb228953c7e7a3da1167ff8f773baf5c8
+ size 4998136840
model-00431-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4a71a4cedb1ab7eb7edd6e33b84c5cb35d828f371879197825212bf7870b7e2e
+ size 4973857008
model-00433-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:4181c86ff9950975e398d538d0c9c6636a49e4ab0c54eb1879d918fcf9480f46
+ size 4973857536
model-00434-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c5fd465d5f788af68f9d7ea26da9e2e1c3254e5c99b2cd16a57c09996c37f42d
+ size 4994960120
model-00435-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:24de4225c7c30e6310b575fcd9abb15a3edbee4e65e3015e51c5f0c395afc2fb
+ size 4973857536
model-00447-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:41d5979f434c155a425b90c72b3ffdf1281206ac7d127b85e76a45f394a51dc4
+ size 4997875224
model-00453-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1410cc38f41af4f76528cfb0894e769ac2417c8078275fd4a9c99531a66a00a1
+ size 4973857536
model-00454-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cb4b0959f7da1232a3974f4bc532559fc0612ba77be3fcfc2a247a069258a8f8
+ size 4994960120
model-00456-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e71b5318ea1067da5e6d63202c1a2202e73d2e29b7c1bdcad5c8bed02ce0f69b
+ size 4998136968
model-00463-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8dbbb79782256ff6a87725f2192f1b1a10cb44ca917421204351dd2c001dad26
+ size 4994960120
model-00468-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:84ba42b4ebaa61263b33b4884b78e95940596da12e4c008653f7d14c2cb2c698
+ size 4994960120
model-00471-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:14f8a2fac8b4128330e754bb4fd7d0ecd76bf06771f8c8cbd97c9bef3fc5499a
+ size 4973857536
model-00478-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d358e0f545d8c1450443cbc8a0a4098bf419de5fcdb3a459941d504439248ec
+ size 4973857536
model-00482-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:911d02c80774f402df76f6e9abc7bebbfa558ac50ede2891934e762b8471befd
+ size 4968384168
model-00483-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:45c11949d4f1ddaa72d94cfc371b1048c04a7f923f8f5decda43bab458f4f8bc
+ size 4995352144
model-00484-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5098ecc16ce1c552ed0c3cd3466e27e5af9b7a4856a8684ec832b0224722d596
+ size 4973857008
model-00487-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a97b3b6b247af852a78f69da49fe9d58a34bcb8d23c8b49d983c682bfbb94122
+ size 4994960120
model-00491-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf50ec699cd37e83de75572e2053781b2c8a1af697c372da3e7dd92e1053e106
+ size 4998136984
model-00497-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b4961b8f150a0dd5ce6ffc95ce8920108994a0cefd0a1177c840c8ded2562a0b
+ size 4973857536
model-00498-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a19908ac72121b66c3a8004aa292145f325262c658592ed6442cb6f4c69bf01e
+ size 4994960120
model-00499-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dea787809ae7e4a1e1ba191afbfedbf7785b21e29a717d159ee02a3096c0172e
+ size 4973857536
model-00518-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:79bbcd2fc296bfb1de9031b3ef91df85817ef5e359e992694daf0f3360402ec0
+ size 4988015168
model-00519-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c2cf7316ab4e2a321e0d17df91efd9d71b67c3999ec76709c45c4adb11a6af8b
+ size 4994959584
model-00520-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fbcc4f14f9c76c8e002ca82fd603685170d227c07ec34b595185dab015661b93
+ size 4973857376
model-00521-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b1404c9149374fa03a62861f7b76cde5e39d7e9510c9990779637fa620522646
+ size 4994960120
model-00522-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:294685791ba67962720d0e03bf3ade07fc59568e0e98273329bb4138467d3de5
+ size 4973857536
model-00527-of-00527.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cf8d385a433c11531088dbf7f22ac16b71fc2366659d0a4d544d2a9ddb758bef
+ size 2348810368
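Each `.safetensors` entry above is a Git LFS pointer (a version line, an `oid sha256:` line, and a `size` line), not the weights themselves. A minimal sketch for verifying a downloaded shard against its pointer; both paths are placeholders:

```python
# Minimal sketch; pointer_path and blob_path are placeholder paths.
import hashlib
import pathlib

def check_shard(pointer_path: str, blob_path: str) -> bool:
    # Parse the three "key value" fields of the LFS pointer file.
    fields = dict(
        line.split(" ", 1)
        for line in pathlib.Path(pointer_path).read_text().splitlines()
        if line.strip()
    )
    expected_oid = fields["oid"].strip().removeprefix("sha256:")
    expected_size = int(fields["size"])

    blob = pathlib.Path(blob_path)
    if blob.stat().st_size != expected_size:
        return False
    h = hashlib.sha256()
    with blob.open("rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == expected_oid
```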
special_tokens_map.json ADDED
@@ -0,0 +1,40 @@
+ {
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|start_header_id|>",
+     "<|end_header_id|>",
+     "[EOT]",
+     "<|im_system|>",
+     "<|im_middle|>"
+   ],
+   "bos_token": {
+     "content": "[BOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "[EOS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenization_kimi.py ADDED
@@ -0,0 +1,349 @@
+ import os
+ import tiktoken
+
+ from logging import getLogger
+ from pathlib import Path
+ from typing import (
+     cast,
+     Any,
+     Tuple,
+     Dict,
+     Iterator,
+     List,
+     Union,
+     Optional,
+ )
+ from shutil import copyfile
+ from tiktoken.load import load_tiktoken_bpe
+ from tokenizers import AddedToken, pre_tokenizers, Regex
+ from transformers.tokenization_utils import PreTrainedTokenizer
+ from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
+
+
+ logger = getLogger(__name__)
+ VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
+
+
+ class TikTokenTokenizer(PreTrainedTokenizer):
+     """
+     Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.
+
+     This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+     this superclass for more information regarding those methods.
+
+     Args:
+         vocab_file (`str`):
+             The path to the Tiktoken model file.
+         bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>"`):
+             The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+         eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
+             The end of sequence token.
+         unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
+             The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+             token instead. The second-to-last item in special_tokens.
+         pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
+             The token used for padding, for example when batching sequences of different lengths.
+         additional_special_tokens (list of `str`, *optional*):
+             A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
+             skipped when decoding if `skip_special_tokens` is set to `True`.
+     """
+
+     vocab_files_names = VOCAB_FILES_NAMES
+
+     model_input_names = ["input_ids", "attention_mask"]
+
+     special_tokens: Dict[str, int]
+
+     num_reserved_special_tokens = 256
+
+     pat_str = "|".join(
+         [
+             r"""[\p{Han}]+""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
+             r"""\p{N}{1,3}""",
+             r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
+             r"""\s*[\r\n]+""",
+             r"""\s+(?!\S)""",
+             r"""\s+""",
+         ]
+     )
+
+     def __init__(
+         self,
+         vocab_file,
+         bos_token: Union[str, AddedToken] = "[BOS]",
+         eos_token: Union[str, AddedToken] = "[EOS]",
+         unk_token: Union[str, AddedToken, None] = None,
+         pad_token: Union[str, AddedToken, None] = None,
+         additional_special_tokens: Optional[List[str]] = None,
+         added_tokens_decoder: Optional[dict] = None,
+         **kwargs,
+     ):
+         assert os.path.isfile(vocab_file), vocab_file
+
+         if additional_special_tokens is None:
+             additional_special_tokens = [
+                 "<|im_end|>",
+                 "<|im_user|>",
+                 "<|im_assistant|>",
+                 "<|start_header_id|>",
+                 "<|end_header_id|>",
+                 "[EOT]",
+                 "<|im_system|>",
+                 "<|im_middle|>",
+             ]
+
+         special_tokens_mapping = {
+             i: added_tokens_decoder[i].content for i in added_tokens_decoder
+         }
+
+         self.vocab_file = vocab_file
+         mergeable_ranks = load_tiktoken_bpe(vocab_file)
+         num_base_tokens = len(mergeable_ranks)
+         self.special_tokens = {
+             special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
+             for i in range(
+                 num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
+             )
+         }
+
+         self.model = tiktoken.Encoding(
+             name=Path(vocab_file).name,
+             pat_str=self.pat_str,
+             mergeable_ranks=mergeable_ranks,
+             special_tokens=self.special_tokens,
+         )
+         logger.info(f"Reloaded tiktoken model from {vocab_file}")
+
+         self.n_words: int = self.model.n_vocab
+         # BOS / EOS token IDs
+         self.bos_id: int = self.special_tokens[str(bos_token)]
+         self.eos_id: int = self.special_tokens[str(eos_token)]
+         logger.info(
+             f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+         )
+
+         self.pad_id: int = self.special_tokens[str(pad_token)]
+         self.unk_id: int = self.special_tokens[str(unk_token)]
+
+         self.byte_encoder = bytes_to_unicode()
+         self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+
+         self.decoder = {}
+         for i in range(self.n_words):
+             # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
+             decoding = ''.join([
+                 self.byte_encoder[ord(char)] for char in
+                 self.model.decode_single_token_bytes(i).decode('latin-1')
+             ])
+             self.decoder[i] = decoding
+
+         self.encoder = {}
+         for i in range(self.n_words):
+             if i in self.decoder:
+                 self.encoder[self.decoder[i]] = i
+
+         super().__init__(
+             bos_token=bos_token,
+             eos_token=eos_token,
+             unk_token=unk_token,
+             pad_token=pad_token,
+             additional_special_tokens=additional_special_tokens,
+             added_tokens_decoder=added_tokens_decoder,
+             **kwargs,
+         )
+         self.all_special_ids_set = set(self.all_special_ids)
+
+     def encode(
+         self,
+         text: str,
+         allow_special_tokens: bool = True,
+         **kwargs
+     ) -> List[int]:
+         """
+         Encodes a string into a list of token IDs.
+
+         Args:
+             text (str): The input string to be encoded.
+
+         Returns:
+             list[int]: A list of token IDs.
+         """
+         # If there are other args, we should call super().encode because there is a lot of code
+         # to handle those args. super().encode ultimately calls _tokenize and _convert_token_to_id.
+         # NOTE: our encode method is not compatible with the super().encode method,
+         # e.g. split_special_tokens' default is True in our encode method.
+         if len(kwargs) > 0:
+             logger.warning(f"Calling super().encode with {kwargs}")
+             return super().encode(text, **kwargs)
+
+         assert type(text) is str
+
+         # The tiktoken tokenizer can handle <=400k chars without
+         # pyo3_runtime.PanicException.
+         TIKTOKEN_MAX_ENCODE_CHARS = 400_000
+
+         # https://github.com/openai/tiktoken/issues/195
+         # Here we iterate over subsequences and split if we exceed the limit
+         # of max consecutive non-whitespace or whitespace characters.
+         MAX_NO_WHITESPACES_CHARS = 25_000
+
+         texts = self.pre_tokenizer_process(text)
+
+         all_substrs = []
+         for text in texts:
+             substrs = (
+                 substr
+                 for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
+                 for substr in self._split_whitespaces_or_nonwhitespaces(
+                     text[i: i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
+                 )
+             )
+             all_substrs.extend(substrs)
+
+         t: List[int] = []
+         for substr in all_substrs:
+             if allow_special_tokens:
+                 t.extend(
+                     # we should consider special tokens as common tokens
+                     self.model.encode(
+                         substr,
+                         allowed_special="all",
+                     )
+                 )
+             else:
+                 t.extend(
+                     # we should consider special tokens as common tokens
+                     self.model.encode(
+                         substr,
+                         disallowed_special=(),
+                     )
+                 )
+
+         return t
+
+     def decode(
+         self,
+         token_ids: Union[int, List[int]],
+         **kwargs
+     ) -> str:
+         """
+         Decodes a list of token IDs into a string.
+
+         Args:
+             token_ids (List[int]): The list of token IDs to be decoded.
+
+         Returns:
+             str: The decoded string.
+         """
+         # If there are other args, we should call super().decode because there is a lot of code
+         # to handle those args. super().decode ultimately calls convert_tokens_to_string and _convert_id_to_token.
+         if len(kwargs) > 0:
+             return super().decode(token_ids, **kwargs)
+
+         if type(token_ids) is int:
+             token_ids = [token_ids]
+
+         return self.model.decode(cast(List[int], token_ids))
+
+     @staticmethod
+     def _split_whitespaces_or_nonwhitespaces(
+         s: str, max_consecutive_slice_len: int
+     ) -> Iterator[str]:
+         """
+         Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+         consecutive whitespace or consecutive non-whitespace characters.
+         """
+         current_slice_len = 0
+         current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+         slice_start = 0
+
+         for i in range(len(s)):
+             is_now_space = s[i].isspace()
+
+             if current_slice_is_space ^ is_now_space:
+                 current_slice_len = 1
+                 current_slice_is_space = is_now_space
+             else:
+                 current_slice_len += 1
+                 if current_slice_len > max_consecutive_slice_len:
+                     yield s[slice_start:i]
+                     slice_start = i
+                     current_slice_len = 1
+         yield s[slice_start:]
+
+     def pre_tokenizer_process(self, text: str) -> List[str]:
+         """
+         Pre-tokenizes the input text into a list of chunks.
+         This method is used to split the input text into smaller chunks for internal processing.
+         """
+         return [text]
+
+     """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """
+
+     @property
+     def vocab_size(self) -> int:
+         return self.n_words
+
+     def get_vocab(self) -> Dict[str, int]:
+         return self.encoder
+
+     def _tokenize(self, text: str, **kwargs) -> List[str]:
+         return [
+             self.decoder[t]
+             for t in self.encode(text)
+         ]
+
+     def _convert_token_to_id(self, token: str) -> int:
+         return self.encoder.get(token, self.unk_id)
+
+     def _convert_id_to_token(self, index: int) -> str:
+         return self.decoder.get(index)
+
+     @staticmethod
+     def clean_up_tokenization(out_string: str) -> str:
+         return out_string
+
+     def convert_tokens_to_string(self, tokens: List[str]) -> str:
+         text = ''.join(tokens)
+         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', 'replace')
+         return text
+
+     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+         if not os.path.isdir(save_directory):
+             raise ValueError(f"vocabulary path ({save_directory}) should be a directory")
+         out_vocab_file = os.path.join(
+             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+         )
+
+         if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+             copyfile(self.vocab_file, out_vocab_file)
+
+         return (out_vocab_file,)
+
+     def apply_chat_template(
+         self, conversation, tools: Optional[list[dict]] = None,
+         tokenize: bool = False,
+         add_generation_prompt: bool = True,
+         **kwargs
+     ):
+         # Tool schemas are deep-sorted so the rendered prompt is deterministic.
+         tools = deep_sort_dict(tools)
+         return super().apply_chat_template(conversation,
+                                            tools=tools,
+                                            tokenize=tokenize,
+                                            add_generation_prompt=add_generation_prompt,
+                                            **kwargs)
+
+
+ def deep_sort_dict(obj: Any) -> Any:
+     if isinstance(obj, dict):
+         return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
+     if isinstance(obj, list):
+         return [deep_sort_dict(item) for item in obj]
+     return obj
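A minimal round-trip sketch for the tokenizer above; the local path is a placeholder and assumes `tiktoken.model` plus the configs in this commit are present there:

```python
# Minimal sketch; "./kimi-checkpoint" is a placeholder path.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./kimi-checkpoint", trust_remote_code=True)

ids = tokenizer.encode("Hello, Kimi!")  # custom encode(): tiktoken with specials allowed
text = tokenizer.decode(ids)            # custom decode(): raw tiktoken decode
assert text == "Hello, Kimi!"
print(tokenizer.vocab_size)             # n_vocab of the tiktoken Encoding
```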
tokenizer_config.json ADDED
@@ -0,0 +1,180 @@
+ {
+   "added_tokens_decoder": {
+     "163584": {
+       "content": "[BOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163585": {
+       "content": "[EOS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163586": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163587": {
+       "content": "<|im_user|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163588": {
+       "content": "<|im_assistant|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163590": {
+       "content": "<|start_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163591": {
+       "content": "<|end_header_id|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163593": {
+       "content": "[EOT]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163594": {
+       "content": "<|im_system|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163595": {
+       "content": "<|tool_calls_section_begin|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "163596": {
+       "content": "<|tool_calls_section_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "163597": {
+       "content": "<|tool_call_begin|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "163598": {
+       "content": "<|tool_call_argument_begin|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "163599": {
+       "content": "<|tool_call_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "163601": {
+       "content": "<|im_middle|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163606": {
+       "content": "<think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "163607": {
+       "content": "</think>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": false
+     },
+     "163838": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "163839": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_end|>",
+     "<|im_user|>",
+     "<|im_assistant|>",
+     "<|start_header_id|>",
+     "<|end_header_id|>",
+     "[EOT]",
+     "<|im_system|>",
+     "<|im_middle|>"
+   ],
+   "auto_map": {
+     "AutoTokenizer": [
+       "tokenization_kimi.TikTokenTokenizer",
+       null
+     ]
+   },
+   "bos_token": "[BOS]",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "[EOS]",
+   "extra_special_tokens": {},
+   "model_max_length": 1000000000000000019884624838656,
+   "pad_token": "[PAD]",
+   "tokenizer_class": "TikTokenTokenizer",
+   "unk_token": "[UNK]"
+ }
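As a closing consistency note, the IDs registered in `added_tokens_decoder` above line up with the token IDs in config.json; a minimal check with a placeholder path:

```python
# Minimal sketch; "./kimi-checkpoint" is a placeholder path.
from transformers import AutoConfig, AutoTokenizer

path = "./kimi-checkpoint"
config = AutoConfig.from_pretrained(path, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

assert tokenizer.convert_tokens_to_ids("[BOS]") == config.bos_token_id       # 163584
assert tokenizer.convert_tokens_to_ids("<|im_end|>") == config.eos_token_id  # 163586
assert tokenizer.convert_tokens_to_ids("[PAD]") == config.pad_token_id       # 163839
```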