Spaces: Sleeping

Alan Liu committed · Commit c93009d · 1 Parent(s): ed50ee5

add client throughput

Browse files:
- app.py (+9 -6)
- calc_util.py (+13 -1)
app.py CHANGED

```diff
@@ -138,8 +138,9 @@ with col3: # Prefilling
     prefilling_operation_count = prefilling_operation(model_config, inference_config)
     prefilling_activation_memory_count = prefilling_activation_memory(model_config, inference_config)
     inference_info['inference_prefilling_time'] = prefilling_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
-    inference_info['inference_prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize']/inference_info['inference_prefilling_time']
     inference_info['prefilling_memory_latency'] = prefilling_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
+    calc_prefilling_throughput(model_config, inference_config, inference_info)
+
     cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * inference_config['input_seq_length']))

     operation_items = {key: "{:,}".format(int(prefilling_operation_count[key])) for key in prefilling_operation_count if key not in subtotal_operations}
@@ -162,9 +163,9 @@ with col3: # Prefilling

     header5("Summary: Prefilling")
     st.markdown(create_table(df_subtotal_operation_count))
-    st.write(f"Prefillng throughput (tokens/s): {inference_info['inference_prefilling_throughput']:.2f}")
     st.write(f"FLOPS latency: {inference_info['inference_prefilling_time']}")
     st.write(f"Memory latency: {inference_info['prefilling_memory_latency']}")
+    st.write(f"Prefillng throughput (tokens/s): {inference_info['prefilling_throughput']:.2f} ({inference_info['prefilling_bound_type']}-bound)")

     if inference_config['KV_cache']:
         st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
@@ -175,9 +176,9 @@ with col4: # Generation
     generation_operation_count = generation_operation(model_config, inference_config)
     generation_activation_memory_count = generation_activation_memory(model_config, inference_config)
     inference_info['inference_generation_time'] = generation_operation_count['total'] / (gpu_config['TFLOP']*1024**4)
-    inference_info['inference_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize']/inference_info['inference_generation_time']
-    inference_info['inference_client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / (inference_info['inference_prefilling_time'] + inference_info['inference_generation_time'])
     inference_info['generation_memory_latency'] = generation_activation_memory_count['total'] / (gpu_config['memory_bandwidth']*1024**3)
+    calc_generation_throughput(model_config, inference_config, inference_info)
+
     cached_parameter_count['kv_cache'] = 2 * (inference_config['batchsize'] * (model_config['hidden_size'] * model_config['num_hidden_layers'] * (inference_config['input_seq_length']+inference_config['output_seq_length'])))

     operation_items = {key: "{:,}".format(int(generation_operation_count[key])) for key in generation_operation_count if key not in subtotal_operations}
@@ -199,10 +200,12 @@ with col4: # Generation

     header5("Summary: Generation")
     st.markdown(create_table(df_subtotal_operation_count))
-    st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
-    st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
+    #st.write(f"Generation-only throughput (tokens/s): {inference_info['inference_generation_throughput']:.2f}")
+    #st.write(f"(Client) Generation throughput (tokens/s): {inference_info['inference_client_generation_throughput']:.2f}")
     st.write(f"FLOPS latency: {inference_info['inference_generation_time']}")
     st.write(f"Memory latency: {inference_info['generation_memory_latency']}")
+    st.write(f"Generation-only throughput (tokens/s): {inference_info['generation_throughput']:.2f} ({inference_info['generation_bound_type']}-bound)")
+    st.write(f"(Client) Generation throughput (tokens/s): {inference_info['client_generation_throughput']:.2f}")

     if inference_config['KV_cache']:
         st.write(f"kv cache (Byte): {cached_parameter_count['kv_cache']:,}")
```
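The relocated summary lines now report each phase's throughput together with whether it is compute- or memory-bound. A minimal sketch of how the new helpers are exercised, using hypothetical toy latencies in place of the values app.py derives from gpu_config (model_config is passed but unused by these helpers):

```python
# Toy latencies only: illustrative stand-ins for the values app.py
# computes from the operation counts and gpu_config.
from calc_util import calc_prefilling_throughput, calc_generation_throughput

model_config = {}  # not read by the throughput helpers themselves
inference_config = {'batchsize': 8, 'input_seq_length': 512, 'output_seq_length': 128}
inference_info = {
    'inference_prefilling_time': 0.20,   # prefill FLOPs latency (s)
    'prefilling_memory_latency': 0.05,   # prefill memory latency (s)
    'inference_generation_time': 0.10,   # generation FLOPs latency (s)
    'generation_memory_latency': 0.40,   # generation memory latency (s)
}

calc_prefilling_throughput(model_config, inference_config, inference_info)
calc_generation_throughput(model_config, inference_config, inference_info)

print(inference_info['prefilling_throughput'])         # 512*8 / 0.20 = 20480.0
print(inference_info['prefilling_bound_type'])         # 'arithmetic' (0.20 > 0.05)
print(inference_info['generation_bound_type'])         # 'memory' (0.10 < 0.40)
print(inference_info['client_generation_throughput'])  # 128*8 / (0.20+0.40) ≈ 1706.67
```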
calc_util.py CHANGED

```diff
@@ -296,4 +296,16 @@ def generation_activation_memory(model_config, inference_config):
         activation_memory['mlp'] + activation_memory['layernorm']
     )

-    return activation_memory
+    return activation_memory
+
+
+def calc_prefilling_throughput(model_config, inference_config, inference_info):
+    inference_info['prefilling_throughput'] = inference_config['input_seq_length']*inference_config['batchsize'] / max([inference_info['inference_prefilling_time'], inference_info['prefilling_memory_latency']])
+    inference_info['prefilling_bound_type'] = "memory" if inference_info['inference_prefilling_time'] < inference_info['prefilling_memory_latency'] else "arithmetic"
+
+def calc_generation_throughput(model_config, inference_config, inference_info):
+    inference_info['generation_throughput'] = inference_config['input_seq_length']*inference_config['batchsize'] / max([inference_info['inference_generation_time'], inference_info['generation_memory_latency']])
+    inference_info['generation_bound_type'] = "memory" if inference_info['inference_generation_time'] < inference_info['generation_memory_latency'] else "arithmetic"
+
+    total_time = max([inference_info['inference_prefilling_time'], inference_info['prefilling_memory_latency']]) + max([inference_info['inference_generation_time'], inference_info['generation_memory_latency']])
+    inference_info['client_generation_throughput'] = inference_config['output_seq_length']*inference_config['batchsize'] / total_time
```