davidpomerenke committed on
Commit aa92add · verified · 1 Parent(s): 972026c

Upload from GitHub Actions: model name no bracket stuff

evals/backend.py CHANGED
@@ -4,7 +4,6 @@ import os
4
  import numpy as np
5
  import pandas as pd
6
  import uvicorn
7
-
8
  from countries import make_country_table
9
  from datasets_.util import load
10
  from fastapi import FastAPI, Request
@@ -12,8 +11,12 @@ from fastapi.middleware.cors import CORSMiddleware
12
  from fastapi.middleware.gzip import GZipMiddleware
13
  from fastapi.responses import JSONResponse
14
  from fastapi.staticfiles import StaticFiles
15
 
16
  scores = load("results")
 
17
  languages = load("languages")
18
  models = load("models")
19
 
@@ -31,56 +34,155 @@ task_metrics = [
31
  "mgsm_accuracy",
32
  ]
33
 
 
34
  def compute_normalized_average(df, metrics):
35
  """Compute simple average across metric columns without normalization."""
36
  return df[metrics].mean(axis=1, skipna=False)
37
 
38
 
39
- def make_model_table(scores_df, models):
40
- scores_df = scores_df.copy()
41
- # Create a combined task_metric for origin
42
- scores_df["task_metric_origin"] = (
43
- scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
44
- )
45
 
46
- # Pivot to get scores for each origin-specific metric
47
- scores_pivot = scores_df.pivot_table(
48
- index="model",
49
- columns="task_metric_origin",
50
- values="score",
51
- aggfunc="mean",
52
  )
53
 
54
- # Create the regular task_metric for the main average calculation
 
 
55
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
56
  main_pivot = scores_df.pivot_table(
57
  index="model", columns="task_metric", values="score", aggfunc="mean"
58
  )
59
-
60
- # Merge the two pivots
 
61
  df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
62
 
 
63
  for metric in task_metrics:
64
- if metric not in df.columns:
65
- df[metric] = np.nan
66
-
67
  df["average"] = compute_normalized_average(df, task_metrics)
 
68
 
69
- # Add flag if any machine-origin data was used
70
- machine_presence = scores_df[scores_df["origin"] == "machine"].groupby(["model", "task_metric"]).size()
71
  for metric in task_metrics:
72
- df[f"{metric}_contains_machine"] = df.index.map(lambda m: (m, metric) in machine_presence.index)
73
  df = df.sort_values(by="average", ascending=False).reset_index()
74
  df = pd.merge(df, models, left_on="model", right_on="id", how="left")
75
  df["rank"] = df.index + 1
76
 
77
- # Dynamically find all metric columns to include
78
- final_cols = df.columns
79
- metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
80
-
81
- df["creation_date"] = df["creation_date"].apply(lambda x: x.isoformat() if x else None)
82
 
83
- df = df[
84
  [
85
  "rank",
86
  "model",
@@ -93,42 +195,58 @@ def make_model_table(scores_df, models):
93
  "license",
94
  "cost",
95
  "average",
96
- *sorted(list(set(metric_cols))),
 
97
  ]
98
  ]
99
- return df
100
 
101
 
102
- def make_language_table(scores_df, languages):
103
  scores_df = scores_df.copy()
104
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
105
-
106
- # Pivot scores
107
  score_pivot = scores_df.pivot_table(
108
  index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
109
  )
110
-
111
- # Pivot origins (first origin since each task+lang combo has only one)
112
  origin_pivot = scores_df.pivot_table(
113
  index="bcp_47", columns="task_metric", values="origin", aggfunc="first"
114
  )
115
  origin_pivot = origin_pivot.add_suffix("_origin")
116
-
117
  df = pd.merge(score_pivot, origin_pivot, on="bcp_47", how="outer")
118
-
119
- for metric in task_metrics:
120
- if metric not in df.columns:
121
- df[metric] = np.nan
122
 
123
  df["average"] = compute_normalized_average(df, task_metrics)
124
- df = pd.merge(languages, df, on="bcp_47", how="outer")
125
- df = df.sort_values(by="speakers", ascending=False)
126
 
127
- # Dynamically find all metric columns to include
128
- final_cols = df.columns
129
- metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
130
 
131
- df = df[
132
  [
133
  "bcp_47",
134
  "language_name",
@@ -136,110 +254,97 @@ def make_language_table(scores_df, languages):
136
  "speakers",
137
  "family",
138
  "average",
 
139
  "in_benchmark",
140
- *sorted(list(set(metric_cols))),
141
  ]
142
  ]
143
- return df
144
 
145
 
146
  def make_language_tier_history(scores_df, languages, models):
147
- # Rank languages by speakers
148
- ranked_langs = languages.sort_values(by="speakers", ascending=False).reset_index(drop=True)
149
-
150
- # Define tiers
151
- tier_ranges = {
152
- "Top 1": (0, 1),
153
- "Top 2-20": (1, 20),
154
- "Top 20-200": (19, 500),
155
- }
156
-
157
  # Calculate model-language proficiency scores
158
  scores_df = scores_df.copy()
159
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
160
-
161
- # Pivot to get model-language-metric scores
162
  pivot = scores_df.pivot_table(
163
- index=["model", "bcp_47"],
164
- columns="task_metric",
165
- values="score",
166
- aggfunc="mean"
167
  )
168
-
169
- # Ensure all task_metrics columns exist
170
  for metric in task_metrics:
171
- if metric not in pivot.columns:
172
- pivot[metric] = np.nan
173
-
174
- # Calculate proficiency score for each model-language pair
175
  pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
176
  pivot = pivot.reset_index()
177
-
178
- # Create all tier-level aggregations (allowing overlapping tiers)
179
- all_tier_scores = []
180
- for tier_name, (start, end) in tier_ranges.items():
181
- tier_langs = ranked_langs.iloc[start:end]["bcp_47"].tolist()
182
- tier_data = pivot[pivot["bcp_47"].isin(tier_langs)]
183
- tier_scores = tier_data.groupby("model")["proficiency_score"].mean().reset_index()
184
- tier_scores["tier"] = tier_name
185
- all_tier_scores.append(tier_scores)
186
-
187
- tier_scores = pd.concat(all_tier_scores, ignore_index=True)
188
-
189
- # Merge with models data
190
- tier_scores = pd.merge(tier_scores, models, left_on="model", right_on="id", how="left")
191
-
192
- # Select relevant columns
193
- tier_scores = tier_scores[
194
- ["model", "name", "provider_name", "creation_date", "size", "tier", "proficiency_score"]
195
- ]
196
-
197
  tier_scores["creation_date"] = tier_scores["creation_date"].apply(
198
  lambda x: x.isoformat() if x else None
199
  )
200
-
201
- return tier_scores
202
 
203
 
204
  def make_license_history(scores_df, models):
205
  scores_df = scores_df.copy()
206
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
207
-
208
- # Pivot to get model-level scores
209
  pivot = scores_df.pivot_table(
210
- index="model",
211
- columns="task_metric",
212
- values="score",
213
- aggfunc="mean"
214
  )
215
-
216
- # Ensure all task_metrics columns exist
217
  for metric in task_metrics:
218
- if metric not in pivot.columns:
219
- pivot[metric] = np.nan
220
-
221
- # Calculate proficiency score for each model
222
  pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
223
- pivot = pivot.reset_index()
224
-
225
- # Merge with models data
226
- df = pd.merge(pivot, models, left_on="model", right_on="id", how="left")
227
-
228
- # Classify as commercial or open
229
  df["license_type"] = df["type"].apply(
230
  lambda x: "Open-source" if x == "open-source" else "Commercial"
231
  )
232
-
233
- # Select relevant columns
234
- df = df[
235
- ["model", "name", "provider_name", "creation_date", "size", "license_type", "proficiency_score"]
236
- ]
237
-
238
  df["creation_date"] = df["creation_date"].apply(
239
  lambda x: x.isoformat() if x else None
240
  )
241
-
242
- return df
243
 
244
 
245
  app = FastAPI()
@@ -257,38 +362,53 @@ async def data(request: Request):
257
  body = await request.body()
258
  data = json.loads(body)
259
  selected_languages = data.get("selectedLanguages", {})
260
-
261
  # Identify which metrics have machine translations available
262
  machine_translated_metrics = {
263
- f"{row['task']}_{row['metric']}"
264
- for _, row in scores.iterrows()
265
  if row["origin"] == "machine"
266
  }
267
 
268
  # Filter by selected languages if provided
269
- df = scores[scores["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)] if selected_languages else scores
270
-
271
  if len(df) == 0:
272
  model_table = pd.DataFrame()
273
  countries = pd.DataFrame()
274
  else:
275
- model_table = make_model_table(df, models)
276
- countries = make_country_table(make_language_table(df, languages))
277
-
278
- language_table = make_language_table(scores, languages)
279
  language_tier_history = make_language_tier_history(scores, languages, models)
280
  license_history = make_license_history(scores, models)
281
  datasets_df = pd.read_json("data/datasets.json")
282
-
283
- return JSONResponse(content={
284
- "model_table": serialize(model_table),
285
- "language_table": serialize(language_table),
286
- "dataset_table": serialize(datasets_df),
287
- "countries": serialize(countries),
288
- "machine_translated_metrics": list(machine_translated_metrics),
289
- "language_tier_history": serialize(language_tier_history),
290
- "license_history": serialize(license_history),
291
- })
 
 
292
 
293
 
294
  # Only serve static files if build directory exists
 
4
  import numpy as np
5
  import pandas as pd
6
  import uvicorn
 
7
  from countries import make_country_table
8
  from datasets_.util import load
9
  from fastapi import FastAPI, Request
 
11
  from fastapi.middleware.gzip import GZipMiddleware
12
  from fastapi.responses import JSONResponse
13
  from fastapi.staticfiles import StaticFiles
14
+ from joblib.memory import Memory
15
+
16
+ cache = Memory(location=".cache", verbose=0).cache
17
 
18
  scores = load("results")
19
+ scores_detailed = load("results-detailed")
20
  languages = load("languages")
21
  models = load("models")
22
 
 
34
  "mgsm_accuracy",
35
  ]
36
 
37
+
38
  def compute_normalized_average(df, metrics):
39
  """Compute simple average across metric columns without normalization."""
40
  return df[metrics].mean(axis=1, skipna=False)
41
 
42
 
43
+ @cache
44
+ def compute_bootstrap_ci(
45
+ data_hash, group_cols_tuple, n_bootstrap=1000, ci_level=0.95, seed=42
46
+ ):
47
+ """Compute bootstrap CIs for grouped data. Cached based on data hash."""
48
+ # This function is called with the actual data passed separately via _ci_cache
49
+ df, group_cols = _ci_cache[data_hash]
50
+ np.random.seed(seed)
51
+ percentiles = [(1 - ci_level) / 2 * 100, (1 + ci_level) / 2 * 100]
52
+
53
+ def bootstrap_group(group):
54
+ scores = group["score"].values
55
+ if len(scores) == 0:
56
+ return pd.Series({"ci_lower": None, "ci_upper": None})
57
+ bootstrap_means = [
58
+ np.random.choice(scores, len(scores), replace=True).mean()
59
+ for _ in range(n_bootstrap)
60
+ ]
61
+ ci_lower, ci_upper = np.percentile(bootstrap_means, percentiles)
62
+ return pd.Series({"ci_lower": ci_lower, "ci_upper": ci_upper})
63
 
64
+ result = df.groupby(group_cols, as_index=False).apply(
65
+ bootstrap_group, include_groups=False
66
  )
67
+ result.columns = group_cols + ["ci_lower", "ci_upper"]
68
+ return result
69
+
70
+
71
+ # Thread-safe cache for passing DataFrames to cached function
72
+ _ci_cache = {}
73
+
74
+
75
+ def add_confidence_intervals(df, scores_df_detailed, group_col, metrics):
76
+ """DRY helper to add CI columns for metrics and average to a dataframe."""
77
+ if scores_df_detailed is None or scores_df_detailed.empty:
78
+ return df
79
+
80
+ detailed = scores_df_detailed.copy()
81
+ detailed["task_metric"] = detailed["task"] + "_" + detailed["metric"]
82
+
83
+ # Add CI for each metric
84
+ for metric in metrics:
85
+ metric_data = detailed[detailed["task_metric"] == metric]
86
+ if not metric_data.empty:
87
+ # Create hash based on data shape, groups, and statistics
88
+ group_stats = (
89
+ metric_data.groupby(group_col)["score"]
90
+ .agg(["count", "mean", "std"])
91
+ .round(6)
92
+ )
93
+ data_hash = hash(
94
+ (
95
+ metric,
96
+ group_col,
97
+ len(metric_data),
98
+ tuple(group_stats.index),
99
+ tuple(map(tuple, group_stats.values)),
100
+ )
101
+ )
102
+ _ci_cache[data_hash] = (metric_data, [group_col])
103
+ ci_df = compute_bootstrap_ci(data_hash, (group_col,))
104
+ ci_df = ci_df.rename(
105
+ columns={
106
+ "ci_lower": f"{metric}_ci_lower",
107
+ "ci_upper": f"{metric}_ci_upper",
108
+ }
109
+ )
110
+ df = pd.merge(df, ci_df, on=group_col, how="left")
111
+
112
+ # Add CI for average
113
+ avg_data = detailed[detailed["task_metric"].isin(metrics)]
114
+ if not avg_data.empty:
115
+ # Create hash based on data shape, groups, and statistics
116
+ group_stats = (
117
+ avg_data.groupby(group_col)["score"].agg(["count", "mean", "std"]).round(6)
118
+ )
119
+ data_hash = hash(
120
+ (
121
+ "average",
122
+ group_col,
123
+ len(avg_data),
124
+ tuple(group_stats.index),
125
+ tuple(map(tuple, group_stats.values)),
126
+ )
127
+ )
128
+ _ci_cache[data_hash] = (avg_data, [group_col])
129
+ avg_ci_df = compute_bootstrap_ci(data_hash, (group_col,))
130
+ avg_ci_df = avg_ci_df.rename(
131
+ columns={"ci_lower": "average_ci_lower", "ci_upper": "average_ci_upper"}
132
+ )
133
+ df = pd.merge(df, avg_ci_df, on=group_col, how="left")
134
+
135
+ return df
136
 
137
+
138
+ def make_model_table(scores_df, models, scores_df_detailed=None):
139
+ scores_df = scores_df.copy()
140
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
141
+ scores_df["task_metric_origin"] = (
142
+ scores_df["task_metric"] + "_" + scores_df["origin"]
143
+ )
144
+
145
+ # Pivot scores
146
  main_pivot = scores_df.pivot_table(
147
  index="model", columns="task_metric", values="score", aggfunc="mean"
148
  )
149
+ scores_pivot = scores_df.pivot_table(
150
+ index="model", columns="task_metric_origin", values="score", aggfunc="mean"
151
+ )
152
  df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
153
 
154
+ # Fill missing metrics and compute average
155
  for metric in task_metrics:
156
+ df[metric] = df.get(metric, np.nan)
 
 
157
  df["average"] = compute_normalized_average(df, task_metrics)
158
+ df = add_confidence_intervals(df, scores_df_detailed, "model", task_metrics)
159
 
160
+ # Add machine-origin flags
161
+ machine_presence = (
162
+ scores_df[scores_df["origin"] == "machine"]
163
+ .groupby(["model", "task_metric"])
164
+ .size()
165
+ )
166
  for metric in task_metrics:
167
+ df[f"{metric}_contains_machine"] = df.index.map(
168
+ lambda m: (m, metric) in machine_presence.index
169
+ )
170
+
171
+ # Sort and add metadata
172
  df = df.sort_values(by="average", ascending=False).reset_index()
173
  df = pd.merge(df, models, left_on="model", right_on="id", how="left")
174
  df["rank"] = df.index + 1
175
+ df["creation_date"] = df["creation_date"].apply(
176
+ lambda x: x.isoformat() if x else None
177
+ )
178
 
179
+ # Select columns dynamically
180
+ metric_cols = [m for m in df.columns if any(tm in m for tm in task_metrics)]
181
+ avg_ci_cols = [
182
+ c for c in df.columns if c in ["average_ci_lower", "average_ci_upper"]
183
+ ]
184
 
185
+ return df[
186
  [
187
  "rank",
188
  "model",
 
195
  "license",
196
  "cost",
197
  "average",
198
+ *avg_ci_cols,
199
+ *sorted(set(metric_cols)),
200
  ]
201
  ]
 
202
 
203
 
204
+ def make_language_table(scores_df, languages, scores_df_detailed=None):
205
  scores_df = scores_df.copy()
206
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
207
+
208
+ # Pivot scores and origins
209
  score_pivot = scores_df.pivot_table(
210
  index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
211
  )
 
 
212
  origin_pivot = scores_df.pivot_table(
213
  index="bcp_47", columns="task_metric", values="origin", aggfunc="first"
214
  )
215
  origin_pivot = origin_pivot.add_suffix("_origin")
 
216
  df = pd.merge(score_pivot, origin_pivot, on="bcp_47", how="outer")
217
 
218
+ # Fill missing metrics and compute average
219
+ for metric in task_metrics:
220
+ df[metric] = df.get(metric, np.nan)
221
  df["average"] = compute_normalized_average(df, task_metrics)
 
 
222
 
223
+ # For language table, we need to compute scores from detailed data to match CI calculation
224
+ # (CI is computed from all samples, so score should be too)
225
+ if scores_df_detailed is not None and not scores_df_detailed.empty:
226
+ detailed = scores_df_detailed.copy()
227
+ detailed["task_metric"] = detailed["task"] + "_" + detailed["metric"]
228
+ detailed_pivot = detailed.pivot_table(
229
+ index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
230
+ )
231
+ for metric in task_metrics:
232
+ if metric in detailed_pivot.columns:
233
+ df[metric] = detailed_pivot[metric]
234
+ df["average"] = compute_normalized_average(df, task_metrics)
235
+
236
+ df = add_confidence_intervals(df, scores_df_detailed, "bcp_47", task_metrics)
237
+
238
+ # Merge with language metadata and sort
239
+ df = pd.merge(languages, df, on="bcp_47", how="outer").sort_values(
240
+ by="speakers", ascending=False
241
+ )
242
+
243
+ # Select columns dynamically
244
+ metric_cols = [m for m in df.columns if any(tm in m for tm in task_metrics)]
245
+ avg_ci_cols = [
246
+ c for c in df.columns if c in ["average_ci_lower", "average_ci_upper"]
247
+ ]
248
 
249
+ return df[
250
  [
251
  "bcp_47",
252
  "language_name",
 
254
  "speakers",
255
  "family",
256
  "average",
257
+ *avg_ci_cols,
258
  "in_benchmark",
259
+ *sorted(set(metric_cols)),
260
  ]
261
  ]
 
262
 
263
 
264
  def make_language_tier_history(scores_df, languages, models):
265
+ ranked_langs = languages.sort_values(by="speakers", ascending=False).reset_index(
266
+ drop=True
267
+ )
268
+ tier_ranges = {"Top 1": (0, 1), "Top 2-20": (1, 20), "Top 20-200": (19, 500)}
269
+
270
  # Calculate model-language proficiency scores
271
  scores_df = scores_df.copy()
272
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
 
 
273
  pivot = scores_df.pivot_table(
274
+ index=["model", "bcp_47"], columns="task_metric", values="score", aggfunc="mean"
275
  )
 
 
276
  for metric in task_metrics:
277
+ pivot[metric] = pivot.get(metric, np.nan)
278
  pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
279
  pivot = pivot.reset_index()
280
+
281
+ # Aggregate by tier
282
+ tier_scores = pd.concat(
283
+ [
284
+ pivot[pivot["bcp_47"].isin(ranked_langs.iloc[start:end]["bcp_47"])]
285
+ .groupby("model")["proficiency_score"]
286
+ .mean()
287
+ .reset_index()
288
+ .assign(tier=tier_name)
289
+ for tier_name, (start, end) in tier_ranges.items()
290
+ ],
291
+ ignore_index=True,
292
+ )
293
+
294
+ tier_scores = pd.merge(
295
+ tier_scores, models, left_on="model", right_on="id", how="left"
296
+ )
297
  tier_scores["creation_date"] = tier_scores["creation_date"].apply(
298
  lambda x: x.isoformat() if x else None
299
  )
300
+
301
+ return tier_scores[
302
+ [
303
+ "model",
304
+ "name",
305
+ "provider_name",
306
+ "creation_date",
307
+ "size",
308
+ "tier",
309
+ "proficiency_score",
310
+ ]
311
+ ]
312
 
313
 
314
  def make_license_history(scores_df, models):
315
  scores_df = scores_df.copy()
316
  scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
317
+
318
+ # Pivot and compute proficiency
319
  pivot = scores_df.pivot_table(
320
+ index="model", columns="task_metric", values="score", aggfunc="mean"
321
  )
 
 
322
  for metric in task_metrics:
323
+ pivot[metric] = pivot.get(metric, np.nan)
324
  pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
325
+
326
+ # Merge and classify
327
+ df = pd.merge(
328
+ pivot.reset_index(), models, left_on="model", right_on="id", how="left"
329
+ )
 
330
  df["license_type"] = df["type"].apply(
331
  lambda x: "Open-source" if x == "open-source" else "Commercial"
332
  )
333
  df["creation_date"] = df["creation_date"].apply(
334
  lambda x: x.isoformat() if x else None
335
  )
336
+
337
+ return df[
338
+ [
339
+ "model",
340
+ "name",
341
+ "provider_name",
342
+ "creation_date",
343
+ "size",
344
+ "license_type",
345
+ "proficiency_score",
346
+ ]
347
+ ]
348
 
349
 
350
  app = FastAPI()
 
362
  body = await request.body()
363
  data = json.loads(body)
364
  selected_languages = data.get("selectedLanguages", {})
365
+
366
  # Identify which metrics have machine translations available
367
  machine_translated_metrics = {
368
+ f"{row['task']}_{row['metric']}"
369
+ for _, row in scores.iterrows()
370
  if row["origin"] == "machine"
371
  }
372
 
373
  # Filter by selected languages if provided
374
+ df = (
375
+ scores[scores["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
376
+ if selected_languages
377
+ else scores
378
+ )
379
+ df_detailed = (
380
+ scores_detailed[
381
+ scores_detailed["bcp_47"].isin(
382
+ lang["bcp_47"] for lang in selected_languages
383
+ )
384
+ ]
385
+ if selected_languages
386
+ else scores_detailed
387
+ )
388
+
389
  if len(df) == 0:
390
  model_table = pd.DataFrame()
391
  countries = pd.DataFrame()
392
  else:
393
+ model_table = make_model_table(df, models, df_detailed)
394
+ countries = make_country_table(make_language_table(df, languages, df_detailed))
395
+
396
+ language_table = make_language_table(scores, languages, scores_detailed)
397
  language_tier_history = make_language_tier_history(scores, languages, models)
398
  license_history = make_license_history(scores, models)
399
  datasets_df = pd.read_json("data/datasets.json")
400
+
401
+ return JSONResponse(
402
+ content={
403
+ "model_table": serialize(model_table),
404
+ "language_table": serialize(language_table),
405
+ "dataset_table": serialize(datasets_df),
406
+ "countries": serialize(countries),
407
+ "machine_translated_metrics": list(machine_translated_metrics),
408
+ "language_tier_history": serialize(language_tier_history),
409
+ "license_history": serialize(license_history),
410
+ }
411
+ )
412
 
413
 
414
  # Only serve static files if build directory exists
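
For context on the confidence-interval columns added in evals/backend.py above: the new helpers compute percentile-bootstrap CIs per group from the detailed per-sample results and memoise them via joblib.Memory, keyed by a hash of the group statistics. Below is a minimal, self-contained sketch of the bootstrap step only; the model names and scores are made up, and it uses numpy.random.default_rng rather than the global seeding in the actual code.

```python
import numpy as np
import pandas as pd


def bootstrap_ci(scores, n_bootstrap=1000, ci_level=0.95, seed=42):
    """Percentile-bootstrap confidence interval for the mean of a score array."""
    if len(scores) == 0:
        return None, None
    rng = np.random.default_rng(seed)
    means = [
        rng.choice(scores, size=len(scores), replace=True).mean()
        for _ in range(n_bootstrap)
    ]
    lower, upper = np.percentile(
        means, [(1 - ci_level) / 2 * 100, (1 + ci_level) / 2 * 100]
    )
    return lower, upper


# Hypothetical per-sample scores for two models (not real benchmark data)
detailed = pd.DataFrame(
    {
        "model": ["model-a"] * 5 + ["model-b"] * 5,
        "score": [0.60, 0.70, 0.65, 0.72, 0.68, 0.40, 0.45, 0.50, 0.42, 0.48],
    }
)

# One (ci_lower, ci_upper) pair per model, analogous to the columns merged
# into the tables as "<metric>_ci_lower" / "<metric>_ci_upper"
ci = (
    detailed.groupby("model")["score"]
    .apply(lambda s: pd.Series(bootstrap_ci(s.to_numpy()), index=["ci_lower", "ci_upper"]))
    .unstack()
)
print(ci)
```

Merging the resulting ci_lower / ci_upper columns onto the pivoted score tables is what produces the `<metric>_ci_lower` / `<metric>_ci_upper` and `average_ci_lower` / `average_ci_upper` fields that the frontend error bars consume.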
evals/models.py CHANGED
@@ -364,7 +364,8 @@ def load_models(date: date) -> pd.DataFrame:
364
  models = models.assign(
365
  name=or_metadata.str["short_name"]
366
  .str.replace(" (free)", "")
367
- .str.replace(" (self-moderated)", ""),
 
368
  provider_name=or_metadata.str["name"].str.split(": ").str[0],
369
  # openrouter_metadata=or_metadata.astype(str),
370
  cost=or_metadata.apply(get_cost),
 
364
  models = models.assign(
365
  name=or_metadata.str["short_name"]
366
  .str.replace(" (free)", "")
367
+ .str.replace(" (self-moderated)", "")
368
+ .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True),
369
  provider_name=or_metadata.str["name"].str.split(": ").str[0],
370
  # openrouter_metadata=or_metadata.astype(str),
371
  cost=or_metadata.apply(get_cost),
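
A quick illustration of the evals/models.py change (the "model name no bracket stuff" of the commit title): after the two literal replacements, the added catch-all regex strips any remaining trailing parenthesized qualifier from the OpenRouter short names. This is only a sketch; the names below are hypothetical, not taken from the repo.

```python
import pandas as pd

names = pd.Series(
    ["Llama 3.1 8B (free)", "Claude 3.5 Sonnet (self-moderated)", "GPT-4o (beta)"]
)

cleaned = (
    names.str.replace(" (free)", "", regex=False)
    .str.replace(" (self-moderated)", "", regex=False)
    .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)  # new: drop any trailing "(...)" suffix
)

print(cleaned.tolist())  # ['Llama 3.1 8B', 'Claude 3.5 Sonnet', 'GPT-4o']
```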
frontend/src/App.js CHANGED
@@ -20,6 +20,7 @@ function App () {
20
  const [data, setData] = useState(null)
21
  const [baseData, setBaseData] = useState(null)
22
  const [loading, setLoading] = useState(true)
 
23
  const [error, setError] = useState(null)
24
  const [selectedLanguages, setSelectedLanguages] = useState([])
25
  const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
@@ -32,6 +33,13 @@ function App () {
32
  const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
33
 
34
  useEffect(() => {
35
  fetch('/api/data', {
36
  method: 'POST',
37
  body: JSON.stringify({ selectedLanguages })
@@ -47,10 +55,12 @@ function App () {
47
  setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
48
  if (!baseData) setBaseData(jsonData)
49
  setLoading(false)
 
50
  })
51
  .catch(err => {
52
  setError(err.message)
53
  setLoading(false)
 
54
  })
55
  }, [selectedLanguages])
56
 
@@ -146,7 +156,7 @@ function App () {
146
  width: '100vw'
147
  }}
148
  >
149
- <div
150
  style={{
151
  backgroundColor: '#fff3cd',
152
  color: '#856404',
@@ -161,7 +171,7 @@ function App () {
161
  >
162
  <strong>Work in Progress:</strong> This dashboard is currently under
163
  active development. Evaluation results are not yet final. More extensive evaluation runs will be released later this year.
164
- </div>
165
  <div
166
  style={{
167
  display: 'flex',
@@ -361,12 +371,30 @@ function App () {
361
  )}
362
  {data && (
363
  <>
364
- <ModelTable
365
- data={data.model_table}
366
- selectedLanguages={selectedLanguages}
367
- allLanguages={data.language_table || []}
368
- machineTranslatedMetrics={machineTranslatedMetrics}
369
- />
370
  <LanguageTable
371
  data={data.language_table}
372
  selectedLanguages={selectedLanguages}
 
20
  const [data, setData] = useState(null)
21
  const [baseData, setBaseData] = useState(null)
22
  const [loading, setLoading] = useState(true)
23
+ const [modelTableLoading, setModelTableLoading] = useState(false)
24
  const [error, setError] = useState(null)
25
  const [selectedLanguages, setSelectedLanguages] = useState([])
26
  const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
 
33
  const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
34
 
35
  useEffect(() => {
36
+ // For initial load, use main loading state; for language changes, use model table loading
37
+ if (!data) {
38
+ setLoading(true)
39
+ } else {
40
+ setModelTableLoading(true)
41
+ }
42
+
43
  fetch('/api/data', {
44
  method: 'POST',
45
  body: JSON.stringify({ selectedLanguages })
 
55
  setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
56
  if (!baseData) setBaseData(jsonData)
57
  setLoading(false)
58
+ setModelTableLoading(false)
59
  })
60
  .catch(err => {
61
  setError(err.message)
62
  setLoading(false)
63
+ setModelTableLoading(false)
64
  })
65
  }, [selectedLanguages])
66
 
 
156
  width: '100vw'
157
  }}
158
  >
159
+ {/* <div
160
  style={{
161
  backgroundColor: '#fff3cd',
162
  color: '#856404',
 
171
  >
172
  <strong>Work in Progress:</strong> This dashboard is currently under
173
  active development. Evaluation results are not yet final. More extensive evaluation runs will be released later this year.
174
+ </div> */}
175
  <div
176
  style={{
177
  display: 'flex',
 
371
  )}
372
  {data && (
373
  <>
374
+ <div style={{ position: 'relative' }}>
375
+ {modelTableLoading && (
376
+ <div style={{
377
+ position: 'absolute',
378
+ top: 0,
379
+ left: 0,
380
+ right: 0,
381
+ bottom: 0,
382
+ backgroundColor: 'rgba(255, 255, 255, 0.8)',
383
+ display: 'flex',
384
+ alignItems: 'center',
385
+ justifyContent: 'center',
386
+ zIndex: 1000
387
+ }}>
388
+ <i className='pi pi-spinner pi-spin' style={{ fontSize: '3rem' }} />
389
+ </div>
390
+ )}
391
+ <ModelTable
392
+ data={data.model_table}
393
+ selectedLanguages={selectedLanguages}
394
+ allLanguages={data.language_table || []}
395
+ machineTranslatedMetrics={machineTranslatedMetrics}
396
+ />
397
+ </div>
398
  <LanguageTable
399
  data={data.language_table}
400
  selectedLanguages={selectedLanguages}
frontend/src/components/DatasetTable.js CHANGED
@@ -98,6 +98,7 @@ const DatasetTable = ({ data }) => {
98
  return (
99
  <DataTable
100
  value={table}
 
101
  rowGroupMode='subheader'
102
  rowGroupHeaderTemplate={rowData => {
103
  return <div style={{ fontWeight: 'bold' }}>{rowData.group}</div>
 
98
  return (
99
  <DataTable
100
  value={table}
101
+ dataKey='name'
102
  rowGroupMode='subheader'
103
  rowGroupHeaderTemplate={rowData => {
104
  return <div style={{ fontWeight: 'bold' }}>{rowData.group}</div>
frontend/src/components/LanguageTable.js CHANGED
@@ -122,6 +122,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
122
  value={data.filter(
123
  item => !selectedLanguages.some(l => l.bcp_47 === item.bcp_47)
124
  )}
 
125
  header={
126
  <span>
127
  <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>Languages</span>
 
122
  value={data.filter(
123
  item => !selectedLanguages.some(l => l.bcp_47 === item.bcp_47)
124
  )}
125
+ dataKey='bcp_47'
126
  header={
127
  <span>
128
  <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>Languages</span>
frontend/src/components/ModelTable.js CHANGED
@@ -225,6 +225,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
225
  return (
226
  <DataTable
227
  value={data}
 
228
  header={<>{getHeaderText()}</>}
229
  sortField='average'
230
  removableSort
 
225
  return (
226
  <DataTable
227
  value={data}
228
+ dataKey='name'
229
  header={<>{getHeaderText()}</>}
230
  sortField='average'
231
  removableSort
frontend/src/components/ScoreColumns.js CHANGED
@@ -2,112 +2,117 @@ import { Column } from 'primereact/column'
2
  import ScoreField from './ScoreField'
3
 
4
  const scoreBodyTemplate = (field, options = {}) => {
5
- const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options
6
 
7
  return rowData => {
8
  const score = rowData[field]
9
- // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
10
- // otherwise fall back to global list
11
  const rowFlagKey = `${field}_is_machine`
12
  const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
13
  const isMachineTranslated = hasRowFlag
14
  ? !!rowData[rowFlagKey]
15
  : machineTranslatedMetrics.includes(field)
16
- return ScoreField(score, minScore, maxScore, isMachineTranslated)
17
  }
18
  }
19
 
20
- const ScoreColumns = (machineTranslatedMetrics = []) => [
21
- <Column
22
- field='average'
23
- header='Proficiency'
24
- headerTooltip='Language Proficiency Score (average of the scores for each task)'
25
- sortable
26
- body={scoreBodyTemplate('average', { minScore: 0.3, maxScore: 0.7, machineTranslatedMetrics })}
27
- style={{ minWidth: '5rem', maxWidth: '10rem' }}
28
- />,
29
- <Column
30
- field='translation_from_bleu'
31
- header='Translation (from)'
32
- headerTooltip='Translation performance from a language to all other languages (spBLEU score on a sample of the FLORES+ benchmark)'
33
- sortable
34
- body={scoreBodyTemplate('translation_from_bleu', {
35
- minScore: 0,
36
- maxScore: 0.4,
37
- machineTranslatedMetrics
38
- })}
39
- style={{ minWidth: '5rem', maxWidth: '10rem' }}
40
- />,
41
- <Column
42
- field='translation_to_bleu'
43
- header='Translation (to)'
44
- headerTooltip='Translation performance from all other languages to a language (spBLEU score on a sample of the FLORES+ benchmark)'
45
- sortable
46
- body={scoreBodyTemplate('translation_to_bleu', {
47
- minScore: 0,
48
- maxScore: 0.4,
49
- machineTranslatedMetrics
50
- })}
51
- style={{ minWidth: '5rem', maxWidth: '10rem' }}
52
- />,
53
- <Column
54
- field='classification_accuracy'
55
- header='Classification'
56
- headerTooltip='Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)'
57
- sortable
58
- body={scoreBodyTemplate('classification_accuracy', {
59
- minScore: 0.4,
60
- maxScore: 1,
61
- machineTranslatedMetrics
62
- })}
63
- style={{ minWidth: '5rem', maxWidth: '10rem' }}
64
- />,
65
- // <Column
66
- // field='language_modeling_chrf'
67
- // header='Language Modeling'
68
- // sortable
69
- // body={scoreBodyTemplate('language_modeling_chrf', {
70
- // minScore: 0.8,
71
- // maxScore: 1
72
- // })}
73
- // style={{ minWidth: '5rem', maxWidth: '10rem' }}
74
- // />,
75
- <Column
76
- field='mmlu_accuracy'
77
- header='Q&A'
78
- headerTooltip='Question Answering performance (accuracy on a sample of multilingual versions of the MMLU benchmark)'
79
- sortable
80
- body={scoreBodyTemplate('mmlu_accuracy', {
81
- minScore: 0,
82
- maxScore: 1,
83
- machineTranslatedMetrics
84
- })}
85
- style={{ minWidth: '5rem', maxWidth: '10rem' }}
86
- />,
87
  <Column
88
- field='arc_accuracy'
89
- header='Advanced Q&A'
90
- headerTooltip='Advanced Question Answering performance (accuracy on a sample of multilingual versions of the ARC-Easy benchmark)'
91
  sortable
92
- body={scoreBodyTemplate('arc_accuracy', {
93
- minScore: 0,
94
- maxScore: 1,
95
- machineTranslatedMetrics
 
 
96
  })}
97
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
98
- />,
99
- <Column
100
- field='mgsm_accuracy'
101
- header='Math'
102
- headerTooltip='Math Problem Solving performance (accuracy on a sample of multilingual versions of the GSM8K benchmark)'
103
- sortable
104
- body={scoreBodyTemplate('mgsm_accuracy', {
105
- minScore: 0,
106
- maxScore: 1,
107
- machineTranslatedMetrics
108
- })}
109
- style={{ minWidth: '5rem', maxWidth: '10rem' }}
110
- />,
111
  ]
112
 
113
  export default ScoreColumns
 
2
  import ScoreField from './ScoreField'
3
 
4
  const scoreBodyTemplate = (field, options = {}) => {
5
+ const {
6
+ minScore = 0,
7
+ maxScore = 1,
8
+ machineTranslatedMetrics = [],
9
+ ciLowerField = null,
10
+ ciUpperField = null
11
+ } = options
12
 
13
  return rowData => {
14
  const score = rowData[field]
 
 
15
  const rowFlagKey = `${field}_is_machine`
16
  const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
17
  const isMachineTranslated = hasRowFlag
18
  ? !!rowData[rowFlagKey]
19
  : machineTranslatedMetrics.includes(field)
20
+ const ciLower = ciLowerField ? rowData[ciLowerField] : null
21
+ const ciUpper = ciUpperField ? rowData[ciUpperField] : null
22
+ return (
23
+ <ScoreField
24
+ score={score}
25
+ minScore={minScore}
26
+ maxScore={maxScore}
27
+ isMachineTranslated={isMachineTranslated}
28
+ ciLower={ciLower}
29
+ ciUpper={ciUpper}
30
+ />
31
+ )
32
  }
33
  }
34
 
35
+ const createScoreColumn = (
36
+ field,
37
+ header,
38
+ tooltip,
39
+ minScore,
40
+ maxScore,
41
+ machineTranslatedMetrics
42
+ ) => (
43
  <Column
44
+ field={field}
45
+ header={header}
46
+ headerTooltip={tooltip}
47
  sortable
48
+ body={scoreBodyTemplate(field, {
49
+ minScore,
50
+ maxScore,
51
+ machineTranslatedMetrics,
52
+ ciLowerField: `${field}_ci_lower`,
53
+ ciUpperField: `${field}_ci_upper`
54
  })}
55
  style={{ minWidth: '5rem', maxWidth: '10rem' }}
56
+ />
57
+ )
58
+
59
+ const ScoreColumns = (machineTranslatedMetrics = []) => [
60
+ createScoreColumn(
61
+ 'average',
62
+ 'Proficiency',
63
+ 'Language Proficiency Score (average of the scores for each task)',
64
+ 0,
65
+ 1,
66
+ machineTranslatedMetrics
67
+ ),
68
+ createScoreColumn(
69
+ 'translation_from_bleu',
70
+ 'Translation (from)',
71
+ 'Translation performance from a language to all other languages (spBLEU score on a sample of the FLORES+ benchmark)',
72
+ 0,
73
+ 1,
74
+ machineTranslatedMetrics
75
+ ),
76
+ createScoreColumn(
77
+ 'translation_to_bleu',
78
+ 'Translation (to)',
79
+ 'Translation performance from all other languages to a language (spBLEU score on a sample of the FLORES+ benchmark)',
80
+ 0,
81
+ 1,
82
+ machineTranslatedMetrics
83
+ ),
84
+ createScoreColumn(
85
+ 'classification_accuracy',
86
+ 'Classification',
87
+ 'Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)',
88
+ 0,
89
+ 1,
90
+ machineTranslatedMetrics
91
+ ),
92
+ createScoreColumn(
93
+ 'mmlu_accuracy',
94
+ 'Q&A',
95
+ 'Question Answering performance (accuracy on a sample of multilingual versions of the MMLU benchmark)',
96
+ 0,
97
+ 1,
98
+ machineTranslatedMetrics
99
+ ),
100
+ createScoreColumn(
101
+ 'arc_accuracy',
102
+ 'Advanced Q&A',
103
+ 'Advanced Question Answering performance (accuracy on a sample of multilingual versions of the ARC-Easy benchmark)',
104
+ 0,
105
+ 1,
106
+ machineTranslatedMetrics
107
+ ),
108
+ createScoreColumn(
109
+ 'mgsm_accuracy',
110
+ 'Math',
111
+ 'Math Problem Solving performance (accuracy on a sample of multilingual versions of the GSM8K benchmark)',
112
+ 0,
113
+ 1,
114
+ machineTranslatedMetrics
115
+ )
116
  ]
117
 
118
  export default ScoreColumns
frontend/src/components/ScoreField.js CHANGED
@@ -1,24 +1,34 @@
1
- const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
2
  let percentage = 100
3
  let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
4
  if (score !== null) {
5
- // Calculate percentage based on the provided min and max scores
6
- // This normalizes the score to a 0-100 range for visualization
7
- const normalizedScore = Math.min(Math.max(score, minScore), maxScore)
8
- percentage =
9
- ((normalizedScore - minScore) / (maxScore - minScore)) * 100
10
 
11
- // Continuous color gradient from red to green based on score
12
- // For a smooth transition, calculate the RGB values directly
13
 
14
- // Red component decreases as score increases
15
- const red = Math.round(255 * (1 - percentage / 100))
16
- // Green component increases as score increases
17
- const green = Math.round(255 * (percentage / 100))
18
- // Use a low opacity for subtlety (0.1-0.2 range)
19
- const opacity = 0.1 + (percentage / 100) * 0.1
20
 
21
- barColor = `rgba(${red}, ${green}, 0, ${opacity.toFixed(2)})`
22
  }
23
 
24
  return (
@@ -39,14 +49,56 @@ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
39
  width: `${percentage}%`,
40
  backgroundColor: barColor,
41
  zIndex: 0,
42
- transition: 'width 0.3s, background-color 0.3s'
43
  }}
44
  />
45
 
46
  <span
47
  style={{
48
  position: 'relative',
49
- zIndex: 1
50
  }}
51
  >
52
  {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
 
1
+ const ScoreField = ({ score, minScore, maxScore, isMachineTranslated = false, ciLower = null, ciUpper = null }) => {
2
  let percentage = 100
3
  let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
4
+ let ciLowerPercentage = null
5
+ let ciUpperPercentage = null
6
+
7
  if (score !== null) {
8
+ // Calculate percentage based on the provided min and max scores
9
+ // This normalizes the score to a 0-100 range for visualization
10
+ const normalizedScore = Math.min(Math.max(score, minScore), maxScore)
11
+ percentage = ((normalizedScore - minScore) / (maxScore - minScore)) * 100
 
12
 
13
+ // Continuous color gradient from red to green based on score
14
+ // For a smooth transition, calculate the RGB values directly
15
 
16
+ // Red component decreases as score increases
17
+ const red = Math.round(255 * (1 - percentage / 100))
18
+ // Green component increases as score increases
19
+ const green = Math.round(255 * (percentage / 100))
20
+ // Use a low opacity for subtlety (0.1-0.2 range)
21
+ const opacity = 0.1 + (percentage / 100) * 0.1
22
 
23
+ barColor = `rgba(${red}, ${green}, 0, ${opacity.toFixed(2)})`
24
+
25
+ // Calculate CI percentages if available
26
+ if (ciLower !== null && ciUpper !== null) {
27
+ const normalizedCiLower = Math.min(Math.max(ciLower, minScore), maxScore)
28
+ const normalizedCiUpper = Math.min(Math.max(ciUpper, minScore), maxScore)
29
+ ciLowerPercentage = ((normalizedCiLower - minScore) / (maxScore - minScore)) * 100
30
+ ciUpperPercentage = ((normalizedCiUpper - minScore) / (maxScore - minScore)) * 100
31
+ }
32
  }
33
 
34
  return (
 
49
  width: `${percentage}%`,
50
  backgroundColor: barColor,
51
  zIndex: 0,
52
+ // transition: 'width 0.3s, background-color 0.3s'
53
  }}
54
  />
55
+
56
+ {/* Confidence interval error bar */}
57
+ {ciLowerPercentage !== null && ciUpperPercentage !== null && (
58
+ <div
59
+ style={{
60
+ position: 'absolute',
61
+ top: '50%',
62
+ left: `${ciLowerPercentage}%`,
63
+ width: `${ciUpperPercentage - ciLowerPercentage}%`,
64
+ height: '2px',
65
+ backgroundColor: 'rgba(0, 0, 0, 0.3)',
66
+ zIndex: 1,
67
+ transform: 'translateY(-50%)',
68
+ // transition: 'left 0.3s, width 0.3s'
69
+ }}
70
+ >
71
+ {/* Left cap */}
72
+ <div
73
+ style={{
74
+ position: 'absolute',
75
+ left: 0,
76
+ top: '50%',
77
+ width: '1px',
78
+ height: '8px',
79
+ backgroundColor: 'rgba(0, 0, 0, 0.3)',
80
+ transform: 'translate(-50%, -50%)'
81
+ }}
82
+ />
83
+ {/* Right cap */}
84
+ <div
85
+ style={{
86
+ position: 'absolute',
87
+ right: 0,
88
+ top: '50%',
89
+ width: '1px',
90
+ height: '8px',
91
+ backgroundColor: 'rgba(0, 0, 0, 0.3)',
92
+ transform: 'translate(50%, -50%)'
93
+ }}
94
+ />
95
+ </div>
96
+ )}
97
 
98
  <span
99
  style={{
100
  position: 'relative',
101
+ zIndex: 2
102
  }}
103
  >
104
  {score !== null ? (score * 100).toFixed(1)+"%" : '–'}