Upload from GitHub Actions: model name no bracket stuff
- evals/backend.py +257 -137
- evals/models.py +2 -1
- frontend/src/App.js +36 -8
- frontend/src/components/DatasetTable.js +1 -0
- frontend/src/components/LanguageTable.js +1 -0
- frontend/src/components/ModelTable.js +1 -0
- frontend/src/components/ScoreColumns.js +96 -91
- frontend/src/components/ScoreField.js +69 -17
evals/backend.py
CHANGED
@@ -4,7 +4,6 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
-
 from countries import make_country_table
 from datasets_.util import load
 from fastapi import FastAPI, Request
@@ -12,8 +11,12 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
+from joblib.memory import Memory
+
+cache = Memory(location=".cache", verbose=0).cache
 
 scores = load("results")
+scores_detailed = load("results-detailed")
 languages = load("languages")
 models = load("models")
 
@@ -31,56 +34,155 @@ task_metrics = [
     "mgsm_accuracy",
 ]
 
+
 def compute_normalized_average(df, metrics):
     """Compute simple average across metric columns without normalization."""
     return df[metrics].mean(axis=1, skipna=False)
 
 
+@cache
+def compute_bootstrap_ci(
+    data_hash, group_cols_tuple, n_bootstrap=1000, ci_level=0.95, seed=42
+):
+    """Compute bootstrap CIs for grouped data. Cached based on data hash."""
+    # This function is called with the actual data passed separately via _ci_cache
+    df, group_cols = _ci_cache[data_hash]
+    np.random.seed(seed)
+    percentiles = [(1 - ci_level) / 2 * 100, (1 + ci_level) / 2 * 100]
+
+    def bootstrap_group(group):
+        scores = group["score"].values
+        if len(scores) == 0:
+            return pd.Series({"ci_lower": None, "ci_upper": None})
+        bootstrap_means = [
+            np.random.choice(scores, len(scores), replace=True).mean()
+            for _ in range(n_bootstrap)
+        ]
+        ci_lower, ci_upper = np.percentile(bootstrap_means, percentiles)
+        return pd.Series({"ci_lower": ci_lower, "ci_upper": ci_upper})
+
+    result = df.groupby(group_cols, as_index=False).apply(
+        bootstrap_group, include_groups=False
+    )
+    result.columns = group_cols + ["ci_lower", "ci_upper"]
+    return result
+
+
+# Thread-safe cache for passing DataFrames to cached function
+_ci_cache = {}
+
+
+def add_confidence_intervals(df, scores_df_detailed, group_col, metrics):
+    """DRY helper to add CI columns for metrics and average to a dataframe."""
+    if scores_df_detailed is None or scores_df_detailed.empty:
+        return df
+
+    detailed = scores_df_detailed.copy()
+    detailed["task_metric"] = detailed["task"] + "_" + detailed["metric"]
+
+    # Add CI for each metric
+    for metric in metrics:
+        metric_data = detailed[detailed["task_metric"] == metric]
+        if not metric_data.empty:
+            # Create hash based on data shape, groups, and statistics
+            group_stats = (
+                metric_data.groupby(group_col)["score"]
+                .agg(["count", "mean", "std"])
+                .round(6)
+            )
+            data_hash = hash(
+                (
+                    metric,
+                    group_col,
+                    len(metric_data),
+                    tuple(group_stats.index),
+                    tuple(map(tuple, group_stats.values)),
+                )
+            )
+            _ci_cache[data_hash] = (metric_data, [group_col])
+            ci_df = compute_bootstrap_ci(data_hash, (group_col,))
+            ci_df = ci_df.rename(
+                columns={
+                    "ci_lower": f"{metric}_ci_lower",
+                    "ci_upper": f"{metric}_ci_upper",
+                }
+            )
+            df = pd.merge(df, ci_df, on=group_col, how="left")
+
+    # Add CI for average
+    avg_data = detailed[detailed["task_metric"].isin(metrics)]
+    if not avg_data.empty:
+        # Create hash based on data shape, groups, and statistics
+        group_stats = (
+            avg_data.groupby(group_col)["score"].agg(["count", "mean", "std"]).round(6)
+        )
+        data_hash = hash(
+            (
+                "average",
+                group_col,
+                len(avg_data),
+                tuple(group_stats.index),
+                tuple(map(tuple, group_stats.values)),
+            )
+        )
+        _ci_cache[data_hash] = (avg_data, [group_col])
+        avg_ci_df = compute_bootstrap_ci(data_hash, (group_col,))
+        avg_ci_df = avg_ci_df.rename(
+            columns={"ci_lower": "average_ci_lower", "ci_upper": "average_ci_upper"}
+        )
+        df = pd.merge(df, avg_ci_df, on=group_col, how="left")
+
+    return df
+
+
-def make_model_table(scores_df, models):
+def make_model_table(scores_df, models, scores_df_detailed=None):
+    scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    scores_df["task_metric_origin"] = (
+        scores_df["task_metric"] + "_" + scores_df["origin"]
+    )
+
+    # Pivot scores
     main_pivot = scores_df.pivot_table(
         index="model", columns="task_metric", values="score", aggfunc="mean"
     )
+    scores_pivot = scores_df.pivot_table(
+        index="model", columns="task_metric_origin", values="score", aggfunc="mean"
+    )
     df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
 
+    # Fill missing metrics and compute average
     for metric in task_metrics:
-        df[metric] = np.nan
+        df[metric] = df.get(metric, np.nan)
     df["average"] = compute_normalized_average(df, task_metrics)
+    df = add_confidence_intervals(df, scores_df_detailed, "model", task_metrics)
 
+    # Add machine-origin flags
+    machine_presence = (
+        scores_df[scores_df["origin"] == "machine"]
+        .groupby(["model", "task_metric"])
+        .size()
+    )
     for metric in task_metrics:
+        df[f"{metric}_contains_machine"] = df.index.map(
+            lambda m: (m, metric) in machine_presence.index
+        )
+
+    # Sort and add metadata
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
+    df["creation_date"] = df["creation_date"].apply(
+        lambda x: x.isoformat() if x else None
+    )
 
+    # Select columns dynamically
+    metric_cols = [m for m in df.columns if any(tm in m for tm in task_metrics)]
+    avg_ci_cols = [
+        c for c in df.columns if c in ["average_ci_lower", "average_ci_upper"]
+    ]
 
+    return df[
         [
             "rank",
             "model",
@@ -93,42 +195,58 @@ def make_model_table(scores_df, models):
             "license",
             "cost",
             "average",
+            *avg_ci_cols,
+            *sorted(set(metric_cols)),
         ]
     ]
-    return df
 
 
-def make_language_table(scores_df, languages):
+def make_language_table(scores_df, languages, scores_df_detailed=None):
     scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-
-    # Pivot scores
+
+    # Pivot scores and origins
     score_pivot = scores_df.pivot_table(
         index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
     )
-
-    # Pivot origins (first origin since each task+lang combo has only one)
     origin_pivot = scores_df.pivot_table(
         index="bcp_47", columns="task_metric", values="origin", aggfunc="first"
    )
     origin_pivot = origin_pivot.add_suffix("_origin")
     df = pd.merge(score_pivot, origin_pivot, on="bcp_47", how="outer")
 
+    # Fill missing metrics and compute average
     for metric in task_metrics:
-        if metric not in df.columns:
-            df[metric] = np.nan
+        df[metric] = df.get(metric, np.nan)
     df["average"] = compute_normalized_average(df, task_metrics)
-    df = pd.merge(languages, df, on="bcp_47", how="outer")
-    df = df.sort_values(by="speakers", ascending=False)
 
+    # For language table, we need to compute scores from detailed data to match CI calculation
+    # (CI is computed from all samples, so score should be too)
+    if scores_df_detailed is not None and not scores_df_detailed.empty:
+        detailed = scores_df_detailed.copy()
+        detailed["task_metric"] = detailed["task"] + "_" + detailed["metric"]
+        detailed_pivot = detailed.pivot_table(
+            index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+        )
+        for metric in task_metrics:
+            if metric in detailed_pivot.columns:
+                df[metric] = detailed_pivot[metric]
+        df["average"] = compute_normalized_average(df, task_metrics)
+
+    df = add_confidence_intervals(df, scores_df_detailed, "bcp_47", task_metrics)
+
+    # Merge with language metadata and sort
+    df = pd.merge(languages, df, on="bcp_47", how="outer").sort_values(
+        by="speakers", ascending=False
+    )
+
+    # Select columns dynamically
+    metric_cols = [m for m in df.columns if any(tm in m for tm in task_metrics)]
+    avg_ci_cols = [
+        c for c in df.columns if c in ["average_ci_lower", "average_ci_upper"]
+    ]
 
+    return df[
         [
             "bcp_47",
             "language_name",
@@ -136,110 +254,97 @@ def make_language_table(scores_df, languages):
             "speakers",
             "family",
             "average",
+            *avg_ci_cols,
             "in_benchmark",
+            *sorted(set(metric_cols)),
         ]
     ]
-    return df
 
 
 def make_language_tier_history(scores_df, languages, models):
+    ranked_langs = languages.sort_values(by="speakers", ascending=False).reset_index(
+        drop=True
+    )
+    tier_ranges = {"Top 1": (0, 1), "Top 2-20": (1, 20), "Top 20-200": (19, 500)}
+
     # Calculate model-language proficiency scores
     scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-
-    # Pivot to get model-language-metric scores
     pivot = scores_df.pivot_table(
-        index=["model", "bcp_47"],
-        columns="task_metric",
-        values="score",
-        aggfunc="mean"
+        index=["model", "bcp_47"], columns="task_metric", values="score", aggfunc="mean"
     )
-
-    # Ensure all task_metrics columns exist
     for metric in task_metrics:
-        pivot[metric] = np.nan
+        pivot[metric] = pivot.get(metric, np.nan)
-
-    # Calculate proficiency score for each model-language pair
     pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
     pivot = pivot.reset_index()
+
+    # Aggregate by tier
+    tier_scores = pd.concat(
+        [
+            pivot[pivot["bcp_47"].isin(ranked_langs.iloc[start:end]["bcp_47"])]
+            .groupby("model")["proficiency_score"]
+            .mean()
+            .reset_index()
+            .assign(tier=tier_name)
+            for tier_name, (start, end) in tier_ranges.items()
+        ],
+        ignore_index=True,
+    )
+
+    tier_scores = pd.merge(
+        tier_scores, models, left_on="model", right_on="id", how="left"
+    )
     tier_scores["creation_date"] = tier_scores["creation_date"].apply(
         lambda x: x.isoformat() if x else None
     )
-    return tier_scores
+
+    return tier_scores[
+        [
+            "model",
+            "name",
+            "provider_name",
+            "creation_date",
+            "size",
+            "tier",
+            "proficiency_score",
+        ]
+    ]
 
 
 def make_license_history(scores_df, models):
     scores_df = scores_df.copy()
     scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
-
-    # Pivot
+
+    # Pivot and compute proficiency
     pivot = scores_df.pivot_table(
-        index="model",
-        columns="task_metric",
-        values="score",
-        aggfunc="mean"
+        index="model", columns="task_metric", values="score", aggfunc="mean"
     )
-
-    # Ensure all task_metrics columns exist
     for metric in task_metrics:
-        pivot[metric] = np.nan
+        pivot[metric] = pivot.get(metric, np.nan)
-
-    # Calculate proficiency score for each model
     pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
+
+    # Merge and classify
+    df = pd.merge(
+        pivot.reset_index(), models, left_on="model", right_on="id", how="left"
+    )
-    # Classify as commercial or open
     df["license_type"] = df["type"].apply(
         lambda x: "Open-source" if x == "open-source" else "Commercial"
     )
-
-    # Select relevant columns
-    df = df[
-        ["model", "name", "provider_name", "creation_date", "size", "license_type", "proficiency_score"]
-    ]
     df["creation_date"] = df["creation_date"].apply(
         lambda x: x.isoformat() if x else None
     )
-
-    return df
+
+    return df[
+        [
+            "model",
+            "name",
+            "provider_name",
+            "creation_date",
+            "size",
+            "license_type",
+            "proficiency_score",
+        ]
+    ]
 
 
 app = FastAPI()
@@ -257,38 +362,53 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
+
     # Identify which metrics have machine translations available
     machine_translated_metrics = {
         f"{row['task']}_{row['metric']}"
         for _, row in scores.iterrows()
         if row["origin"] == "machine"
     }
 
     # Filter by selected languages if provided
+    df = (
+        scores[scores["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
+        if selected_languages
+        else scores
+    )
+    df_detailed = (
+        scores_detailed[
+            scores_detailed["bcp_47"].isin(
+                lang["bcp_47"] for lang in selected_languages
+            )
+        ]
+        if selected_languages
+        else scores_detailed
+    )
+
     if len(df) == 0:
         model_table = pd.DataFrame()
         countries = pd.DataFrame()
     else:
-        model_table = make_model_table(df, models)
-        countries = make_country_table(make_language_table(df, languages))
+        model_table = make_model_table(df, models, df_detailed)
+        countries = make_country_table(make_language_table(df, languages, df_detailed))
+
-    language_table = make_language_table(scores, languages)
+    language_table = make_language_table(scores, languages, scores_detailed)
     language_tier_history = make_language_tier_history(scores, languages, models)
     license_history = make_license_history(scores, models)
     datasets_df = pd.read_json("data/datasets.json")
+
+    return JSONResponse(
+        content={
+            "model_table": serialize(model_table),
+            "language_table": serialize(language_table),
+            "dataset_table": serialize(datasets_df),
+            "countries": serialize(countries),
+            "machine_translated_metrics": list(machine_translated_metrics),
+            "language_tier_history": serialize(language_tier_history),
+            "license_history": serialize(license_history),
+        }
+    )
 
 
 # Only serve static files if build directory exists
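For orientation, the confidence intervals added in this file come from a plain bootstrap: each group's per-sample scores are resampled with replacement, the mean of each resample is recorded, and the interval bounds are read off the percentiles of those means. A minimal standalone sketch of that idea (illustrative only, not the Space's exact code; the sample scores below are made up):

import numpy as np

def bootstrap_ci(scores, n_bootstrap=1000, ci_level=0.95, seed=42):
    # Resample with replacement and collect the mean of each resample
    rng = np.random.default_rng(seed)
    means = [
        rng.choice(scores, size=len(scores), replace=True).mean()
        for _ in range(n_bootstrap)
    ]
    # The CI bounds are the outer percentiles of the resampled means
    lower, upper = np.percentile(
        means, [(1 - ci_level) / 2 * 100, (1 + ci_level) / 2 * 100]
    )
    return lower, upper

# Hypothetical per-sample scores for one model on one task
print(bootstrap_ci(np.array([0.62, 0.70, 0.55, 0.81, 0.66])))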
evals/models.py
CHANGED
@@ -364,7 +364,8 @@ def load_models(date: date) -> pd.DataFrame:
     models = models.assign(
         name=or_metadata.str["short_name"]
         .str.replace(" (free)", "")
-        .str.replace(" (self-moderated)", ""),
+        .str.replace(" (self-moderated)", "")
+        .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True),
         provider_name=or_metadata.str["name"].str.split(": ").str[0],
         # openrouter_metadata=or_metadata.astype(str),
         cost=or_metadata.apply(get_cost),
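The added regex replace strips any trailing parenthesized qualifier from the short name, which is what the commit title ("model name no bracket stuff") refers to. A small illustration of the same chain on made-up model names (the names are hypothetical, the pandas calls match the diff):

import pandas as pd

names = pd.Series(["Gemma 3 27B (free)", "GPT-4o (extended)", "Llama 3.1 70B"])
cleaned = (
    names.str.replace(" (free)", "")
    .str.replace(" (self-moderated)", "")
    # Generic fallback: drop any remaining " (...)" suffix at the end of the name
    .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True)
)
print(cleaned.tolist())  # ['Gemma 3 27B', 'GPT-4o', 'Llama 3.1 70B']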
frontend/src/App.js
CHANGED
@@ -20,6 +20,7 @@ function App () {
   const [data, setData] = useState(null)
   const [baseData, setBaseData] = useState(null)
   const [loading, setLoading] = useState(true)
+  const [modelTableLoading, setModelTableLoading] = useState(false)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
   const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
@@ -32,6 +33,13 @@ function App () {
   const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])
 
   useEffect(() => {
+    // For initial load, use main loading state; for language changes, use model table loading
+    if (!data) {
+      setLoading(true)
+    } else {
+      setModelTableLoading(true)
+    }
+
     fetch('/api/data', {
       method: 'POST',
       body: JSON.stringify({ selectedLanguages })
@@ -47,10 +55,12 @@
         setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
         if (!baseData) setBaseData(jsonData)
         setLoading(false)
+        setModelTableLoading(false)
       })
       .catch(err => {
         setError(err.message)
         setLoading(false)
+        setModelTableLoading(false)
       })
   }, [selectedLanguages])
 
@@ -146,7 +156,7 @@ function App () {
         width: '100vw'
       }}
     >
-      <div
+      {/* <div
         style={{
           backgroundColor: '#fff3cd',
           color: '#856404',
@@ -161,7 +171,7 @@ function App () {
       >
         <strong>Work in Progress:</strong> This dashboard is currently under
        active development. Evaluation results are not yet final. More extensive evaluation runs will be released later this year.
-      </div>
+      </div> */}
       <div
         style={{
           display: 'flex',
@@ -361,12 +371,30 @@ function App () {
       )}
       {data && (
         <>
+          <div style={{ position: 'relative' }}>
+            {modelTableLoading && (
+              <div style={{
+                position: 'absolute',
+                top: 0,
+                left: 0,
+                right: 0,
+                bottom: 0,
+                backgroundColor: 'rgba(255, 255, 255, 0.8)',
+                display: 'flex',
+                alignItems: 'center',
+                justifyContent: 'center',
+                zIndex: 1000
+              }}>
+                <i className='pi pi-spinner pi-spin' style={{ fontSize: '3rem' }} />
+              </div>
+            )}
+            <ModelTable
+              data={data.model_table}
+              selectedLanguages={selectedLanguages}
+              allLanguages={data.language_table || []}
+              machineTranslatedMetrics={machineTranslatedMetrics}
+            />
+          </div>
           <LanguageTable
             data={data.language_table}
             selectedLanguages={selectedLanguages}
frontend/src/components/DatasetTable.js
CHANGED
@@ -98,6 +98,7 @@ const DatasetTable = ({ data }) => {
   return (
     <DataTable
       value={table}
+      dataKey='name'
       rowGroupMode='subheader'
       rowGroupHeaderTemplate={rowData => {
         return <div style={{ fontWeight: 'bold' }}>{rowData.group}</div>
frontend/src/components/LanguageTable.js
CHANGED
@@ -122,6 +122,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod
       value={data.filter(
         item => !selectedLanguages.some(l => l.bcp_47 === item.bcp_47)
       )}
+      dataKey='bcp_47'
       header={
         <span>
           <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>Languages</span>
frontend/src/components/ModelTable.js
CHANGED
@@ -225,6 +225,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTr
   return (
     <DataTable
       value={data}
+      dataKey='name'
       header={<>{getHeaderText()}</>}
       sortField='average'
       removableSort
frontend/src/components/ScoreColumns.js
CHANGED
@@ -2,112 +2,117 @@ import { Column } from 'primereact/column'
 import ScoreField from './ScoreField'
 
 const scoreBodyTemplate = (field, options = {}) => {
-  const {
+  const {
+    minScore = 0,
+    maxScore = 1,
+    machineTranslatedMetrics = [],
+    ciLowerField = null,
+    ciUpperField = null
+  } = options
 
   return rowData => {
     const score = rowData[field]
-    // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
-    // otherwise fall back to global list
     const rowFlagKey = `${field}_is_machine`
     const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
     const isMachineTranslated = hasRowFlag
       ? !!rowData[rowFlagKey]
       : machineTranslatedMetrics.includes(field)
+    const ciLower = ciLowerField ? rowData[ciLowerField] : null
+    const ciUpper = ciUpperField ? rowData[ciUpperField] : null
+    return (
+      <ScoreField
+        score={score}
+        minScore={minScore}
+        maxScore={maxScore}
+        isMachineTranslated={isMachineTranslated}
+        ciLower={ciLower}
+        ciUpper={ciUpper}
+      />
+    )
   }
 }
 
-  <Column
-    field='translation_from_bleu'
-    header='Translation (from)'
-    headerTooltip='Translation performance from a language to all other languages (spBLEU score on a sample of the FLORES+ benchmark)'
-    sortable
-    body={scoreBodyTemplate('translation_from_bleu', {
-      minScore: 0,
-      maxScore: 0.4,
-      machineTranslatedMetrics
-    })}
-    style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  />,
-  <Column
-    field='translation_to_bleu'
-    header='Translation (to)'
-    headerTooltip='Translation performance from all other languages to a language (spBLEU score on a sample of the FLORES+ benchmark)'
-    sortable
-    body={scoreBodyTemplate('translation_to_bleu', {
-      minScore: 0,
-      maxScore: 0.4,
-      machineTranslatedMetrics
-    })}
-    style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  />,
-  <Column
-    field='classification_accuracy'
-    header='Classification'
-    headerTooltip='Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)'
-    sortable
-    body={scoreBodyTemplate('classification_accuracy', {
-      minScore: 0.4,
-      maxScore: 1,
-      machineTranslatedMetrics
-    })}
-    style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  />,
-  // <Column
-  //   field='language_modeling_chrf'
-  //   header='Language Modeling'
-  //   sortable
-  //   body={scoreBodyTemplate('language_modeling_chrf', {
-  //     minScore: 0.8,
-  //     maxScore: 1
-  //   })}
-  //   style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  // />,
-  <Column
-    field='mmlu_accuracy'
-    header='Q&A'
-    headerTooltip='Question Answering performance (accuracy on a sample of multilingual versions of the MMLU benchmark)'
-    sortable
-    body={scoreBodyTemplate('mmlu_accuracy', {
-      minScore: 0,
-      maxScore: 1,
-      machineTranslatedMetrics
-    })}
-    style={{ minWidth: '5rem', maxWidth: '10rem' }}
-  />,
+const createScoreColumn = (
+  field,
+  header,
+  tooltip,
+  minScore,
+  maxScore,
+  machineTranslatedMetrics
+) => (
   <Column
+    field={field}
+    header={header}
+    headerTooltip={tooltip}
    sortable
+    body={scoreBodyTemplate(field, {
+      minScore,
+      maxScore,
+      machineTranslatedMetrics,
+      ciLowerField: `${field}_ci_lower`,
+      ciUpperField: `${field}_ci_upper`
    })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
+  />
+)
+
+const ScoreColumns = (machineTranslatedMetrics = []) => [
+  createScoreColumn(
+    'average',
+    'Proficiency',
+    'Language Proficiency Score (average of the scores for each task)',
+    0,
+    1,
+    machineTranslatedMetrics
+  ),
+  createScoreColumn(
+    'translation_from_bleu',
+    'Translation (from)',
+    'Translation performance from a language to all other languages (spBLEU score on a sample of the FLORES+ benchmark)',
+    0,
+    1,
+    machineTranslatedMetrics
+  ),
+  createScoreColumn(
+    'translation_to_bleu',
+    'Translation (to)',
+    'Translation performance from all other languages to a language (spBLEU score on a sample of the FLORES+ benchmark)',
+    0,
+    1,
+    machineTranslatedMetrics
+  ),
+  createScoreColumn(
+    'classification_accuracy',
+    'Classification',
+    'Classification performance (accuracy on a sample of the SIB-200 / FLORES+ classification benchmark)',
+    0,
+    1,
+    machineTranslatedMetrics
+  ),
+  createScoreColumn(
+    'mmlu_accuracy',
+    'Q&A',
+    'Question Answering performance (accuracy on a sample of multilingual versions of the MMLU benchmark)',
+    0,
+    1,
+    machineTranslatedMetrics
+  ),
+  createScoreColumn(
+    'arc_accuracy',
+    'Advanced Q&A',
+    'Advanced Question Answering performance (accuracy on a sample of multilingual versions of the ARC-Easy benchmark)',
+    0,
+    1,
+    machineTranslatedMetrics
+  ),
+  createScoreColumn(
+    'mgsm_accuracy',
+    'Math',
+    'Math Problem Solving performance (accuracy on a sample of multilingual versions of the GSM8K benchmark)',
+    0,
+    1,
+    machineTranslatedMetrics
+  )
 ]
 
 export default ScoreColumns
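A note on the wiring: the backend's add_confidence_intervals emits one pair of CI columns per metric, and createScoreColumn above looks them up purely by naming convention, so an error bar only renders when both bounds exist for a field. A tiny sketch of the names involved (assumed to match the diff above):

task_metrics = ["mmlu_accuracy", "mgsm_accuracy"]
# Columns produced on the backend per metric ...
ci_columns = [f"{m}_{bound}" for m in task_metrics for bound in ("ci_lower", "ci_upper")]
print(ci_columns)
# ['mmlu_accuracy_ci_lower', 'mmlu_accuracy_ci_upper',
#  'mgsm_accuracy_ci_lower', 'mgsm_accuracy_ci_upper']
# ... and createScoreColumn() reads exactly `${field}_ci_lower` / `${field}_ci_upper`.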
frontend/src/components/ScoreField.js
CHANGED
@@ -1,24 +1,34 @@
-const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
+const ScoreField = ({ score, minScore, maxScore, isMachineTranslated = false, ciLower = null, ciUpper = null }) => {
   let percentage = 100
   let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
+  let ciLowerPercentage = null
+  let ciUpperPercentage = null
+
   if (score !== null) {
+    // Calculate percentage based on the provided min and max scores
+    // This normalizes the score to a 0-100 range for visualization
+    const normalizedScore = Math.min(Math.max(score, minScore), maxScore)
+    percentage = ((normalizedScore - minScore) / (maxScore - minScore)) * 100
 
+    // Continuous color gradient from red to green based on score
+    // For a smooth transition, calculate the RGB values directly
 
+    // Red component decreases as score increases
+    const red = Math.round(255 * (1 - percentage / 100))
+    // Green component increases as score increases
+    const green = Math.round(255 * (percentage / 100))
+    // Use a low opacity for subtlety (0.1-0.2 range)
+    const opacity = 0.1 + (percentage / 100) * 0.1
 
+    barColor = `rgba(${red}, ${green}, 0, ${opacity.toFixed(2)})`
+
+    // Calculate CI percentages if available
+    if (ciLower !== null && ciUpper !== null) {
+      const normalizedCiLower = Math.min(Math.max(ciLower, minScore), maxScore)
+      const normalizedCiUpper = Math.min(Math.max(ciUpper, minScore), maxScore)
+      ciLowerPercentage = ((normalizedCiLower - minScore) / (maxScore - minScore)) * 100
+      ciUpperPercentage = ((normalizedCiUpper - minScore) / (maxScore - minScore)) * 100
+    }
   }
 
   return (
@@ -39,14 +49,56 @@ const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
          width: `${percentage}%`,
          backgroundColor: barColor,
          zIndex: 0,
-          transition: 'width 0.3s, background-color 0.3s'
+          // transition: 'width 0.3s, background-color 0.3s'
        }}
      />
+
+      {/* Confidence interval error bar */}
+      {ciLowerPercentage !== null && ciUpperPercentage !== null && (
+        <div
+          style={{
+            position: 'absolute',
+            top: '50%',
+            left: `${ciLowerPercentage}%`,
+            width: `${ciUpperPercentage - ciLowerPercentage}%`,
+            height: '2px',
+            backgroundColor: 'rgba(0, 0, 0, 0.3)',
+            zIndex: 1,
+            transform: 'translateY(-50%)',
+            // transition: 'left 0.3s, width 0.3s'
+          }}
+        >
+          {/* Left cap */}
+          <div
+            style={{
+              position: 'absolute',
+              left: 0,
+              top: '50%',
+              width: '1px',
+              height: '8px',
+              backgroundColor: 'rgba(0, 0, 0, 0.3)',
+              transform: 'translate(-50%, -50%)'
+            }}
+          />
+          {/* Right cap */}
+          <div
+            style={{
+              position: 'absolute',
+              right: 0,
+              top: '50%',
+              width: '1px',
+              height: '8px',
+              backgroundColor: 'rgba(0, 0, 0, 0.3)',
+              transform: 'translate(50%, -50%)'
+            }}
+          />
+        </div>
+      )}
 
      <span
        style={{
          position: 'relative',
+          zIndex: 2
        }}
      >
        {score !== null ? (score * 100).toFixed(1)+"%" : '–'}