{ "lastUpdated": "2025-09-05", "models": [ { "name": "llama-3-1-8b-instruct", "overall_score": 0.428, "valid_tool_schema": 96.1, "compliance": 89.4, "task_success": 90.9, "schema_understanding": 0.261, "task_completion": 0.295, "tool_usage": 0.352, "planning_effectiveness": 0.310, "task_information": 0.221, "tool_parameter": 0.141, "dependency": 0.428 }, { "name": "llama-3-2-90b-vision-instruct", "overall_score": 0.495, "valid_tool_schema": 99.6, "compliance": 85.0, "task_success": 90.9, "schema_understanding": 0.293, "task_completion": 0.444, "tool_usage": 0.515, "planning_effectiveness": 0.427, "task_information": 0.267, "tool_parameter": 0.173, "dependency": 0.495 }, { "name": "nova-micro-v1", "overall_score": 0.508, "valid_tool_schema": 96.0, "compliance": 93.1, "task_success": 87.8, "schema_understanding": 0.339, "task_completion": 0.419, "tool_usage": 0.504, "planning_effectiveness": 0.428, "task_information": 0.315, "tool_parameter": 0.212, "dependency": 0.508 }, { "name": "llama-3-1-70b-instruct", "overall_score": 0.510, "valid_tool_schema": 99.2, "compliance": 90.5, "task_success": 92.5, "schema_understanding": 0.314, "task_completion": 0.432, "tool_usage": 0.523, "planning_effectiveness": 0.451, "task_information": 0.287, "tool_parameter": 0.191, "dependency": 0.510 }, { "name": "mistral-small-2503", "overall_score": 0.530, "valid_tool_schema": 96.4, "compliance": 95.6, "task_success": 86.2, "schema_understanding": 0.373, "task_completion": 0.445, "tool_usage": 0.537, "planning_effectiveness": 0.446, "task_information": 0.349, "tool_parameter": 0.232, "dependency": 0.530 }, { "name": "gpt-4o-mini", "overall_score": 0.557, "valid_tool_schema": 97.5, "compliance": 98.1, "task_success": 93.9, "schema_understanding": 0.374, "task_completion": 0.500, "tool_usage": 0.555, "planning_effectiveness": 0.544, "task_information": 0.352, "tool_parameter": 0.201, "dependency": 0.557 }, { "name": "llama-3-3-70b-instruct", "overall_score": 0.558, "valid_tool_schema": 99.5, "compliance": 93.8, "task_success": 91.6, "schema_understanding": 0.349, "task_completion": 0.493, "tool_usage": 0.583, "planning_effectiveness": 0.525, "task_information": 0.355, "tool_parameter": 0.262, "dependency": 0.558 }, { "name": "gemma-3-27b-it", "overall_score": 0.582, "valid_tool_schema": 98.8, "compliance": 97.6, "task_success": 94.4, "schema_understanding": 0.378, "task_completion": 0.530, "tool_usage": 0.608, "planning_effectiveness": 0.572, "task_information": 0.383, "tool_parameter": 0.249, "dependency": 0.582 }, { "name": "gpt-4o", "overall_score": 0.595, "valid_tool_schema": 98.9, "compliance": 98.3, "task_success": 92.8, "schema_understanding": 0.394, "task_completion": 0.542, "tool_usage": 0.627, "planning_effectiveness": 0.587, "task_information": 0.405, "tool_parameter": 0.272, "dependency": 0.595 }, { "name": "gemini-2.5-flash-lite", "overall_score": 0.598, "valid_tool_schema": 99.4, "compliance": 97.8, "task_success": 94.3, "schema_understanding": 0.412, "task_completion": 0.577, "tool_usage": 0.627, "planning_effectiveness": 0.597, "task_information": 0.404, "tool_parameter": 0.226, "dependency": 0.598 }, { "name": "qwen3-30b-a3b-instruct-2507", "overall_score": 0.627, "valid_tool_schema": 99.0, "compliance": 98.4, "task_success": 92.3, "schema_understanding": 0.481, "task_completion": 0.530, "tool_usage": 0.658, "planning_effectiveness": 0.638, "task_information": 0.473, "tool_parameter": 0.303, "dependency": 0.627 }, { "name": "kimi-k2", "overall_score": 0.629, "valid_tool_schema": 98.8, "compliance": 98.1, "task_success": 94.5, "schema_understanding": 0.502, "task_completion": 0.577, "tool_usage": 0.631, "planning_effectiveness": 0.623, "task_information": 0.448, "tool_parameter": 0.307, "dependency": 0.629 }, { "name": "gpt-oss-20b", "overall_score": 0.654, "valid_tool_schema": 98.8, "compliance": 99.1, "task_success": 93.6, "schema_understanding": 0.547, "task_completion": 0.623, "tool_usage": 0.661, "planning_effectiveness": 0.638, "task_information": 0.509, "tool_parameter": 0.309, "dependency": 0.654 }, { "name": "glm-4.5", "overall_score": 0.668, "valid_tool_schema": 99.7, "compliance": 99.7, "task_success": 97.4, "schema_understanding": 0.525, "task_completion": 0.682, "tool_usage": 0.680, "planning_effectiveness": 0.661, "task_information": 0.523, "tool_parameter": 0.297, "dependency": 0.668 }, { "name": "qwen3-235b-a22b-2507", "overall_score": 0.678, "valid_tool_schema": 99.1, "compliance": 99.3, "task_success": 94.8, "schema_understanding": 0.549, "task_completion": 0.625, "tool_usage": 0.688, "planning_effectiveness": 0.712, "task_information": 0.542, "tool_parameter": 0.355, "dependency": 0.678 }, { "name": "claude-sonnet-4", "overall_score": 0.681, "valid_tool_schema": 100.0, "compliance": 99.8, "task_success": 98.8, "schema_understanding": 0.554, "task_completion": 0.676, "tool_usage": 0.689, "planning_effectiveness": 0.671, "task_information": 0.541, "tool_parameter": 0.328, "dependency": 0.681 }, { "name": "gemini-2.5-pro", "overall_score": 0.690, "valid_tool_schema": 99.4, "compliance": 99.6, "task_success": 96.9, "schema_understanding": 0.562, "task_completion": 0.725, "tool_usage": 0.717, "planning_effectiveness": 0.670, "task_information": 0.541, "tool_parameter": 0.329, "dependency": 0.690 }, { "name": "gpt-oss-120b", "overall_score": 0.692, "valid_tool_schema": 97.7, "compliance": 98.8, "task_success": 94.0, "schema_understanding": 0.636, "task_completion": 0.705, "tool_usage": 0.691, "planning_effectiveness": 0.661, "task_information": 0.576, "tool_parameter": 0.329, "dependency": 0.692 }, { "name": "o3", "overall_score": 0.715, "valid_tool_schema": 99.3, "compliance": 99.9, "task_success": 97.1, "schema_understanding": 0.641, "task_completion": 0.706, "tool_usage": 0.724, "planning_effectiveness": 0.726, "task_information": 0.592, "tool_parameter": 0.359, "dependency": 0.715 }, { "name": "gpt-5", "overall_score": 0.749, "valid_tool_schema": 100.0, "compliance": 99.3, "task_success": 99.1, "schema_understanding": 0.677, "task_completion": 0.828, "tool_usage": 0.767, "planning_effectiveness": 0.749, "task_information": 0.649, "tool_parameter": 0.339, "dependency": 0.749 } ] }