Nora Petrova
Add project to new space
20e666e
raw
history blame
26.4 kB
// lib/utils.js
/**
 * Constants
 *
 * Fixed chart color per model (hex). Unknown models fall back to "#999999"
 * at each lookup site. Frozen so the shared palette cannot be mutated at
 * runtime by consumers.
 */
const MODEL_COLORS = Object.freeze({
  "gpt-4o": "#0072B2", // Strong blue
  "claude-3.7-sonnet": "#D55E00", // Vermillion/orange-red
  "deepseek-r1": "#F0E442", // Yellow
  o1: "#CC79A7", // Pink
  "gemini-2.0-flash-001": "#009E73", // Bluish green
  "llama-3.1-405b-instruct": "#56B4E9", // Light blue
});
// --- Helper Functions ---
/**
 * Converts camelCase to Title Case.
 * Fix: trim the spaced string BEFORE capitalizing. Previously, PascalCase
 * input (e.g. "FooBar") produced " Foo Bar" with a leading space, because
 * charAt(0) picked up the space inserted before the first capital.
 * @param {string} str Input string.
 * @returns {string} Title Case string (falsy input is returned unchanged).
 */
export const camelToTitle = (str) => {
  if (!str) return str;
  // Insert a space before each capital, then normalize the edges.
  const spaced = str.replace(/([A-Z])/g, " $1").trim();
  return spaced.charAt(0).toUpperCase() + spaced.slice(1);
};
/**
 * Formats a snake_case or kebab-case key as Title Case for display.
 * Needed for display consistency when keys are snake_case.
 * @param {*} key Raw key, e.g. "context_memory" or "context-memory".
 * @returns {string} Display string, e.g. "Context Memory"; "N/A" for
 *   missing or non-string input.
 */
export const formatDisplayKey = (key) => {
  if (typeof key !== "string" || !key) return "N/A";
  if (key === "N/A") return "N/A";
  // Replace both separator styles in one pass, then Title Case each word.
  const spaced = key.replace(/[_-]/g, " ").trim();
  return spaced.replace(/\b\w/g, (ch) => ch.toUpperCase());
};
/**
 * Helper to get Significance indicator style and tooltip.
 * @param {boolean|null|undefined} isSignificant true/false, or null/undefined
 *   when significance could not be determined.
 * @param {number} pValue p-value; non-numeric values render as "N/A".
 * @param {number} [alpha=0.05] Significance threshold shown in the tooltip.
 * @returns {{symbol: string, className: string, tooltip: string}}
 */
export function getSignificanceIndicator(isSignificant, pValue, alpha = 0.05) {
  const pValueFormatted =
    typeof pValue === "number" && !isNaN(pValue) ? pValue.toFixed(3) : "N/A";
  if (isSignificant === true) {
    return {
      symbol: "✓",
      className: "text-green-600",
      tooltip: `Statistically Significant (p=${pValueFormatted} < ${alpha})`,
    };
  } else if (isSignificant === false) {
    return {
      symbol: "✗",
      className: "text-red-600",
      // Fix: comparator was missing, producing e.g. "p=0.1230.05".
      tooltip: `Not Statistically Significant (p=${pValueFormatted} >= ${alpha})`,
    };
  } else {
    return {
      symbol: "?",
      className: "text-gray-400",
      tooltip: "Significance Undetermined",
    };
  }
}
/**
 * Determines the style and tooltip for an equity gap status indicator.
 * Branches are checked in priority order: flagged concern, undetermined
 * significance, large-but-not-significant, significant, then neither.
 * @param {boolean|null} isConcern Gap flagged as an equity concern.
 * @param {boolean|null} isLargeEffect Effect size classed as large.
 * @param {boolean|null} isSignificant Statistical significance (null = unknown).
 * @param {number} pValue p-value for the gap; non-numeric renders as "p=N/A".
 * @param {string|null} effectSizeClass Effect size label (e.g. "Large").
 * @returns {{icon: string, colorClass: string, tooltip: string}}
 */
export function getEquityIndicatorStyle(
  isConcern,
  isLargeEffect,
  isSignificant,
  pValue,
  effectSizeClass
) {
  const hasNumericP = typeof pValue === "number" && !isNaN(pValue);
  const pValueText = hasNumericP ? `p=${pValue.toFixed(3)}` : "p=N/A";
  const effectText = `Effect: ${effectSizeClass || "N/A"}`;

  if (isConcern === true) {
    return {
      icon: "▲",
      colorClass: "text-red-600",
      tooltip: `Equity Concern (${effectText}, Significant, ${pValueText})`,
    };
  }
  if (isSignificant === null) {
    return {
      icon: "?",
      colorClass: "text-gray-500",
      tooltip: `Significance Undetermined (${effectText})`,
    };
  }
  if (isLargeEffect === true && isSignificant === false) {
    return {
      icon: "●",
      colorClass: "text-yellow-600",
      tooltip: `Large Effect but Not Statistically Significant (${pValueText})`,
    };
  }
  if (isSignificant === true) {
    return {
      icon: "✓",
      colorClass: "text-green-600",
      tooltip: `Statistically Significant but Not Large Effect (${effectText}, ${pValueText})`,
    };
  }
  return {
    icon: "✓",
    colorClass: "text-gray-400",
    tooltip: `Not Statistically Significant (${effectText}, ${pValueText})`,
  };
}
/**
 * Determine styling based on score for generic BADGES (background + text).
 * The score is mapped to a 0-100 percent position within [min, max]; when
 * min > max the scale is treated as inverted (lower raw score = better).
 * @param {number|string|null} score Raw score; null/undefined/"N/A"/non-numeric => neutral gray.
 * @param {number} [min=0] Scale minimum (may exceed max for inverted scales).
 * @param {number} [max=100] Scale maximum.
 * @returns {string} Tailwind background + text classes.
 */
export function getScoreBadgeColor(score, min = 0, max = 100) {
  const neutral = "bg-gray-100 text-gray-800";
  const numericScore = Number(score);
  const missing =
    score === null ||
    score === undefined ||
    score === "N/A" ||
    isNaN(numericScore);
  if (missing) return neutral;

  const range = Math.abs(max - min);
  if (range <= 0) return neutral; // degenerate scale (min === max)

  const percent =
    max > min
      ? ((numericScore - min) / range) * 100
      : ((min - numericScore) / range) * 100; // inverted scale

  if (percent >= 80) return "bg-green-100 text-green-800";
  if (percent >= 50) return "bg-blue-100 text-blue-800";
  if (percent >= 20) return "bg-yellow-100 text-yellow-800";
  return "bg-red-100 text-red-800";
}
/**
 * Determine TEXT color based on score (0-100 scale, higher is better).
 * @param {number|string|null} score Raw score; null/undefined/"N/A"/non-numeric => muted gray.
 * @returns {string} Tailwind text color class(es).
 */
export function getScoreColor(score) {
  const numericScore = Number(score);
  const invalid =
    score === null ||
    score === undefined ||
    score === "N/A" ||
    isNaN(numericScore);
  if (invalid) return "text-gray-400";
  if (numericScore >= 80) return "text-green-600 font-medium";
  if (numericScore >= 60) return "text-blue-600";
  if (numericScore >= 40) return "text-yellow-600";
  return "text-red-600";
}
/**
 * Tooltip text for metrics and table headers - Accepts original keys.
 * The lookup map is hoisted to module scope (and frozen) so it is built
 * once instead of being re-allocated on every call.
 */
const METRIC_TOOLTIPS = Object.freeze({
  // Use Title Case keys matching dropdowns/headers
  // High-level
  Helpfulness:
    "How well the model provides useful assistance that addresses user needs",
  Communication:
    "Quality of clarity, coherence, and appropriateness of writing style",
  Understanding:
    "How well the model comprehends requests and contextual information",
  Adaptiveness:
    "How well the model adjusts to user needs and feedback during conversation",
  Trustworthiness:
    "Transparency, accuracy, and consistency in model responses",
  Personality:
    "Consistency and definition of the model's persona and ethical alignment",
  "Background And Culture":
    "Cultural sensitivity, relevance, and freedom from bias",
  "Repeat Usage":
    "User satisfaction and willingness to use the model again (score 0-100).",
  // Low-level (use formatted names matching display)
  Effectiveness: "How effectively the model helps accomplish specific goals",
  Comprehensiveness:
    "How thoroughly the model addresses all aspects of requests",
  Usefulness: "Practicality and relevance of suggestions or solutions",
  "Tone And Language Style":
    "Appropriateness of tone and language for the context",
  "Conversation Flow": "Natural and conversational quality of responses",
  "Detail And Technical Language":
    "Appropriate level of detail and technical language",
  Accuracy: "Accuracy in interpreting user requests",
  "Context Memory": "Ability to maintain conversation context",
  Intuitiveness: "Ability to pick up on implicit aspects of requests",
  Flexibility: "Adapting responses based on user feedback",
  Clarity: "Ability to clarify ambiguities or misunderstandings",
  "Conversation Building": "Building upon previous exchanges in conversation",
  Consistency: "Consistency of responses across similar questions",
  Confidence: "User confidence in accuracy of information",
  Transparency: "Openness about limitations or uncertainties",
  "Personality Consistency":
    "Consistency of personality throughout interactions",
  "Distinct Personality": "How well-defined the model's personality is",
  "Honesty Empathy Fairness": "Alignment with ethical expectations",
  "Ethical Alignment": "Alignment with user culture, viewpoint, or values",
  "Cultural Awareness":
    "Recognition of when cultural perspective is relevant",
  "Bias And Stereotypes": "Freedom from stereotypes and bias in responses",
  // Table headers
  "Overall Score":
    "Average score across high-level categories (0-100). Higher is better.",
  "Overall SD":
    "Standard Deviation (± points) of scores across high-level categories. Lower indicates more consistent performance across capabilities.",
  "Max Equity Gap":
    "Score difference (points) for the demographic gap with the largest statistical effect size for this model. Status icon indicates Equity Concern (▲) and/or Significance (✓/✗/?). Hover for details.",
  "Max Gap Area":
    "The specific Demographic Factor and Category where the 'Max Equity Gap' (largest effect size gap) occurred for this model.",
  "Equity Concerns (%)":
    "Percentage of evaluated demographic gaps flagged as Equity Concerns (Large Effect & Statistically Significant, p<0.05). Lower is better.",
  "User Retention":
    "Model score for the 'Repeat Usage' category (0-100), indicating likelihood of users using the model again.",
});

/**
 * Looks up the tooltip for a metric or table-header key.
 * @param {string} key Original key (snake_case, camelCase, or Title Case).
 * @returns {string} Tooltip text, or "No description available".
 */
export const getMetricTooltip = (key) => {
  // Format the key for display/lookup in tooltips map if needed
  const titleCaseKey = formatDisplayKey(key); // Convert snake_case/camelCase to Title Case
  // Try lookup with formatted key, then original key as fallback
  return (
    METRIC_TOOLTIPS[titleCaseKey] ||
    METRIC_TOOLTIPS[key] ||
    "No description available"
  );
};
/**
 * Badge color based on Effect Size Class.
 * @param {string|null} effectSizeClass One of "Negligible" | "Small" |
 *   "Medium" | "Large"; anything else (including "N/A") maps to gray.
 * @returns {string} Tailwind badge classes.
 */
export function getEffectSizeBadgeColor(effectSizeClass) {
  const fallback = "bg-gray-100 text-gray-800";
  if (!effectSizeClass || effectSizeClass === "N/A") {
    return fallback;
  }
  const badgeByClass = {
    Negligible: "bg-green-100 text-green-800",
    Small: "bg-blue-100 text-blue-800",
    Medium: "bg-yellow-100 text-yellow-800",
    Large: "bg-red-100 text-red-800",
  };
  return badgeByClass[effectSizeClass] ?? fallback;
}
/**
 * Helper function to process task performance data.
 * Expects rawData input with snake_case keys.
 *
 * Two passes over rawData.task_level_performance:
 *  1. Per task: find the model with the highest average metric score and
 *     keep that model's top-3 metrics.
 *  2. Per task category (creative/practical/analytical, from taskCategoryMap):
 *     pool every metric score across the category's tasks per model, pick
 *     the best model by pooled average, and keep its top-5 metric averages.
 *
 * @param {Object} rawData Raw leaderboard data; reads `task_level_performance`
 *   shaped as { taskName: { modelName: { metricKey: score } } }.
 * @param {Object} taskCategoryMap Map of task display name -> category key.
 * @param {string[]} modelOrder Known model names, iterated in display order.
 * @returns {Object} { bestModelPerTask, keyMetricsByTask,
 *   bestModelPerTaskCategory, keyMetricsByTaskCategory } — camelCase keys;
 *   missing data yields "N/A" placeholder entries rather than omissions.
 */
function processTaskPerformance(rawData, taskCategoryMap, modelOrder) {
  const result = {
    bestModelPerTask: {},
    keyMetricsByTask: {},
    bestModelPerTaskCategory: {
      creative: null,
      practical: null,
      analytical: null,
    },
    keyMetricsByTaskCategory: { creative: [], practical: [], analytical: [] },
  };
  // Access original snake_case key from input
  const taskPerformance = rawData?.task_level_performance;
  if (!taskPerformance || typeof taskPerformance !== "object") {
    console.warn(
      "Task level performance data missing or invalid in processTaskPerformance input."
    );
    return result;
  }
  // Task names are keys in taskPerformance
  Object.keys(taskPerformance).forEach((taskName) => {
    const taskData = taskPerformance[taskName];
    if (!taskData) return;
    // Track the best (highest average metric score) model for this task.
    let taskBestModel = null;
    let taskBestAvgScore = -Infinity;
    let taskBestModelMetrics = null;
    modelOrder.forEach((modelName) => {
      // Iterate through known models
      const modelMetrics = taskData[modelName];
      if (modelMetrics && typeof modelMetrics === "object") {
        // Access metric scores using original snake_case keys within modelMetrics
        const scores = Object.values(modelMetrics)
          .map((s) => Number(s))
          .filter((s) => !isNaN(s));
        if (scores.length > 0) {
          const avgScore =
            scores.reduce((sum, score) => sum + score, 0) / scores.length;
          if (avgScore > taskBestAvgScore) {
            taskBestAvgScore = avgScore;
            taskBestModel = modelName;
            taskBestModelMetrics = modelMetrics;
          }
        }
      }
    });
    if (taskBestModel && taskBestModelMetrics) {
      result.bestModelPerTask[taskName] = {
        model: taskBestModel,
        score: taskBestAvgScore,
        color: MODEL_COLORS[taskBestModel] || "#999999",
      };
      // Extract top metrics (keys are snake_case), sorted descending by score
      const metricsArray = Object.entries(taskBestModelMetrics)
        .map(([metricKey, score]) => ({ metricKey, score: Number(score) || 0 }))
        .sort((a, b) => b.score - a.score);
      // Store with snake_case key, add display name (top 3 only)
      result.keyMetricsByTask[taskName] = metricsArray
        .slice(0, 3)
        .map((m) => ({ ...m, metricName: formatDisplayKey(m.metricKey) }));
    } else {
      // No usable scores for any model on this task — emit placeholder entry.
      result.bestModelPerTask[taskName] = {
        model: "N/A",
        score: "N/A",
        color: "#999999",
      };
      result.keyMetricsByTask[taskName] = [];
    }
  });
  // Task Categories processing: group tasks that actually have data.
  const tasksByCategory = { creative: [], practical: [], analytical: [] };
  Object.entries(taskCategoryMap).forEach(([task, category]) => {
    if (tasksByCategory[category] && taskPerformance[task]) {
      tasksByCategory[category].push(task);
    }
  });
  Object.entries(tasksByCategory).forEach(([category, tasks]) => {
    const categoryNameDisplay = `${
      category.charAt(0).toUpperCase() + category.slice(1)
    } Tasks`;
    if (tasks.length === 0) {
      // No tasks mapped into this category — emit placeholder entry.
      result.bestModelPerTaskCategory[category] = {
        model: "N/A",
        score: "N/A",
        color: "#999999",
        categoryName: categoryNameDisplay,
      };
      result.keyMetricsByTaskCategory[category] = [];
      return;
    }
    // Accumulate, per model: overall sum/count plus per-metric sums/counts
    // pooled across every task in this category.
    const categoryModelScores = {};
    modelOrder.forEach((modelName) => {
      categoryModelScores[modelName] = { totalScore: 0, count: 0, metrics: {} };
      tasks.forEach((task) => {
        if (taskPerformance[task]?.[modelName]) {
          // metricKey is original snake_case here
          Object.entries(taskPerformance[task][modelName]).forEach(
            ([metricKey, score]) => {
              const numScore = Number(score);
              if (!isNaN(numScore)) {
                categoryModelScores[modelName].totalScore += numScore;
                categoryModelScores[modelName].count++;
                if (!categoryModelScores[modelName].metrics[metricKey])
                  categoryModelScores[modelName].metrics[metricKey] = {
                    sum: 0,
                    count: 0,
                  };
                categoryModelScores[modelName].metrics[metricKey].sum +=
                  numScore;
                categoryModelScores[modelName].metrics[metricKey].count++;
              }
            }
          );
        }
      });
    });
    // Pick the model with the best pooled average for this category.
    let bestAvg = -Infinity;
    let bestCatModel = null;
    Object.entries(categoryModelScores).forEach(([model, data]) => {
      if (data.count > 0) {
        const avg = data.totalScore / data.count;
        if (avg > bestAvg) {
          bestAvg = avg;
          bestCatModel = model;
        }
      }
    });
    if (bestCatModel) {
      result.bestModelPerTaskCategory[category] = {
        model: bestCatModel,
        score: Number(bestAvg.toFixed(1)),
        color: MODEL_COLORS[bestCatModel] || "#999999",
        categoryName: categoryNameDisplay,
      };
      const bestModelMetricsData =
        categoryModelScores[bestCatModel]?.metrics || {};
      // metricKey is snake_case; compute per-metric averages, sorted descending
      const metricAverages = Object.entries(bestModelMetricsData)
        .map(([metricKey, data]) => ({
          metricKey,
          score: data.count > 0 ? data.sum / data.count : 0,
        }))
        .sort((a, b) => b.score - a.score);
      // Store with original key, add display name (top 5 only)
      result.keyMetricsByTaskCategory[category] = metricAverages
        .slice(0, 5)
        .map((m) => ({
          metric: formatDisplayKey(m.metricKey),
          score: m.score,
          scoreDisplay: m.score.toFixed(1),
        }));
    } else {
      // No model had any valid scores for this category's tasks.
      result.bestModelPerTaskCategory[category] = {
        model: "N/A",
        score: "N/A",
        color: "#999999",
        categoryName: categoryNameDisplay,
      };
      result.keyMetricsByTaskCategory[category] = [];
    }
  });
  return result; // Returns object with camelCase keys
}
/**
 * Prepares the data from leaderboard_data.json for visualization.
 * FINAL v4: Reverted deep camelCase conversion. Processes top-level keys and adds equity concern %.
 * Keeps nested raw data keys as original (snake_case).
 *
 * Key-casing contract (important for consumers):
 *  - top-level output keys are camelCase;
 *  - category/metric display keys are Title Case (produced upstream in Python);
 *  - data nested under rawData/equityAnalysis retains original snake_case keys.
 *
 * @param {Object} rawDataInput - The raw data from leaderboard_data.json (expected snake_case)
 * @returns {Object} - Processed data ready for visualization; a safe empty
 *   default structure is returned when input lacks a valid `model_order`.
 */
export function prepareDataForVisualization(rawDataInput) {
  // Basic Validation — shape of the empty fallback mirrors the success path.
  const defaultReturn = {
    models: [],
    metricsData: { highLevelCategories: {}, lowLevelMetrics: {} },
    radarData: [],
    bestPerCategory: {},
    bestPerMetric: {},
    overviewCardData: {},
    rawData: {},
    metadata: {},
    equityAnalysis: {},
  };
  if (
    !rawDataInput ||
    !rawDataInput.model_order ||
    !Array.isArray(rawDataInput.model_order)
  ) {
    console.error(
      "prepareDataForVisualization received invalid rawData.",
      rawDataInput
    );
    return defaultReturn;
  }
  // Keep original references where structure is maintained
  const modelOrder = rawDataInput.model_order;
  // Fallback mirrors the expected equity_analysis shape so downstream access is safe.
  const equityAnalysis = rawDataInput.equity_analysis || {
    all_equity_gaps: [],
    model_max_effect_gaps: {},
    universal_issues: [],
    assessment_method: {},
    demographic_variation_stats: {},
  };
  const allGaps = equityAnalysis.all_equity_gaps || [];
  const metadata = rawDataInput.metadata || {};
  const mrpDemographicsRaw = rawDataInput.mrp_demographics || {};
  const taskLevelPerformanceRaw = rawDataInput.task_level_performance || {};
  // Process MRP Demographics for filtering options: collect the set of
  // factors, the levels seen per factor, and every metric name encountered.
  const demographicFactors = new Set();
  const demographicLevels = {};
  const availableMetrics = new Set();
  if (mrpDemographicsRaw && typeof mrpDemographicsRaw === "object") {
    Object.values(mrpDemographicsRaw).forEach((modelData) => {
      Object.entries(modelData || {}).forEach(([factor, factorData]) => {
        demographicFactors.add(factor);
        if (!demographicLevels[factor]) demographicLevels[factor] = new Set();
        Object.entries(factorData || {}).forEach(([level, levelData]) => {
          demographicLevels[factor].add(level);
          Object.keys(levelData || {}).forEach((metric) =>
            availableMetrics.add(metric)
          );
        });
      });
    }); // metric is Title Case here from Python processing
  }
  // Flatten level sets into sorted arrays for dropdown options.
  const demographicOptions = {};
  demographicFactors.forEach((factor) => {
    demographicOptions[factor] = Array.from(
      demographicLevels[factor] || new Set()
    ).sort();
  });
  const availableMetricsList = Array.from(availableMetrics).sort(); // These are Title Case
  // Process Overall Rankings -> camelCase & add equity concern %
  const overallRankingProcessed = (rawDataInput.overall_ranking || []).map(
    (modelData) => {
      const modelName = modelData.model;
      // details object keys are snake_case from python
      const maxEffectGapDetails = modelData.max_effect_gap_details || {};
      // Coerce to number, mapping non-numeric values to null (not NaN).
      const safeParseFloat = (val) => {
        const num = Number(val);
        return isNaN(num) ? null : num;
      };
      const modelSpecificGaps = allGaps.filter(
        (gap) => gap.model === modelName
      ); // Access snake_case keys in allGaps
      const totalGapsForModel = modelSpecificGaps.length;
      const concernCountForModel = modelSpecificGaps.filter(
        (gap) => gap.is_equity_concern === true
      ).length;
      // Percentage of this model's evaluated gaps flagged as equity concerns;
      // null when the model has no evaluated gaps (avoids divide-by-zero).
      let equityConcernPercentage = null;
      if (totalGapsForModel > 0) {
        equityConcernPercentage =
          (concernCountForModel / totalGapsForModel) * 100;
      }
      // Return structure with camelCase keys
      return {
        rank: modelData.rank,
        model: modelName,
        overallScore: safeParseFloat(modelData.overall_score),
        highLevelCatScore: safeParseFloat(modelData.high_level_cat_score),
        lowLevelCatScore: safeParseFloat(modelData.low_level_cat_score),
        color: MODEL_COLORS[modelName] || "#999999",
        // Use snake_case keys from input JSON for these fields
        stdDevAcrossCats: modelData.std_dev_across_cats,
        stdDevAcrossCatsNumeric: safeParseFloat(modelData.std_dev_across_cats),
        repeatUsageScore: safeParseFloat(modelData.repeat_usage_score),
        maxEffectCategory: modelData.max_effect_category, // snake_case from input
        maxEffectFactor: maxEffectGapDetails.demographic_factor, // snake_case from input
        maxEffectSize: safeParseFloat(maxEffectGapDetails.effect_size),
        maxEffectGap: safeParseFloat(maxEffectGapDetails.score_range),
        maxEffectConcernFlag: maxEffectGapDetails.is_equity_concern ?? false,
        maxEffectSignificant: maxEffectGapDetails.is_statistically_significant,
        maxEffectPValue: maxEffectGapDetails.p_value,
        maxEffectSizeClass: maxEffectGapDetails.effect_size_class || "N/A",
        maxEffectRawNHeuristic:
          maxEffectGapDetails.raw_n_confidence_heuristic || "N/A",
        maxEffectGapDetails: maxEffectGapDetails, // Pass original snake_case details
        equityConcernPercentage: equityConcernPercentage,
      };
    }
  );
  // Process Metrics Breakdown -> camelCase keys for structure, keep original metric keys inside
  const metricsBreakdownProcessed = {
    highLevelCategories: {},
    lowLevelMetrics: {},
  };
  if (
    rawDataInput.metrics_breakdown &&
    typeof rawDataInput.metrics_breakdown === "object"
  ) {
    // Shared transform for both high-level categories and low-level metrics:
    // normalizes per-model scores and the top-performer record.
    const processCategory = (displayKey, categoryData) => {
      // Input displayKey is Title Case from python output
      if (!categoryData || !categoryData.model_scores) {
        console.warn(`Missing model_scores for category: ${displayKey}`);
        return {
          modelScores: {},
          topPerformer: { model: "N/A", score: null, color: "#999999" },
        };
      }
      const internalMetricKey = categoryData._internal_category_name; // Get original snake_case key
      const processedModelScores = {};
      modelOrder.forEach((modelName) => {
        const scores = categoryData.model_scores[modelName]; // Access model scores
        if (!scores) {
          // Model missing for this category: placeholder keeps shape stable.
          processedModelScores[modelName] = {
            nationalScore: null,
            color: MODEL_COLORS[modelName] || "#999999",
            maxEffectGapInfo: {},
          };
          return;
        }
        const maxEffectGapInfoForCat = scores.max_effect_gap_info || {}; // presumably snake_case keys inside (matches python output) — verify upstream
        processedModelScores[modelName] = {
          nationalScore: scores.national_score ?? null,
          color: MODEL_COLORS[modelName] || "#999999",
          // Keep original snake_case keys for gap info within this structure
          maxEffectGapInfo: maxEffectGapInfoForCat,
        };
      });
      const topPerf = categoryData.top_performer || {};
      const topPerfScore =
        topPerf.score === "N/A" || topPerf.score === null
          ? null
          : Number(topPerf.score);
      return {
        modelScores: processedModelScores, // Nested scores
        topPerformer: {
          model: topPerf.model || "N/A",
          score: isNaN(topPerfScore) ? null : topPerfScore,
          color: MODEL_COLORS[topPerf.model] || "#999999",
        },
        internalMetricKey: internalMetricKey, // Store original snake_case key
      };
    };
    Object.entries(
      rawDataInput.metrics_breakdown.high_level_categories || {}
    ).forEach(([displayKey, catData]) => {
      metricsBreakdownProcessed.highLevelCategories[displayKey] =
        processCategory(displayKey, catData);
    });
    Object.entries(
      rawDataInput.metrics_breakdown.low_level_metrics || {}
    ).forEach(([displayKey, metricData]) => {
      metricsBreakdownProcessed.lowLevelMetrics[displayKey] = processCategory(
        displayKey,
        metricData
      );
    });
  } else {
    console.warn("rawDataInput.metrics_breakdown is missing or not an object.");
  }
  // Prepare Radar Chart Data: one row per high-level category, one numeric
  // column per model (missing/invalid scores become 0).
  const radarChartData = Object.entries(
    metricsBreakdownProcessed.highLevelCategories
  ).map(([displayKey, categoryData]) => {
    // displayKey is Title Case here
    const radarEntry = { category: displayKey }; // Use Title Case for radar axis label
    modelOrder.forEach((modelName) => {
      radarEntry[modelName] =
        Number(categoryData.modelScores[modelName]?.nationalScore) || 0;
    });
    return radarEntry;
  });
  // Prepare Top Performers (per high-level category and per low-level metric)
  const bestPerCategory = {};
  Object.entries(metricsBreakdownProcessed.highLevelCategories).forEach(
    ([displayKey, catData]) => {
      bestPerCategory[displayKey] = catData.topPerformer;
    }
  );
  const bestPerMetric = {};
  Object.entries(metricsBreakdownProcessed.lowLevelMetrics).forEach(
    ([displayKey, metricData]) => {
      bestPerMetric[displayKey] = metricData.topPerformer;
    }
  );
  // Prepare Task Performance Data: hard-coded task -> category mapping
  // (task display names must match keys in task_level_performance).
  const taskCategoryMap = {
    "Generating a Creative Idea": "creative",
    "Creating a Travel Itinerary": "creative",
    "Following Up on a Job Application": "practical",
    "Planning Your Weekly Meals": "practical",
    "Making a Decision Between Options": "analytical",
    "Understanding a Complex Topic": "analytical",
  };
  // Pass the original rawDataInput to the helper, which expects snake_case keys internally
  const taskPerformanceResults = processTaskPerformance(
    rawDataInput,
    taskCategoryMap,
    modelOrder
  );
  const tasks = Object.keys(taskLevelPerformanceRaw || {}); // Use original snake_case keys
  // Group the tasks that actually appear in the data by category.
  const taskCategories = {};
  Object.entries(taskCategoryMap).forEach(([task, category]) => {
    if (!taskCategories[category]) taskCategories[category] = [];
    if (tasks.includes(task)) taskCategories[category].push(task);
  });
  // Collect the union of metric keys seen across all tasks/models.
  const taskMetrics = new Set();
  Object.values(taskLevelPerformanceRaw || {}).forEach((taskData) => {
    Object.values(taskData || {}).forEach((modelData) => {
      Object.keys(modelData || {}).forEach((metric) => taskMetrics.add(metric));
    });
  }); // metric is snake_case
  const taskMetricsDisplayList = Array.from(taskMetrics)
    .map(formatDisplayKey)
    .sort(); // Create display list
  const taskMetricsSnakeList = Array.from(taskMetrics).sort(); // List of original snake_case keys
  // Final Return Structure
  return {
    models: overallRankingProcessed, // camelCase keys for top level
    metricsData: metricsBreakdownProcessed, // Title Case keys for categories/metrics
    radarData: radarChartData,
    bestPerCategory: bestPerCategory, // Title Case keys
    bestPerMetric: bestPerMetric, // Title Case keys
    overviewCardData: taskPerformanceResults, // camelCase keys expected from helper
    rawData: {
      // Keep original structures under camelCase keys for clarity
      taskLevelPerformance: taskLevelPerformanceRaw, // snake_case keys inside
      mrpDemographics: mrpDemographicsRaw, // Title Case metric keys inside
      // Processed lists/maps for filtering/display
      demographicOptions: demographicOptions,
      availableMetrics: availableMetricsList, // Title Case metric names
      tasks: tasks,
      taskCategories: taskCategories,
      taskMetrics: taskMetricsDisplayList, // Title Case metric names for display
      taskMetricsSnake: taskMetricsSnakeList, // snake_case keys for lookup
      taskCategoryMap: taskCategoryMap,
    },
    metadata: metadata, // Original structure
    equityAnalysis: equityAnalysis, // Original structure (snake_case keys)
  };
}