// lib/utils.js

/**
 * Constants
 */
const MODEL_COLORS = {
  "gpt-4o": "#0072B2", // Strong blue
  "claude-3.7-sonnet": "#D55E00", // Vermillion/orange-red
  "deepseek-r1": "#F0E442", // Yellow
  o1: "#CC79A7", // Pink
  "gemini-2.0-flash-001": "#009E73", // Bluish green
  "llama-3.1-405b-instruct": "#56B4E9", // Light blue
};

/** Color used whenever a model has no entry in MODEL_COLORS. */
const FALLBACK_COLOR = "#999999";

/** Badge classes used when a score / effect size cannot be classified. */
const NEUTRAL_BADGE_CLASS = "bg-gray-100 text-gray-800";

/** Placeholder result for a task or task category that has no usable scores. */
const NO_BEST_MODEL = { model: "N/A", score: "N/A", color: FALLBACK_COLOR };

/**
 * Tooltip text keyed by Title Case display name.
 * Built once at module load instead of on every getMetricTooltip() call.
 */
const METRIC_TOOLTIPS = {
  // High-level
  Helpfulness:
    "How well the model provides useful assistance that addresses user needs",
  Communication:
    "Quality of clarity, coherence, and appropriateness of writing style",
  Understanding:
    "How well the model comprehends requests and contextual information",
  Adaptiveness:
    "How well the model adjusts to user needs and feedback during conversation",
  Trustworthiness: "Transparency, accuracy, and consistency in model responses",
  Personality:
    "Consistency and definition of the model's persona and ethical alignment",
  "Background And Culture":
    "Cultural sensitivity, relevance, and freedom from bias",
  "Repeat Usage":
    "User satisfaction and willingness to use the model again (score 0-100).",
  // Low-level (use formatted names matching display)
  Effectiveness: "How effectively the model helps accomplish specific goals",
  Comprehensiveness:
    "How thoroughly the model addresses all aspects of requests",
  Usefulness: "Practicality and relevance of suggestions or solutions",
  "Tone And Language Style":
    "Appropriateness of tone and language for the context",
  "Conversation Flow": "Natural and conversational quality of responses",
  "Detail And Technical Language":
    "Appropriate level of detail and technical language",
  Accuracy: "Accuracy in interpreting user requests",
  "Context Memory": "Ability to maintain conversation context",
  Intuitiveness: "Ability to pick up on implicit aspects of requests",
  Flexibility: "Adapting responses based on user feedback",
  Clarity: "Ability to clarify ambiguities or misunderstandings",
  "Conversation Building": "Building upon previous exchanges in conversation",
  Consistency: "Consistency of responses across similar questions",
  Confidence: "User confidence in accuracy of information",
  Transparency: "Openness about limitations or uncertainties",
  "Personality Consistency":
    "Consistency of personality throughout interactions",
  "Distinct Personality": "How well-defined the model's personality is",
  "Honesty Empathy Fairness": "Alignment with ethical expectations",
  "Ethical Alignment": "Alignment with user culture, viewpoint, or values",
  "Cultural Awareness": "Recognition of when cultural perspective is relevant",
  "Bias And Stereotypes": "Freedom from stereotypes and bias in responses",
  // Table headers
  "Overall Score":
    "Average score across high-level categories (0-100). Higher is better.",
  "Overall SD":
    "Standard Deviation (± points) of scores across high-level categories. Lower indicates more consistent performance across capabilities.",
  "Max Equity Gap":
    "Score difference (points) for the demographic gap with the largest statistical effect size for this model. Status icon indicates Equity Concern (▲) and/or Significance (✓/✗/?). Hover for details.",
  "Max Gap Area":
    "The specific Demographic Factor and Category where the 'Max Equity Gap' (largest effect size gap) occurred for this model.",
  "Equity Concerns (%)":
    "Percentage of evaluated demographic gaps flagged as Equity Concerns (Large Effect & Statistically Significant, p<0.05). Lower is better.",
  "User Retention":
    "Model score for the 'Repeat Usage' category (0-100), indicating likelihood of users using the model again.",
};

// --- Internal helpers ---

/**
 * Own-property check that ignores inherited keys such as "constructor".
 * @param {Object} obj Object to inspect.
 * @param {*} key Property key.
 * @returns {boolean} True when `key` is an own property of `obj`.
 */
const hasOwn = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

/**
 * Strict numeric parser. Accepts numbers and non-blank numeric strings only.
 * Unlike a bare `Number(value)`, it does NOT turn null, "", booleans or
 * arrays into 0/1, so missing data stays distinguishable from a real zero.
 * @param {*} value Candidate value.
 * @returns {number|null} The parsed number, or null when not numeric.
 */
function toNumberOrNull(value) {
  if (typeof value === "number") {
    return Number.isNaN(value) ? null : value;
  }
  if (typeof value === "string" && value.trim() !== "") {
    const parsed = Number(value);
    return Number.isNaN(parsed) ? null : parsed;
  }
  return null;
}

/**
 * Looks up the chart color for a model.
 * @param {string} modelName Model identifier (a MODEL_COLORS key).
 * @returns {string} Hex color, or FALLBACK_COLOR for unknown models.
 */
function getModelColor(modelName) {
  return hasOwn(MODEL_COLORS, modelName)
    ? MODEL_COLORS[modelName]
    : FALLBACK_COLOR;
}

// --- Helper Functions ---

/**
 * Converts camelCase to Title Case.
 * @param {string} str Input string.
 * @returns {string} Title Case string (falsy input is returned unchanged).
 */
export const camelToTitle = (str) => {
  if (!str) return str;
  // Trim BEFORE capitalizing: an input that starts with an uppercase letter
  // gets a leading space from the replace, which must not survive.
  const spaced = str.replace(/([A-Z])/g, " $1").trim();
  return spaced.charAt(0).toUpperCase() + spaced.slice(1);
};

/**
 * Helper to format metric/factor names (snake/kebab to Title Case).
 * Needed for display consistency when keys are snake_case.
 * @param {string} key snake_case or kebab-case key.
 * @returns {string} Title Case label, or "N/A" for empty / non-string input.
 */
export const formatDisplayKey = (key) => {
  if (!key || typeof key !== "string") return "N/A";
  if (key === "N/A") return "N/A";
  // Handle snake_case or kebab-case input
  return key
    .replace(/_/g, " ")
    .replace(/-/g, " ")
    .trim()
    .replace(/\b\w/g, (l) => l.toUpperCase());
};

/**
 * Helper to get Significance indicator style and tooltip.
 * @param {boolean|null|undefined} isSignificant Significance flag.
 * @param {number} pValue p-value; anything that is not a real number shows as "N/A".
 * @param {number} [alpha=0.05] Threshold quoted in the tooltip text.
 * @returns {{symbol: string, className: string, tooltip: string}} Indicator.
 */
export function getSignificanceIndicator(isSignificant, pValue, alpha = 0.05) {
  const pValueFormatted =
    typeof pValue === "number" && !Number.isNaN(pValue)
      ? pValue.toFixed(3)
      : "N/A";

  if (isSignificant === true) {
    return {
      symbol: "✓",
      className: "text-green-600",
      tooltip: `Statistically Significant (p=${pValueFormatted} < ${alpha})`,
    };
  }
  if (isSignificant === false) {
    return {
      symbol: "✗",
      className: "text-red-600",
      tooltip: `Not Statistically Significant (p=${pValueFormatted} ≥ ${alpha})`,
    };
  }
  return {
    symbol: "?",
    className: "text-gray-400",
    tooltip: "Significance Undetermined",
  };
}

/**
 * Determines the style and tooltip for an equity gap status indicator.
 * Branch priority: equity concern, undetermined significance, large but not
 * significant effect, significant effect, everything else.
 * @param {boolean} isConcern Whether the gap is flagged as an equity concern.
 * @param {boolean} isLargeEffect Whether the effect is classed as large.
 * @param {boolean|null|undefined} isSignificant Significance flag.
 * @param {number} pValue p-value; non-numbers show as "p=N/A".
 * @param {string} effectSizeClass Effect size label quoted in the tooltip.
 * @returns {{icon: string, colorClass: string, tooltip: string}} Indicator.
 */
export function getEquityIndicatorStyle(
  isConcern,
  isLargeEffect,
  isSignificant,
  pValue,
  effectSizeClass
) {
  const pValueText =
    typeof pValue === "number" && !Number.isNaN(pValue)
      ? `p=${pValue.toFixed(3)}`
      : "p=N/A";
  const effectText = `Effect: ${effectSizeClass || "N/A"}`;

  if (isConcern === true) {
    return {
      icon: "▲",
      colorClass: "text-red-600",
      tooltip: `Equity Concern (${effectText}, Significant, ${pValueText})`,
    };
  }
  // `== null` on purpose: a missing flag (undefined) is just as undetermined
  // as an explicit null, matching getSignificanceIndicator().
  if (isSignificant == null) {
    return {
      icon: "?",
      colorClass: "text-gray-500",
      tooltip: `Significance Undetermined (${effectText})`,
    };
  }
  if (isLargeEffect === true && isSignificant === false) {
    return {
      icon: "●",
      colorClass: "text-yellow-600",
      tooltip: `Large Effect but Not Statistically Significant (${pValueText})`,
    };
  }
  if (isSignificant === true) {
    return {
      icon: "✓",
      colorClass: "text-green-600",
      tooltip: `Statistically Significant but Not Large Effect (${effectText}, ${pValueText})`,
    };
  }
  return {
    icon: "✓",
    colorClass: "text-gray-400",
    tooltip: `Not Statistically Significant (${effectText}, ${pValueText})`,
  };
}

/**
 * Determine styling based on score for generic BADGES (background + text).
 * The score is mapped to a 0-100 percentage of the [min, max] range; when
 * max < min the scale is inverted (a score equal to `max` maps to 100%).
 * @param {number|string} score Score to classify.
 * @param {number} [min=0] Scale start.
 * @param {number} [max=100] Scale end.
 * @returns {string} CSS utility classes for the badge.
 */
export function getScoreBadgeColor(score, min = 0, max = 100) {
  const numericScore = toNumberOrNull(score);
  if (numericScore === null) return NEUTRAL_BADGE_CLASS;

  const range = Math.abs(max - min);
  // Written as !(range > 0) so a NaN range is also treated as unusable.
  if (!(range > 0)) return NEUTRAL_BADGE_CLASS;

  const percent =
    max > min
      ? ((numericScore - min) / range) * 100
      : ((min - numericScore) / range) * 100;

  if (percent >= 80) return "bg-green-100 text-green-800";
  if (percent >= 50) return "bg-blue-100 text-blue-800";
  if (percent >= 20) return "bg-yellow-100 text-yellow-800";
  return "bg-red-100 text-red-800";
}

/**
 * Determine TEXT color based on score (0-100 scale, higher is better).
 * @param {number|string} score Score to classify.
 * @returns {string} CSS utility classes for the text.
 */
export function getScoreColor(score) {
  const numericScore = toNumberOrNull(score);
  if (numericScore === null) return "text-gray-400";

  if (numericScore >= 80) return "text-green-600 font-medium";
  if (numericScore >= 60) return "text-blue-600";
  if (numericScore >= 40) return "text-yellow-600";
  return "text-red-600";
}

/**
 * Tooltip text for metrics and table headers - accepts original keys.
 * Lookup order: snake/kebab key formatted to Title Case, the key as given,
 * then the key interpreted as camelCase.
 * @param {string} key Metric key or header label.
 * @returns {string} Tooltip text, or "No description available".
 */
export const getMetricTooltip = (key) => {
  const candidates = [formatDisplayKey(key), key];
  if (typeof key === "string" && key) {
    candidates.push(formatDisplayKey(camelToTitle(key)));
  }

  for (const candidate of candidates) {
    // Own-property check so keys like "constructor" never resolve to
    // something inherited from Object.prototype.
    if (typeof candidate === "string" && hasOwn(METRIC_TOOLTIPS, candidate)) {
      return METRIC_TOOLTIPS[candidate];
    }
  }
  return "No description available";
};

/**
 * Badge color based on Effect Size Class.
 * @param {string} effectSizeClass "Negligible", "Small", "Medium" or "Large".
 * @returns {string} CSS utility classes; neutral gray for anything else.
 */
export function getEffectSizeBadgeColor(effectSizeClass) {
  if (!effectSizeClass || effectSizeClass === "N/A") {
    return NEUTRAL_BADGE_CLASS;
  }
  switch (effectSizeClass) {
    case "Negligible":
      return "bg-green-100 text-green-800";
    case "Small":
      return "bg-blue-100 text-blue-800";
    case "Medium":
      return "bg-yellow-100 text-yellow-800";
    case "Large":
      return "bg-red-100 text-red-800";
    default:
      return NEUTRAL_BADGE_CLASS;
  }
}

/**
 * Helper function to process task performance data.
 * Reads `rawData.task_level_performance`, indexed as
 * `[taskName][modelName][metricKey] -> score`.
 * Non-numeric scores (null, "", "N/A", ...) are ignored when averaging so
 * that missing data is not counted as a score of 0.
 * @param {Object} rawData Raw leaderboard data (snake_case keys).
 * @param {Object} taskCategoryMap Map of task name -> "creative" | "practical" | "analytical".
 * @param {string[]} modelOrder Models to consider, in priority order (first wins ties).
 * @returns {Object} bestModelPerTask, keyMetricsByTask, bestModelPerTaskCategory
 *   and keyMetricsByTaskCategory.
 */
function processTaskPerformance(rawData, taskCategoryMap, modelOrder) {
  const result = {
    bestModelPerTask: {},
    keyMetricsByTask: {},
    bestModelPerTaskCategory: {
      creative: null,
      practical: null,
      analytical: null,
    },
    keyMetricsByTaskCategory: { creative: [], practical: [], analytical: [] },
  };

  // Access original snake_case key from input
  const taskPerformance = rawData?.task_level_performance;
  if (!taskPerformance || typeof taskPerformance !== "object") {
    console.warn(
      "Task level performance data missing or invalid in processTaskPerformance input."
    );
    return result;
  }

  const models = Array.isArray(modelOrder) ? modelOrder : [];
  const categoryMap =
    taskCategoryMap && typeof taskCategoryMap === "object"
      ? taskCategoryMap
      : {};

  // --- Per-task best model ---
  Object.entries(taskPerformance).forEach(([taskName, taskData]) => {
    if (!taskData) return;

    let taskBestModel = null;
    let taskBestAvgScore = -Infinity;
    let taskBestModelMetrics = null;

    models.forEach((modelName) => {
      const modelMetrics = taskData[modelName];
      if (!modelMetrics || typeof modelMetrics !== "object") return;

      const scores = Object.values(modelMetrics)
        .map((value) => toNumberOrNull(value))
        .filter((value) => value !== null);
      if (scores.length === 0) return;

      const avgScore =
        scores.reduce((sum, score) => sum + score, 0) / scores.length;
      // Strict ">" keeps the earliest model in modelOrder on ties.
      if (avgScore > taskBestAvgScore) {
        taskBestAvgScore = avgScore;
        taskBestModel = modelName;
        taskBestModelMetrics = modelMetrics;
      }
    });

    if (!taskBestModel || !taskBestModelMetrics) {
      result.bestModelPerTask[taskName] = { ...NO_BEST_MODEL };
      result.keyMetricsByTask[taskName] = [];
      return;
    }

    result.bestModelPerTask[taskName] = {
      model: taskBestModel,
      score: taskBestAvgScore,
      color: getModelColor(taskBestModel),
    };

    // Top 3 metrics of the winning model; non-numeric metrics rank as 0.
    result.keyMetricsByTask[taskName] = Object.entries(taskBestModelMetrics)
      .map(([metricKey, score]) => ({
        metricKey,
        score: toNumberOrNull(score) ?? 0,
      }))
      .sort((a, b) => b.score - a.score)
      .slice(0, 3)
      .map((m) => ({ ...m, metricName: formatDisplayKey(m.metricKey) }));
  });

  // --- Per-category best model ---
  const tasksByCategory = { creative: [], practical: [], analytical: [] };
  Object.entries(categoryMap).forEach(([task, category]) => {
    if (hasOwn(tasksByCategory, category) && taskPerformance[task]) {
      tasksByCategory[category].push(task);
    }
  });

  Object.entries(tasksByCategory).forEach(([category, tasks]) => {
    const categoryNameDisplay = `${
      category.charAt(0).toUpperCase() + category.slice(1)
    } Tasks`;
    const markCategoryEmpty = () => {
      result.bestModelPerTaskCategory[category] = {
        ...NO_BEST_MODEL,
        categoryName: categoryNameDisplay,
      };
      result.keyMetricsByTaskCategory[category] = [];
    };

    if (tasks.length === 0) {
      markCategoryEmpty();
      return;
    }

    // Accumulate, per model, the overall and per-metric score sums across
    // every task in this category.
    const categoryModelScores = {};
    models.forEach((modelName) => {
      const totals = { totalScore: 0, count: 0, metrics: {} };
      categoryModelScores[modelName] = totals;

      tasks.forEach((task) => {
        const modelMetrics = taskPerformance[task]?.[modelName];
        if (!modelMetrics) return;

        Object.entries(modelMetrics).forEach(([metricKey, score]) => {
          const numScore = toNumberOrNull(score);
          if (numScore === null) return;

          totals.totalScore += numScore;
          totals.count += 1;
          if (!totals.metrics[metricKey]) {
            totals.metrics[metricKey] = { sum: 0, count: 0 };
          }
          totals.metrics[metricKey].sum += numScore;
          totals.metrics[metricKey].count += 1;
        });
      });
    });

    let bestAvg = -Infinity;
    let bestCatModel = null;
    Object.entries(categoryModelScores).forEach(([model, data]) => {
      if (data.count === 0) return;
      const avg = data.totalScore / data.count;
      if (avg > bestAvg) {
        bestAvg = avg;
        bestCatModel = model;
      }
    });

    if (!bestCatModel) {
      markCategoryEmpty();
      return;
    }

    result.bestModelPerTaskCategory[category] = {
      model: bestCatModel,
      score: Number(bestAvg.toFixed(1)),
      color: getModelColor(bestCatModel),
      categoryName: categoryNameDisplay,
    };

    // Top 5 metric averages of the winning model, with display names.
    result.keyMetricsByTaskCategory[category] = Object.entries(
      categoryModelScores[bestCatModel].metrics
    )
      .map(([metricKey, data]) => ({
        metricKey,
        score: data.count > 0 ? data.sum / data.count : 0,
      }))
      .sort((a, b) => b.score - a.score)
      .slice(0, 5)
      .map((m) => ({
        metric: formatDisplayKey(m.metricKey),
        score: m.score,
        scoreDisplay: m.score.toFixed(1),
      }));
  });

  return result; // Returns object with camelCase keys
}

/**
 * Converts one `metrics_breakdown` entry (category or low-level metric) into
 * the camelCase structure used by the UI. Nested `max_effect_gap_info`
 * objects are passed through untouched.
 * @param {string} displayKey Key of the entry (used for the warning message).
 * @param {Object} categoryData Raw entry with `model_scores`, `top_performer`
 *   and `_internal_category_name`.
 * @param {string[]} modelOrder Models to emit scores for.
 * @returns {Object} `{ modelScores, topPerformer, internalMetricKey }`; the
 *   last key is omitted when `model_scores` is missing.
 */
function processMetricCategory(displayKey, categoryData, modelOrder) {
  if (!categoryData || !categoryData.model_scores) {
    console.warn(`Missing model_scores for category: ${displayKey}`);
    return {
      modelScores: {},
      topPerformer: { model: "N/A", score: null, color: FALLBACK_COLOR },
    };
  }

  const modelScores = {};
  modelOrder.forEach((modelName) => {
    const scores = categoryData.model_scores[modelName];
    modelScores[modelName] = {
      nationalScore: scores ? scores.national_score ?? null : null,
      color: getModelColor(modelName),
      // Keep original snake_case keys for gap info within this structure
      maxEffectGapInfo: (scores && scores.max_effect_gap_info) || {},
    };
  });

  const topPerf = categoryData.top_performer || {};
  return {
    modelScores,
    topPerformer: {
      model: topPerf.model || "N/A",
      // "N/A", null, "" and other non-numeric values all become null.
      score: toNumberOrNull(topPerf.score),
      color: getModelColor(topPerf.model),
    },
    internalMetricKey: categoryData._internal_category_name,
  };
}

/**
 * Prepares the data from leaderboard_data.json for visualization.
 * Processes top-level keys into camelCase and adds the equity concern %;
 * nested raw data keeps its original keys.
 * @param {Object} rawDataInput - The raw data from leaderboard_data.json (expected snake_case)
 * @returns {Object} - Processed data ready for visualization. When the input
 *   is missing or has no `model_order` array, an empty default structure is
 *   returned and an error is logged.
 */
export function prepareDataForVisualization(rawDataInput) {
  // Basic Validation
  const defaultReturn = {
    models: [],
    metricsData: { highLevelCategories: {}, lowLevelMetrics: {} },
    radarData: [],
    bestPerCategory: {},
    bestPerMetric: {},
    overviewCardData: {},
    rawData: {},
    metadata: {},
    equityAnalysis: {},
  };
  if (!rawDataInput || !Array.isArray(rawDataInput.model_order)) {
    console.error(
      "prepareDataForVisualization received invalid rawData.",
      rawDataInput
    );
    return defaultReturn;
  }

  // Keep original references where structure is maintained
  const modelOrder = rawDataInput.model_order;
  const equityAnalysis = rawDataInput.equity_analysis || {
    all_equity_gaps: [],
    model_max_effect_gaps: {},
    universal_issues: [],
    assessment_method: {},
    demographic_variation_stats: {},
  };
  const allGaps = Array.isArray(equityAnalysis.all_equity_gaps)
    ? equityAnalysis.all_equity_gaps
    : [];
  const metadata = rawDataInput.metadata || {};
  const mrpDemographicsRaw = rawDataInput.mrp_demographics || {};
  const taskLevelPerformanceRaw = rawDataInput.task_level_performance || {};

  // Process MRP Demographics for filtering options.
  // Structure walked: [model][factor][level][metric].
  const demographicLevels = {};
  const availableMetrics = new Set();
  if (typeof mrpDemographicsRaw === "object") {
    Object.values(mrpDemographicsRaw).forEach((modelData) => {
      Object.entries(modelData || {}).forEach(([factor, factorData]) => {
        if (!hasOwn(demographicLevels, factor)) {
          demographicLevels[factor] = new Set();
        }
        Object.entries(factorData || {}).forEach(([level, levelData]) => {
          demographicLevels[factor].add(level);
          Object.keys(levelData || {}).forEach((metric) =>
            availableMetrics.add(metric)
          );
        });
      });
    });
  }
  const demographicOptions = {};
  Object.entries(demographicLevels).forEach(([factor, levels]) => {
    demographicOptions[factor] = Array.from(levels).sort();
  });
  const availableMetricsList = Array.from(availableMetrics).sort();

  // Count gaps per model in a single pass instead of re-filtering the whole
  // gap list for every ranked model.
  const gapStatsByModel = new Map();
  allGaps.forEach((gap) => {
    if (!gap) return;
    const stats = gapStatsByModel.get(gap.model) ?? { total: 0, concerns: 0 };
    stats.total += 1;
    if (gap.is_equity_concern === true) stats.concerns += 1;
    gapStatsByModel.set(gap.model, stats);
  });

  // Process Overall Rankings -> camelCase & add equity concern %
  const overallRanking = Array.isArray(rawDataInput.overall_ranking)
    ? rawDataInput.overall_ranking
    : [];
  const overallRankingProcessed = overallRanking.map((modelData) => {
    const modelName = modelData.model;
    // details object keys are snake_case in the input
    const maxEffectGapDetails = modelData.max_effect_gap_details || {};

    const gapStats = gapStatsByModel.get(modelName);
    const equityConcernPercentage =
      gapStats && gapStats.total > 0
        ? (gapStats.concerns / gapStats.total) * 100
        : null;

    // Return structure with camelCase keys
    return {
      rank: modelData.rank,
      model: modelName,
      overallScore: toNumberOrNull(modelData.overall_score),
      highLevelCatScore: toNumberOrNull(modelData.high_level_cat_score),
      lowLevelCatScore: toNumberOrNull(modelData.low_level_cat_score),
      color: getModelColor(modelName),
      stdDevAcrossCats: modelData.std_dev_across_cats,
      stdDevAcrossCatsNumeric: toNumberOrNull(modelData.std_dev_across_cats),
      repeatUsageScore: toNumberOrNull(modelData.repeat_usage_score),
      maxEffectCategory: modelData.max_effect_category,
      maxEffectFactor: maxEffectGapDetails.demographic_factor,
      maxEffectSize: toNumberOrNull(maxEffectGapDetails.effect_size),
      maxEffectGap: toNumberOrNull(maxEffectGapDetails.score_range),
      maxEffectConcernFlag: maxEffectGapDetails.is_equity_concern ?? false,
      maxEffectSignificant: maxEffectGapDetails.is_statistically_significant,
      maxEffectPValue: maxEffectGapDetails.p_value,
      maxEffectSizeClass: maxEffectGapDetails.effect_size_class || "N/A",
      maxEffectRawNHeuristic:
        maxEffectGapDetails.raw_n_confidence_heuristic || "N/A",
      maxEffectGapDetails: maxEffectGapDetails, // Pass original snake_case details
      equityConcernPercentage: equityConcernPercentage,
    };
  });

  // Process Metrics Breakdown -> camelCase keys for structure, keep original metric keys inside
  const metricsBreakdownProcessed = {
    highLevelCategories: {},
    lowLevelMetrics: {},
  };
  const metricsBreakdown = rawDataInput.metrics_breakdown;
  if (metricsBreakdown && typeof metricsBreakdown === "object") {
    Object.entries(metricsBreakdown.high_level_categories || {}).forEach(
      ([displayKey, catData]) => {
        metricsBreakdownProcessed.highLevelCategories[displayKey] =
          processMetricCategory(displayKey, catData, modelOrder);
      }
    );
    Object.entries(metricsBreakdown.low_level_metrics || {}).forEach(
      ([displayKey, metricData]) => {
        metricsBreakdownProcessed.lowLevelMetrics[displayKey] =
          processMetricCategory(displayKey, metricData, modelOrder);
      }
    );
  } else {
    console.warn("rawDataInput.metrics_breakdown is missing or not an object.");
  }

  // Prepare Radar Chart Data (one entry per high-level category; models
  // without a numeric national score are plotted as 0)
  const radarChartData = Object.entries(
    metricsBreakdownProcessed.highLevelCategories
  ).map(([displayKey, categoryData]) => {
    const radarEntry = { category: displayKey };
    modelOrder.forEach((modelName) => {
      radarEntry[modelName] =
        toNumberOrNull(categoryData.modelScores[modelName]?.nationalScore) ?? 0;
    });
    return radarEntry;
  });

  // Prepare Top Performers
  const bestPerCategory = {};
  Object.entries(metricsBreakdownProcessed.highLevelCategories).forEach(
    ([displayKey, catData]) => {
      bestPerCategory[displayKey] = catData.topPerformer;
    }
  );
  const bestPerMetric = {};
  Object.entries(metricsBreakdownProcessed.lowLevelMetrics).forEach(
    ([displayKey, metricData]) => {
      bestPerMetric[displayKey] = metricData.topPerformer;
    }
  );

  // Prepare Task Performance Data
  const taskCategoryMap = {
    "Generating a Creative Idea": "creative",
    "Creating a Travel Itinerary": "creative",
    "Following Up on a Job Application": "practical",
    "Planning Your Weekly Meals": "practical",
    "Making a Decision Between Options": "analytical",
    "Understanding a Complex Topic": "analytical",
  };
  // Pass the original rawDataInput to the helper, which reads snake_case keys
  const taskPerformanceResults = processTaskPerformance(
    rawDataInput,
    taskCategoryMap,
    modelOrder
  );

  const tasks = Object.keys(taskLevelPerformanceRaw);
  const knownTasks = new Set(tasks);
  const taskCategories = {};
  Object.entries(taskCategoryMap).forEach(([task, category]) => {
    if (!taskCategories[category]) taskCategories[category] = [];
    if (knownTasks.has(task)) taskCategories[category].push(task);
  });

  // Collect every metric key that appears under any task/model pair.
  const taskMetrics = new Set();
  Object.values(taskLevelPerformanceRaw).forEach((taskData) => {
    Object.values(taskData || {}).forEach((modelData) => {
      Object.keys(modelData || {}).forEach((metric) => taskMetrics.add(metric));
    });
  });
  const taskMetricsDisplayList = Array.from(taskMetrics, (metric) =>
    formatDisplayKey(metric)
  ).sort();
  const taskMetricsSnakeList = Array.from(taskMetrics).sort();

  // Final Return Structure
  return {
    models: overallRankingProcessed,
    metricsData: metricsBreakdownProcessed,
    radarData: radarChartData,
    bestPerCategory: bestPerCategory,
    bestPerMetric: bestPerMetric,
    overviewCardData: taskPerformanceResults,
    rawData: {
      // Keep original structures under camelCase keys for clarity
      taskLevelPerformance: taskLevelPerformanceRaw,
      mrpDemographics: mrpDemographicsRaw,
      // Processed lists/maps for filtering/display
      demographicOptions: demographicOptions,
      availableMetrics: availableMetricsList,
      tasks: tasks,
      taskCategories: taskCategories,
      taskMetrics: taskMetricsDisplayList, // formatted names for display
      taskMetricsSnake: taskMetricsSnakeList, // original keys for lookup
      taskCategoryMap: taskCategoryMap,
    },
    metadata: metadata, // Original structure
    equityAnalysis: equityAnalysis, // Original structure (snake_case keys)
  };
}