// -----------------------------------------------------------------------------
// LLMComparisonDashboard — tabbed leaderboard view (React client component).
//
// NOTE(review): this copy of the file looks damaged. The JSX element tags
// appear to have been stripped and the original line breaks collapsed: e.g.
// `return (` is followed directly by `{isVisible && (`, the tab-button
// `.map((tab) => ( ))` has an empty body, and `return N/A;` stands where an
// element is expected. Because of the collapsed lines, each `//` comment now
// extends to the end of its physical line and comments out the code that
// follows it. Restore the file from version control before changing logic.
//
// What the visible code defines:
//
// InfoTooltip({ text })
//   Keeps an `isVisible` boolean in state (initially false) and renders `text`
//   only while `isVisible` is true. The trigger markup is not visible here.
//
// LLMComparisonDashboard({ data })
//   `data` is read as `processedData`; a missing value falls back to `{}` and
//   every destructured key (models -> rankedModels, metricsData, radarData,
//   overviewCardData, rawData, bestPerCategory, bestPerMetric, equityAnalysis,
//   metadata) has its own default.
//   State: `activeTab` starts at "overview"; `topPerformersView` starts at
//   "high-level". Tabs listed: overview, metrics-breakdown, task-performance,
//   demographic-analysis, about. What the non-overview tabs render is not
//   visible in this copy — presumably the imported MetricsBreakdown,
//   TaskPerformance, DemographicAnalysis and About components; confirm the
//   props against those files.
//
//   getEquityGapBadgeColor(model) -> class string
//     red    when maxEffectConcernFlag && maxEffectSignificant && size "Large"
//     yellow when maxEffectSizeClass === "Large"
//     blue   when maxEffectSignificant
//     gray   otherwise
//
//   renderMaxEquityGapCell(model)
//     Shows model.maxEffectGap with one decimal, or "N/A" when it is not a
//     number. The tooltip text is getEquityIndicatorStyle(...).tooltip followed
//     by a "95% CI: [lower, upper]" line taken from
//     maxEffectGapDetails.gap_confidence_interval_95_lower/_upper, or
//     "95% CI: N/A" when either bound is not a number.
//
//   getEquityConcernBadgeColor(percentage) -> class string
//     gray for null/undefined, green for 0, blue up to 2.5, yellow up to 5,
//     red above 5.
//
//   Overview tab: a rankings table over `rankedModels` (Rank, Model, Overall
//   Score, Overall SD, Max Equity Gap, Max Gap Area, Equity Concerns, User
//   Retention) and a "Top Performers" table driven by `bestPerCategory` or
//   `bestPerMetric`, with entries sorted by key via localeCompare.
//
// NOTE(review): the table cells guard with `!== null` before calling
// `.toFixed(1)` (overallScore, equityConcernPercentage, repeatUsageScore,
// bestInfo.score); an `undefined` value would pass that guard and throw.
// renderMaxEquityGapCell uses `typeof x === "number"` instead — consider
// aligning once the file is restored.
// NOTE(review): no call sites are visible here for getEquityGapBadgeColor,
// getEquityConcernBadgeColor, setActiveTab, setTopPerformersView, useMemo,
// getScoreBadgeColor or getMetricTooltip — presumably they lived in the
// stripped JSX attributes; verify before removing anything as unused.
// -----------------------------------------------------------------------------
// components/LLMComparisonDashboard.jsx "use client"; import React, { useState, useMemo } from "react"; import { getScoreBadgeColor, formatDisplayKey, // Use this for displaying snake_case keys nicely getMetricTooltip, getEquityIndicatorStyle, // Use this for Max Equity Gap status } from "../lib/utils"; // Adjust path as needed import TaskPerformance from "./TaskPerformance"; import DemographicAnalysis from "./DemographicAnalysis"; import MetricsBreakdown from "./MetricsBreakdown"; import About from "./About"; import { Tooltip } from "./Tooltip"; // Assuming this is your Tooltip component // Helper component for info tooltips (assuming it exists and works) const InfoTooltip = ({ text }) => { const [isVisible, setIsVisible] = useState(false); return (
{isVisible && (
{text}
)}
); }; // Main dashboard component const LLMComparisonDashboard = ({ data: processedData }) => { const [activeTab, setActiveTab] = useState("overview"); const [topPerformersView, setTopPerformersView] = useState("high-level"); // Destructure data - top-level keys are camelCase // Nested rawData and equityAnalysis retain original snake_case keys const { models: rankedModels = [], // This is overallRankingProcessed with camelCase keys metricsData = { highLevelCategories: {}, lowLevelMetrics: {} }, // Title Case keys inside radarData = [], overviewCardData = {}, // camelCase keys inside expected rawData = { // camelCase keys for objects, snake_case keys inside those objects taskLevelPerformance: {}, mrpDemographics: {}, demographicOptions: {}, availableMetrics: [], // Title Case tasks: [], taskCategories: {}, taskMetrics: [], // Title Case taskMetricsSnake: [], // snake_case taskCategoryMap: {}, }, bestPerCategory = {}, // Title Case keys bestPerMetric = {}, // Title Case keys equityAnalysis = { // Original snake_case keys all_equity_gaps: [], model_max_effect_gaps: {}, universal_issues: [], assessment_method: {}, demographic_variation_stats: {}, }, metadata = {}, // Original keys } = processedData || {}; // NEW: Helper function to get color for Max Equity Gap bubble const getEquityGapBadgeColor = (model) => { const isConcern = model.maxEffectConcernFlag; const isSignificant = model.maxEffectSignificant; const effectSizeClass = model.maxEffectSizeClass; const isLargeEffect = effectSizeClass === "Large"; if (isConcern && isSignificant && isLargeEffect) { return "bg-red-100 text-red-800"; // Equity Concern } if (isLargeEffect) { return "bg-yellow-100 text-yellow-800"; // Large Effect } if (isSignificant) { return "bg-blue-100 text-blue-800"; // Significant } return "bg-gray-100 text-gray-800"; // No concern }; // UPDATED: Render cell for Max Equity Gap column with bubble design const renderMaxEquityGapCell = (model) => { // model object has camelCase keys const gapValue 
= model.maxEffectGap; const isConcern = model.maxEffectConcernFlag; const significanceStatus = model.maxEffectSignificant; const pValue = model.maxEffectPValue; const effectSizeClass = model.maxEffectSizeClass; const isLargeEffect = effectSizeClass === "Large"; // Access nested details using original snake_case keys const gapDetails = model.maxEffectGapDetails || {}; const ciLower = gapDetails.gap_confidence_interval_95_lower; const ciUpper = gapDetails.gap_confidence_interval_95_upper; const displayValue = typeof gapValue === "number" ? gapValue.toFixed(1) : "N/A"; if (displayValue === "N/A") return N/A; const indicator = getEquityIndicatorStyle( isConcern, isLargeEffect, significanceStatus, pValue, effectSizeClass ); let fullTooltipContent = indicator.tooltip; if (typeof ciLower === "number" && typeof ciUpper === "number") { fullTooltipContent += `\n95% CI: [${ciLower.toFixed( 1 )}, ${ciUpper.toFixed(1)}]`; } else { fullTooltipContent += `\n95% CI: N/A`; } return ( {fullTooltipContent} } > {displayValue} ); }; // NEW: Helper for equity concerns percentage badge color const getEquityConcernBadgeColor = (percentage) => { if (percentage === null || percentage === undefined) return "bg-gray-100 text-gray-800"; if (percentage === 0) return "bg-green-100 text-green-800"; if (percentage <= 2.5) return "bg-blue-100 text-blue-800"; if (percentage <= 5) return "bg-yellow-100 text-yellow-800"; return "bg-red-100 text-red-800"; }; return (
{/* Header */}

Prolific's AI User Experience Leaderboard

A benchmark assessing how well language models handle real-world tasks based on user experiences.

{/* Tab Buttons */}
{[ "overview", "metrics-breakdown", "task-performance", "demographic-analysis", "about", ].map((tab) => ( ))}
{/* Overview Tab */} {activeTab === "overview" && (
{/* Overall Rankings Card */}

Overall Model Rankings

{/* Use camelCase model object from rankedModels */} {rankedModels.map((model) => ( ))}
Rank Model Overall Score Overall SD Max Equity Gap Max Gap Area Equity Concerns User Retention
{model.rank}
{model.model}
{model.overallScore !== null ? model.overallScore.toFixed(1) : "N/A"} {model.stdDevAcrossCats !== "N/A" && model.stdDevAcrossCats !== null ? `± ${Number(model.stdDevAcrossCats).toFixed(1)}` : "N/A"} {renderMaxEquityGapCell(model)} {model.maxEffectFactor && model.maxEffectFactor !== "N/A" ? (
{formatDisplayKey(model.maxEffectFactor)} {formatDisplayKey(model.maxEffectCategory)}
) : ( N/A )}
{model.equityConcernPercentage !== null ? ( {model.equityConcernPercentage.toFixed(1)}% ) : ( N/A )} {model.repeatUsageScore !== null ? ( {model.repeatUsageScore.toFixed(1)}% ) : ( N/A )}
{/* UPDATED: Vertical list for column descriptions with detailed info */}
{/* Column descriptions in vertical list */}
Overall Score: Avg. score across high-level categories
Overall SD: Standard deviation across high-level categories (lower = more consistent)
Max Equity Gap:{" "} Largest demographic score difference (hover for details on significance and effect size)
Max Gap Area:{" "} Demographic group and Category where the Max Equity Gap occurs
Equity Concerns:{" "} Percentage of demographic gaps flagged as concerns (large effect & statistically significant)
User Retention:{" "} Percentage of participants who said they would use the model again
{/* Color key on a single line */}
Color Key:
Equity Concern (Large Effect & Statistically Significant)
Large Effect (Not Statistically Significant)
{/* Top Performers Section */}

Top Performers by Category

{/* Top Performers Tables - Access using Title Case keys */} {topPerformersView === "high-level" && (

Top Performers by High-Level Category

{Object.entries(bestPerCategory || {}).length > 0 ? ( {Object.entries(bestPerCategory) .sort(([a], [b]) => a.localeCompare(b)) .map(([catDisplayKey, bestInfo], idx) => ( ))}
Category Best Model Score
{catDisplayKey} {bestInfo.model !== "N/A" ? (
{bestInfo.model}
) : ( N/A )}
{bestInfo.score !== null ? ( {bestInfo.score.toFixed(1)} ) : ( N/A )}
) : (

Top performer data not available.

)}

Scores based on user ratings, normalized to 0-100.

)} {topPerformersView === "low-level" && (

Top Performers by Low-Level Metric

{Object.entries(bestPerMetric || {}).length > 0 ? ( {Object.entries(bestPerMetric) .sort(([a], [b]) => a.localeCompare(b)) .map(([metricDisplayKey, bestInfo], idx) => ( ))}
Metric Best Model Score
{metricDisplayKey} {bestInfo.model !== "N/A" ? (
{bestInfo.model}
) : ( N/A )}
{bestInfo.score !== null ? ( {bestInfo.score.toFixed(1)} ) : ( N/A )}
) : (

Low-level metric top performer data not available.

)}

Scores based on user ratings, normalized to 0-100.

)}
)}{" "} {/* End Overview Tab */} {/* Other Tabs - Pass Correct Props */} {activeTab === "metrics-breakdown" && ( )} {activeTab === "task-performance" && ( )} {activeTab === "demographic-analysis" && ( )} {activeTab === "about" && }
); }; export default LLMComparisonDashboard;