|
|
|
|
|
"use client"; |
|
|
|
import React, { useState, useMemo } from "react"; |
|
import { |
|
getScoreBadgeColor, |
|
formatDisplayKey, |
|
getMetricTooltip, |
|
getEquityIndicatorStyle, |
|
} from "../lib/utils"; |
|
import TaskPerformance from "./TaskPerformance"; |
|
import DemographicAnalysis from "./DemographicAnalysis"; |
|
import MetricsBreakdown from "./MetricsBreakdown"; |
|
import About from "./About"; |
|
import { Tooltip } from "./Tooltip"; |
|
|
|
|
|
const InfoTooltip = ({ text }) => { |
|
const [isVisible, setIsVisible] = useState(false); |
|
return ( |
|
<div className="relative inline-block ml-1 align-middle"> |
|
<button |
|
className="text-gray-400 hover:text-gray-600 focus:outline-none" |
|
onMouseEnter={() => setIsVisible(true)} |
|
onMouseLeave={() => setIsVisible(false)} |
|
onClick={(e) => { |
|
e.stopPropagation(); |
|
setIsVisible(!isVisible); |
|
}} |
|
aria-label="Info" |
|
> |
|
<svg |
|
xmlns="http://www.w3.org/2000/svg" |
|
className="h-4 w-4" |
|
viewBox="0 0 20 20" |
|
fill="currentColor" |
|
> |
|
<path |
|
fillRule="evenodd" |
|
d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z" |
|
clipRule="evenodd" |
|
/> |
|
</svg> |
|
</button> |
|
{isVisible && ( |
|
<div className="absolute z-10 w-64 p-2 bg-white border rounded shadow-lg text-xs text-gray-700 -translate-x-1/2 left-1/2 mt-1 normal-case"> |
|
{text} |
|
</div> |
|
)} |
|
</div> |
|
); |
|
}; |
|
|
|
|
|
const LLMComparisonDashboard = ({ data: processedData }) => { |
|
const [activeTab, setActiveTab] = useState("overview"); |
|
const [topPerformersView, setTopPerformersView] = useState("high-level"); |
|
|
|
|
|
|
|
const { |
|
models: rankedModels = [], |
|
metricsData = { highLevelCategories: {}, lowLevelMetrics: {} }, |
|
radarData = [], |
|
overviewCardData = {}, |
|
rawData = { |
|
|
|
taskLevelPerformance: {}, |
|
mrpDemographics: {}, |
|
demographicOptions: {}, |
|
availableMetrics: [], |
|
tasks: [], |
|
taskCategories: {}, |
|
taskMetrics: [], |
|
taskMetricsSnake: [], |
|
taskCategoryMap: {}, |
|
}, |
|
bestPerCategory = {}, |
|
bestPerMetric = {}, |
|
equityAnalysis = { |
|
|
|
all_equity_gaps: [], |
|
model_max_effect_gaps: {}, |
|
universal_issues: [], |
|
assessment_method: {}, |
|
demographic_variation_stats: {}, |
|
}, |
|
metadata = {}, |
|
} = processedData || {}; |
|
|
|
|
|
const getEquityGapBadgeColor = (model) => { |
|
const isConcern = model.maxEffectConcernFlag; |
|
const isSignificant = model.maxEffectSignificant; |
|
const effectSizeClass = model.maxEffectSizeClass; |
|
const isLargeEffect = effectSizeClass === "Large"; |
|
|
|
if (isConcern && isSignificant && isLargeEffect) { |
|
return "bg-red-100 text-red-800"; |
|
} |
|
if (isLargeEffect) { |
|
return "bg-yellow-100 text-yellow-800"; |
|
} |
|
if (isSignificant) { |
|
return "bg-blue-100 text-blue-800"; |
|
} |
|
return "bg-gray-100 text-gray-800"; |
|
}; |
|
|
|
|
|
const renderMaxEquityGapCell = (model) => { |
|
|
|
const gapValue = model.maxEffectGap; |
|
const isConcern = model.maxEffectConcernFlag; |
|
const significanceStatus = model.maxEffectSignificant; |
|
const pValue = model.maxEffectPValue; |
|
const effectSizeClass = model.maxEffectSizeClass; |
|
const isLargeEffect = effectSizeClass === "Large"; |
|
|
|
const gapDetails = model.maxEffectGapDetails || {}; |
|
const ciLower = gapDetails.gap_confidence_interval_95_lower; |
|
const ciUpper = gapDetails.gap_confidence_interval_95_upper; |
|
|
|
const displayValue = |
|
typeof gapValue === "number" ? gapValue.toFixed(1) : "N/A"; |
|
if (displayValue === "N/A") |
|
return <span className="text-xs text-gray-500">N/A</span>; |
|
|
|
const indicator = getEquityIndicatorStyle( |
|
isConcern, |
|
isLargeEffect, |
|
significanceStatus, |
|
pValue, |
|
effectSizeClass |
|
); |
|
let fullTooltipContent = indicator.tooltip; |
|
if (typeof ciLower === "number" && typeof ciUpper === "number") { |
|
fullTooltipContent += `\n95% CI: [${ciLower.toFixed( |
|
1 |
|
)}, ${ciUpper.toFixed(1)}]`; |
|
} else { |
|
fullTooltipContent += `\n95% CI: N/A`; |
|
} |
|
|
|
return ( |
|
<Tooltip |
|
content={ |
|
<div className="whitespace-pre-line">{fullTooltipContent}</div> |
|
} |
|
> |
|
<span |
|
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getEquityGapBadgeColor( |
|
model |
|
)}`} |
|
> |
|
{displayValue} |
|
</span> |
|
</Tooltip> |
|
); |
|
}; |
|
|
|
|
|
const getEquityConcernBadgeColor = (percentage) => { |
|
if (percentage === null || percentage === undefined) |
|
return "bg-gray-100 text-gray-800"; |
|
if (percentage === 0) return "bg-green-100 text-green-800"; |
|
if (percentage <= 2.5) return "bg-blue-100 text-blue-800"; |
|
if (percentage <= 5) return "bg-yellow-100 text-yellow-800"; |
|
return "bg-red-100 text-red-800"; |
|
}; |
|
|
|
return ( |
|
<div className="max-w-7xl mx-auto p-4 bg-white"> |
|
{/* Header */} |
|
<div className="relative mb-6 overflow-hidden"> |
|
<div className="absolute inset-0 bg-gradient-to-br from-blue-50 to-sky-50 opacity-70"></div> |
|
<div className="relative max-w-5xl mx-auto px-6 py-6"> |
|
<div className="text-center"> |
|
<h1 className="text-4xl font-bold mb-2 tracking-narrow text-blue-700"> |
|
Prolific's AI User Experience Leaderboard |
|
</h1> |
|
|
|
<p className="text-gray-600 max-w-4xl mx-auto"> |
|
A benchmark assessing how well language models handle real-world |
|
tasks based on user experiences. |
|
</p> |
|
</div> |
|
</div> |
|
</div> |
|
{/* Tab Buttons */} |
|
<div className="flex flex-wrap mb-6 border-b"> |
|
{[ |
|
"overview", |
|
"metrics-breakdown", |
|
"task-performance", |
|
"demographic-analysis", |
|
"about", |
|
].map((tab) => ( |
|
<button |
|
key={tab} |
|
className={`px-4 py-2 font-medium capitalize ${ |
|
activeTab === tab |
|
? "text-blue-600 border-b-2 border-blue-600" |
|
: "text-gray-500 hover:text-gray-700" |
|
}`} |
|
onClick={() => setActiveTab(tab)} |
|
> |
|
{" "} |
|
{tab.replace("-", " ")}{" "} |
|
</button> |
|
))} |
|
</div> |
|
{/* Overview Tab */} |
|
{activeTab === "overview" && ( |
|
<div> |
|
{/* Overall Rankings Card */} |
|
<div className="mb-6 border rounded-lg overflow-hidden shadow-sm"> |
|
<div className="px-4 py-3 bg-gray-50 border-b"> |
|
<h2 className="text-xl font-semibold text-gray-800"> |
|
Overall Model Rankings |
|
</h2> |
|
</div> |
|
<div className="p-4"> |
|
<div className="overflow-x-auto"> |
|
<table className="w-full min-w-[850px] table-auto divide-y divide-gray-200"> |
|
<thead> |
|
<tr className="bg-gray-50"> |
|
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-12"> |
|
Rank |
|
</th> |
|
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-48"> |
|
Model |
|
</th> |
|
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-28"> |
|
<span>Overall Score</span> |
|
</th> |
|
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-24"> |
|
<span>Overall SD</span> |
|
</th> |
|
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-32"> |
|
<span>Max Equity Gap</span> |
|
</th> |
|
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-38"> |
|
<span>Max Gap Area</span> |
|
</th> |
|
<th className="px-3 py-2 text-center text-xs font-medium text-gray-500 uppercase tracking-wider w-36"> |
|
<span>Equity Concerns</span> |
|
</th> |
|
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-32"> |
|
<span>User Retention</span> |
|
</th> |
|
</tr> |
|
</thead> |
|
<tbody className="divide-y divide-gray-200"> |
|
{/* Use camelCase model object from rankedModels */} |
|
{rankedModels.map((model) => ( |
|
<tr key={model.model} className="hover:bg-gray-50"> |
|
<td className="px-3 py-3 text-sm font-medium text-gray-900"> |
|
{model.rank} |
|
</td> |
|
<td className="px-3 py-3"> |
|
<div className="flex items-center"> |
|
<div |
|
className="w-3 h-3 rounded-full mr-2 flex-shrink-0" |
|
style={{ backgroundColor: model.color }} |
|
></div> |
|
<span className="text-sm font-medium text-gray-900"> |
|
{model.model} |
|
</span> |
|
</div> |
|
</td> |
|
<td className="px-3 py-3 text-sm font-semibold text-gray-800"> |
|
{model.overallScore !== null |
|
? model.overallScore.toFixed(1) |
|
: "N/A"} |
|
</td> |
|
<td className="px-3 py-3 text-sm text-gray-600"> |
|
{model.stdDevAcrossCats !== "N/A" && |
|
model.stdDevAcrossCats !== null |
|
? `± ${Number(model.stdDevAcrossCats).toFixed(1)}` |
|
: "N/A"} |
|
</td> |
|
<td className="px-3 py-3 text-sm"> |
|
{renderMaxEquityGapCell(model)} |
|
</td> |
|
<td className="px-3 py-3"> |
|
{model.maxEffectFactor && |
|
model.maxEffectFactor !== "N/A" ? ( |
|
<div className="flex flex-col"> |
|
<span className="text-xs font-medium text-gray-900"> |
|
{formatDisplayKey(model.maxEffectFactor)} |
|
</span> |
|
<span className="text-xs text-gray-500"> |
|
{formatDisplayKey(model.maxEffectCategory)} |
|
</span> |
|
</div> |
|
) : ( |
|
<span className="text-xs text-gray-500">N/A</span> |
|
)} |
|
</td> |
|
<td className="px-3 py-3 text-sm text-center"> |
|
{model.equityConcernPercentage !== null ? ( |
|
<span> |
|
{model.equityConcernPercentage.toFixed(1)}% |
|
</span> |
|
) : ( |
|
<span className="text-xs text-gray-500">N/A</span> |
|
)} |
|
</td> |
|
<td className="px-3 py-3 text-sm"> |
|
{model.repeatUsageScore !== null ? ( |
|
<span |
|
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getScoreBadgeColor( |
|
model.repeatUsageScore |
|
)}`} |
|
> |
|
{model.repeatUsageScore.toFixed(1)}% |
|
</span> |
|
) : ( |
|
<span className="text-xs text-gray-500">N/A</span> |
|
)} |
|
</td> |
|
</tr> |
|
))} |
|
</tbody> |
|
</table> |
|
</div> |
|
{/* UPDATED: Vertical list for column descriptions with detailed info */} |
|
<div className="mt-4 pt-3 border-t border-gray-200 text-xs text-gray-600"> |
|
{/* Column descriptions in vertical list */} |
|
<div className="mb-2"> |
|
<div> |
|
<span className="font-semibold">Overall Score:</span> Avg. |
|
score across high-level categories |
|
</div> |
|
<div> |
|
<span className="font-semibold">Overall SD:</span> Standard |
|
deviation across high-level categories (lower = more |
|
consistent) |
|
</div> |
|
<div> |
|
<span className="font-semibold">Max Equity Gap:</span>{" "} |
|
Largest demographic score difference (hover for details on |
|
significance and effect size) |
|
</div> |
|
<div> |
|
<span className="font-semibold">Max Gap Area:</span>{" "} |
|
Demographic group and Category where the Max Equity Gap |
|
occurs |
|
</div> |
|
<div> |
|
<span className="font-semibold">Equity Concerns:</span>{" "} |
|
Percentage of demographic gaps flagged as concerns (large |
|
effect & statistically significant) |
|
</div> |
|
<div> |
|
<span className="font-semibold">User Retention:</span>{" "} |
|
Percentage of participants who said they would use the model |
|
again |
|
</div> |
|
</div> |
|
|
|
{/* Color key on a single line */} |
|
<div className="mt-2 pt-2 border-t border-gray-100 flex flex-wrap items-center gap-x-4 gap-y-2"> |
|
<span className="font-semibold whitespace-nowrap"> |
|
Color Key: |
|
</span> |
|
<div className="flex items-center"> |
|
<span className="inline-block w-4 h-4 rounded-full bg-red-100 mr-1"></span> |
|
<span> |
|
Equity Concern (Large Effect & Statistically Significant) |
|
</span> |
|
</div> |
|
<div className="flex items-center"> |
|
<span className="inline-block w-4 h-4 rounded-full bg-yellow-100 mr-1"></span> |
|
<span>Large Effect (Not Statistically Significant)</span> |
|
</div> |
|
</div> |
|
</div> |
|
</div> |
|
</div> |
|
|
|
{/* Top Performers Section */} |
|
<div className="mb-4 flex items-center"> |
|
<h3 className="font-semibold text-xl mr-4"> |
|
Top Performers by Category |
|
</h3> |
|
<div className="flex space-x-1 p-1 bg-gray-200 rounded-lg"> |
|
<button |
|
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-colors duration-150 ${ |
|
topPerformersView === "high-level" |
|
? "bg-white shadow text-blue-600" |
|
: "text-gray-600 hover:text-gray-800" |
|
}`} |
|
onClick={() => setTopPerformersView("high-level")} |
|
> |
|
{" "} |
|
High-Level Categories{" "} |
|
</button> |
|
<button |
|
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-colors duration-150 ${ |
|
topPerformersView === "low-level" |
|
? "bg-white shadow text-blue-600" |
|
: "text-gray-600 hover:text-gray-800" |
|
}`} |
|
onClick={() => setTopPerformersView("low-level")} |
|
> |
|
{" "} |
|
Low-Level Metrics{" "} |
|
</button> |
|
</div> |
|
</div> |
|
{/* Top Performers Tables - Access using Title Case keys */} |
|
{topPerformersView === "high-level" && ( |
|
<div className="border rounded-lg overflow-hidden shadow-sm mb-6"> |
|
<div className="px-4 py-3 bg-gray-50 border-b"> |
|
<h3 className="font-semibold text-gray-800"> |
|
Top Performers by High-Level Category |
|
</h3> |
|
</div> |
|
<div className="p-4"> |
|
{Object.entries(bestPerCategory || {}).length > 0 ? ( |
|
<table className="min-w-full divide-y divide-gray-200"> |
|
<thead> |
|
<tr> |
|
<th |
|
scope="col" |
|
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider" |
|
> |
|
Category |
|
</th> |
|
<th |
|
scope="col" |
|
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider" |
|
> |
|
Best Model |
|
</th> |
|
<th |
|
scope="col" |
|
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider" |
|
> |
|
Score |
|
</th> |
|
</tr> |
|
</thead> |
|
<tbody className="bg-white divide-y divide-gray-200"> |
|
{Object.entries(bestPerCategory) |
|
.sort(([a], [b]) => a.localeCompare(b)) |
|
.map(([catDisplayKey, bestInfo], idx) => ( |
|
<tr |
|
key={catDisplayKey} |
|
className={ |
|
idx % 2 === 0 ? "bg-white" : "bg-gray-50" |
|
} |
|
> |
|
<td className="px-3 py-2 font-medium text-sm text-gray-900"> |
|
<Tooltip |
|
content={getMetricTooltip(catDisplayKey)} |
|
> |
|
<span>{catDisplayKey}</span> |
|
</Tooltip> |
|
</td> |
|
<td className="px-3 py-2"> |
|
{bestInfo.model !== "N/A" ? ( |
|
<div className="flex items-center"> |
|
<div |
|
className="w-3 h-3 rounded-full mr-2 shrink-0" |
|
style={{ backgroundColor: bestInfo.color }} |
|
></div> |
|
<span className="text-sm"> |
|
{bestInfo.model} |
|
</span> |
|
</div> |
|
) : ( |
|
<span className="text-sm text-gray-500"> |
|
N/A |
|
</span> |
|
)} |
|
</td> |
|
<td className="px-3 py-2"> |
|
{bestInfo.score !== null ? ( |
|
<span |
|
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getScoreBadgeColor( |
|
bestInfo.score |
|
)}`} |
|
> |
|
{bestInfo.score.toFixed(1)} |
|
</span> |
|
) : ( |
|
<span className="text-sm text-gray-500"> |
|
N/A |
|
</span> |
|
)} |
|
</td> |
|
</tr> |
|
))} |
|
</tbody> |
|
</table> |
|
) : ( |
|
<p className="text-center text-gray-500 py-4"> |
|
Top performer data not available. |
|
</p> |
|
)} |
|
<p className="text-xs text-gray-500 mt-2"> |
|
Scores based on user ratings, normalized to 0-100. |
|
</p> |
|
</div> |
|
</div> |
|
)} |
|
{topPerformersView === "low-level" && ( |
|
<div className="border rounded-lg overflow-hidden shadow-sm mb-6"> |
|
<div className="px-4 py-3 bg-gray-50 border-b"> |
|
<h3 className="font-semibold text-gray-800"> |
|
Top Performers by Low-Level Metric |
|
</h3> |
|
</div> |
|
<div className="p-4"> |
|
{Object.entries(bestPerMetric || {}).length > 0 ? ( |
|
<table className="min-w-full divide-y divide-gray-200"> |
|
<thead> |
|
<tr> |
|
<th |
|
scope="col" |
|
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider" |
|
> |
|
Metric |
|
</th> |
|
<th |
|
scope="col" |
|
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider" |
|
> |
|
Best Model |
|
</th> |
|
<th |
|
scope="col" |
|
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider" |
|
> |
|
Score |
|
</th> |
|
</tr> |
|
</thead> |
|
<tbody className="bg-white divide-y divide-gray-200"> |
|
{Object.entries(bestPerMetric) |
|
.sort(([a], [b]) => a.localeCompare(b)) |
|
.map(([metricDisplayKey, bestInfo], idx) => ( |
|
<tr |
|
key={metricDisplayKey} |
|
className={ |
|
idx % 2 === 0 ? "bg-white" : "bg-gray-50" |
|
} |
|
> |
|
<td className="px-3 py-2 font-medium text-sm text-gray-900"> |
|
<Tooltip |
|
content={getMetricTooltip(metricDisplayKey)} |
|
> |
|
<span>{metricDisplayKey}</span> |
|
</Tooltip> |
|
</td> |
|
<td className="px-3 py-2"> |
|
{bestInfo.model !== "N/A" ? ( |
|
<div className="flex items-center"> |
|
<div |
|
className="w-3 h-3 rounded-full mr-2 shrink-0" |
|
style={{ backgroundColor: bestInfo.color }} |
|
></div> |
|
<span className="text-sm"> |
|
{bestInfo.model} |
|
</span> |
|
</div> |
|
) : ( |
|
<span className="text-sm text-gray-500"> |
|
N/A |
|
</span> |
|
)} |
|
</td> |
|
<td className="px-3 py-2"> |
|
{bestInfo.score !== null ? ( |
|
<span |
|
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getScoreBadgeColor( |
|
bestInfo.score |
|
)}`} |
|
> |
|
{bestInfo.score.toFixed(1)} |
|
</span> |
|
) : ( |
|
<span className="text-sm text-gray-500"> |
|
N/A |
|
</span> |
|
)} |
|
</td> |
|
</tr> |
|
))} |
|
</tbody> |
|
</table> |
|
) : ( |
|
<p className="text-center text-gray-500 py-4"> |
|
Low-level metric top performer data not available. |
|
</p> |
|
)} |
|
<p className="text-xs text-gray-500 mt-2"> |
|
Scores based on user ratings, normalized to 0-100. |
|
</p> |
|
</div> |
|
</div> |
|
)} |
|
</div> |
|
)}{" "} |
|
{/* End Overview Tab */} |
|
{/* Other Tabs - Pass Correct Props */} |
|
{activeTab === "metrics-breakdown" && ( |
|
<MetricsBreakdown |
|
metricsData={metricsData} // Title Case keys inside, plus internalMetricKey |
|
modelsMeta={rankedModels} // camelCase keys inside |
|
radarData={radarData} |
|
/> |
|
)} |
|
{activeTab === "task-performance" && ( |
|
<TaskPerformance |
|
rawData={rawData} // Contains camelCase top-level, snake_case nested |
|
modelsMeta={rankedModels} |
|
metricsData={metricsData} // Title Case keys inside, plus internalMetricKey |
|
overviewCardData={overviewCardData} |
|
/> |
|
)} |
|
{activeTab === "demographic-analysis" && ( |
|
<DemographicAnalysis |
|
rawData={rawData} // Contains camelCase top-level, snake_case/Title Case nested |
|
modelsMeta={rankedModels} |
|
metricsData={metricsData} // Title Case keys inside, plus internalMetricKey |
|
equityAnalysis={equityAnalysis} // Original snake_case structure |
|
/> |
|
)} |
|
{activeTab === "about" && <About metadata={metadata} />} |
|
</div> |
|
); |
|
}; |
|
|
|
export default LLMComparisonDashboard; |
|
|