Nora Petrova
committed on
Commit
·
20e666e
1
Parent(s):
99c7281
Add project to new space
Browse files- Dockerfile +19 -0
- README.md +8 -6
- leaderboard-app/.gitignore +41 -0
- leaderboard-app/README.md +113 -0
- leaderboard-app/app/favicon.ico +0 -0
- leaderboard-app/app/globals.css +29 -0
- leaderboard-app/app/layout.js +19 -0
- leaderboard-app/app/page.js +84 -0
- leaderboard-app/components/About.jsx +741 -0
- leaderboard-app/components/DemographicAnalysis.jsx +925 -0
- leaderboard-app/components/LLMComparisonDashboard.jsx +639 -0
- leaderboard-app/components/MetricsBreakdown.jsx +447 -0
- leaderboard-app/components/TaskPerformance.jsx +756 -0
- leaderboard-app/components/Tooltip.jsx +145 -0
- leaderboard-app/eslint.config.mjs +14 -0
- leaderboard-app/jsconfig.json +7 -0
- leaderboard-app/lib/utils.js +708 -0
- leaderboard-app/next.config.mjs +4 -0
- leaderboard-app/package-lock.json +0 -0
- leaderboard-app/package.json +25 -0
- leaderboard-app/postcss.config.mjs +5 -0
- leaderboard-app/public/file.svg +1 -0
- leaderboard-app/public/globe.svg +1 -0
- leaderboard-app/public/leaderboard_data.json +0 -0
- leaderboard-app/public/next.svg +1 -0
- leaderboard-app/public/vercel.svg +1 -0
- leaderboard-app/public/window.svg +1 -0
Dockerfile
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Container image for the leaderboard-app Next.js project (Hugging Face Space, Docker SDK).
FROM node:20.11.0-slim

WORKDIR /app

# Copy the whole Next.js project into the image.
# `--chown` must name an account that exists in the image's /etc/passwd; the
# official node image provides `node`, and no `user` account is created in this
# Dockerfile, so ownership is assigned to `node`.
COPY --chown=node:node leaderboard-app/ ./

# Install dependencies
RUN npm install

# Build the app
RUN npm run build

# Expose the port the app will run on
# HF Spaces uses port 7860 by default
EXPOSE 7860

# Start the app with the correct port
ENV PORT=7860
CMD ["npm", "start"]
|
README.md
CHANGED
@@ -1,11 +1,13 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
emoji:
|
4 |
-
colorFrom:
|
5 |
-
colorTo:
|
6 |
sdk: docker
|
7 |
-
pinned:
|
8 |
-
short_description:
|
|
|
|
|
9 |
---
|
10 |
|
11 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: UX Leaderboard
|
3 |
+
emoji: 🥇
|
4 |
+
colorFrom: blue
|
5 |
+
colorTo: cyan
|
6 |
sdk: docker
|
7 |
+
pinned: true
|
8 |
+
short_description: Leaderboard of LLMs based on detailed human feedback
|
9 |
+
tags:
|
10 |
+
- leaderboard
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
leaderboard-app/.gitignore
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# See https://help.github.com/articles/ignoring-files/ for more about ignoring files.
|
2 |
+
|
3 |
+
# dependencies
|
4 |
+
/node_modules
|
5 |
+
/.pnp
|
6 |
+
.pnp.*
|
7 |
+
.yarn/*
|
8 |
+
!.yarn/patches
|
9 |
+
!.yarn/plugins
|
10 |
+
!.yarn/releases
|
11 |
+
!.yarn/versions
|
12 |
+
|
13 |
+
# testing
|
14 |
+
/coverage
|
15 |
+
|
16 |
+
# next.js
|
17 |
+
/.next/
|
18 |
+
/out/
|
19 |
+
|
20 |
+
# production
|
21 |
+
/build
|
22 |
+
|
23 |
+
# misc
|
24 |
+
.DS_Store
|
25 |
+
*.pem
|
26 |
+
|
27 |
+
# debug
|
28 |
+
npm-debug.log*
|
29 |
+
yarn-debug.log*
|
30 |
+
yarn-error.log*
|
31 |
+
.pnpm-debug.log*
|
32 |
+
|
33 |
+
# env files (can opt-in for committing if needed)
|
34 |
+
.env*
|
35 |
+
|
36 |
+
# vercel
|
37 |
+
.vercel
|
38 |
+
|
39 |
+
# typescript
|
40 |
+
*.tsbuildinfo
|
41 |
+
next-env.d.ts
|
leaderboard-app/README.md
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# LLM Comparison Leaderboard
|
2 |
+
|
3 |
+
An interactive dashboard for comparing the performance of state-of-the-art large language models across various tasks and metrics.
|
4 |
+
|
5 |
+
## Features
|
6 |
+
|
7 |
+
- Overall model rankings with comprehensive scoring
|
8 |
+
- Task-specific performance analysis
|
9 |
+
- Metric breakdowns across different dimensions
|
10 |
+
- User satisfaction and experience metrics
|
11 |
+
- Interactive visualizations using Recharts
|
12 |
+
- Responsive design for all device sizes
|
13 |
+
|
14 |
+
## Getting Started
|
15 |
+
|
16 |
+
### Prerequisites
|
17 |
+
|
18 |
+
- Node.js 16.8 or later
|
19 |
+
- Python 3.8 or later (for data processing)
|
20 |
+
- Python packages: pandas, numpy
|
21 |
+
|
22 |
+
### Installation
|
23 |
+
|
24 |
+
1. Clone the repository:
|
25 |
+
|
26 |
+
```bash
|
27 |
+
git clone https://github.com/yourusername/llm-comparison-leaderboard.git
|
28 |
+
cd llm-comparison-leaderboard
|
29 |
+
```
|
30 |
+
|
31 |
+
2. Install dependencies:
|
32 |
+
|
33 |
+
```bash
|
34 |
+
npm install
|
35 |
+
```
|
36 |
+
|
37 |
+
3. Install Python dependencies (if you plan to process data):
|
38 |
+
|
39 |
+
```bash
|
40 |
+
pip install pandas numpy
|
41 |
+
```
|
42 |
+
|
43 |
+
### Using Sample Data
|
44 |
+
|
45 |
+
The repository includes a sample JSON file with placeholder data in `public/leaderboard_data.json`. You can start the development server right away to see the dashboard with this data:
|
46 |
+
|
47 |
+
```bash
|
48 |
+
npm run dev
|
49 |
+
```
|
50 |
+
|
51 |
+
Visit [http://localhost:3000](http://localhost:3000) to see the dashboard.
|
52 |
+
|
53 |
+
### Processing Your Own Data
|
54 |
+
|
55 |
+
If you have your own data, follow these steps:
|
56 |
+
|
57 |
+
1. Place your CSV data file in the `data` directory:
|
58 |
+
|
59 |
+
```bash
|
60 |
+
mkdir -p data
|
61 |
+
cp /path/to/your/pilot_data_n20.csv data/
|
62 |
+
```
|
63 |
+
|
64 |
+
2. Run the data processing script:
|
65 |
+
|
66 |
+
```bash
|
67 |
+
npm run process-data
|
68 |
+
```
|
69 |
+
|
70 |
+
This will:
|
71 |
+
- Process the CSV data using the Python script
|
72 |
+
- Generate a JSON file in the `public` directory
|
73 |
+
- Format the data for the dashboard
|
74 |
+
|
75 |
+
3. Start the development server:
|
76 |
+
|
77 |
+
```bash
|
78 |
+
npm run dev
|
79 |
+
```
|
80 |
+
|
81 |
+
## Project Structure
|
82 |
+
|
83 |
+
- `app/` - Next.js App Router components
|
84 |
+
- `page.js` - Main page component that loads data and renders dashboard
|
85 |
+
- `layout.js` - Layout component with metadata and global styles
|
86 |
+
- `globals.css` - Global styles including Tailwind CSS
|
87 |
+
- `components/` - React components
|
88 |
+
- `LLMComparisonDashboard.jsx` - The main dashboard component
|
89 |
+
- `public/` - Static files
|
90 |
+
- `leaderboard_data.json` - Processed data for the dashboard
|
91 |
+
- `lib/` - Utility functions
|
92 |
+
- `utils.js` - Helper functions for data processing
|
93 |
+
- `scripts/` - Data processing scripts
|
94 |
+
- `process_data.js` - Node.js script for running Python processor
|
95 |
+
- `process_data.py` - Python script for data processing
|
96 |
+
|
97 |
+
## Building for Production
|
98 |
+
|
99 |
+
To build the application for production:
|
100 |
+
|
101 |
+
```bash
|
102 |
+
npm run build
|
103 |
+
```
|
104 |
+
|
105 |
+
To start the production server:
|
106 |
+
|
107 |
+
```bash
|
108 |
+
npm run start
|
109 |
+
```
|
110 |
+
|
111 |
+
## License
|
112 |
+
|
113 |
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
leaderboard-app/app/favicon.ico
ADDED
|
leaderboard-app/app/globals.css
ADDED
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
@import "tailwindcss";

/* Base palette; consumed by the theme tokens and the body rule below. */
:root {
  --background: #ffffff;
  --foreground: #171717;
}

/* Expose the palette and font stacks as Tailwind theme tokens. */
@theme inline {
  --color-background: var(--background);
  --color-foreground: var(--foreground);
  /*
   * NOTE(review): --font-geist-sans / --font-geist-mono do not appear to be
   * defined by the root layout (it applies Inter through a class name), so
   * fallback stacks are supplied to keep these tokens resolvable. Confirm no
   * other file sets the Geist variables.
   */
  --font-sans: var(--font-geist-sans, Arial, Helvetica, sans-serif);
  --font-mono: var(--font-geist-mono, monospace);
}

/* Force light theme regardless of color scheme preference */
/* Disable dark mode
@media (prefers-color-scheme: dark) {
  :root {
    --background: #0a0a0a;
    --foreground: #ededed;
  }
}
*/

/* Page-wide defaults. */
body {
  background: var(--background);
  color: var(--foreground);
  font-family: Arial, Helvetica, sans-serif;
}
|
leaderboard-app/app/layout.js
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { Inter } from 'next/font/google';
|
2 |
+
import './globals.css';
|
3 |
+
|
4 |
+
const inter = Inter({ subsets: ['latin'] });
|
5 |
+
|
6 |
+
export const metadata = {
|
7 |
+
title: 'LLM Comparison Leaderboard',
|
8 |
+
description: 'Interactive leaderboard comparing performance of state-of-the-art large language models across various tasks and metrics.',
|
9 |
+
};
|
10 |
+
|
11 |
+
export default function RootLayout({ children }) {
|
12 |
+
return (
|
13 |
+
<html lang="en">
|
14 |
+
<body className={`${inter.className} bg-gray-50`}>
|
15 |
+
{children}
|
16 |
+
</body>
|
17 |
+
</html>
|
18 |
+
);
|
19 |
+
}
|
leaderboard-app/app/page.js
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'use client';
|
2 |
+
|
3 |
+
import { useState, useEffect } from 'react';
|
4 |
+
import dynamic from 'next/dynamic';
|
5 |
+
import { prepareDataForVisualization } from '../lib/utils';
|
6 |
+
|
7 |
+
// Dynamically import the dashboard component with SSR disabled
|
8 |
+
// This is important because recharts needs to be rendered on the client side
|
9 |
+
const LLMComparisonDashboard = dynamic(
|
10 |
+
() => import('../components/LLMComparisonDashboard'),
|
11 |
+
{ ssr: false }
|
12 |
+
);
|
13 |
+
|
14 |
+
export default function Home() {
|
15 |
+
const [data, setData] = useState(null);
|
16 |
+
const [loading, setLoading] = useState(true);
|
17 |
+
const [error, setError] = useState(null);
|
18 |
+
|
19 |
+
useEffect(() => {
|
20 |
+
async function fetchData() {
|
21 |
+
try {
|
22 |
+
setLoading(true);
|
23 |
+
|
24 |
+
// Fetch the data from the JSON file in the public directory
|
25 |
+
const response = await fetch('/leaderboard_data.json');
|
26 |
+
|
27 |
+
if (!response.ok) {
|
28 |
+
throw new Error(`Failed to fetch data: ${response.status} ${response.statusText}`);
|
29 |
+
}
|
30 |
+
|
31 |
+
const jsonData = await response.json();
|
32 |
+
|
33 |
+
// Process the data for visualization
|
34 |
+
const processedData = prepareDataForVisualization(jsonData);
|
35 |
+
|
36 |
+
setData(processedData);
|
37 |
+
setLoading(false);
|
38 |
+
} catch (err) {
|
39 |
+
console.error('Error loading data:', err);
|
40 |
+
setError(err.message || 'Failed to load data');
|
41 |
+
setLoading(false);
|
42 |
+
}
|
43 |
+
}
|
44 |
+
|
45 |
+
fetchData();
|
46 |
+
}, []);
|
47 |
+
|
48 |
+
if (loading) {
|
49 |
+
return (
|
50 |
+
<div className="flex items-center justify-center min-h-screen">
|
51 |
+
<div className="text-center">
|
52 |
+
<div className="animate-spin rounded-full h-12 w-12 border-b-2 border-blue-500 mx-auto mb-4"></div>
|
53 |
+
<p className="text-lg text-gray-600">Loading LLM comparison data...</p>
|
54 |
+
</div>
|
55 |
+
</div>
|
56 |
+
);
|
57 |
+
}
|
58 |
+
|
59 |
+
if (error) {
|
60 |
+
return (
|
61 |
+
<div className="flex items-center justify-center min-h-screen">
|
62 |
+
<div className="text-center max-w-md p-6 bg-red-50 rounded-lg border border-red-200">
|
63 |
+
<svg xmlns="http://www.w3.org/2000/svg" className="h-12 w-12 text-red-500 mx-auto mb-4" fill="none" viewBox="0 0 24 24" stroke="currentColor">
|
64 |
+
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M12 8v4m0 4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z" />
|
65 |
+
</svg>
|
66 |
+
<h2 className="text-xl font-bold text-red-700 mb-2">Error Loading Data</h2>
|
67 |
+
<p className="text-gray-600">{error}</p>
|
68 |
+
<button
|
69 |
+
onClick={() => window.location.reload()}
|
70 |
+
className="mt-4 px-4 py-2 bg-blue-500 text-white rounded hover:bg-blue-600 transition-colors"
|
71 |
+
>
|
72 |
+
Try Again
|
73 |
+
</button>
|
74 |
+
</div>
|
75 |
+
</div>
|
76 |
+
);
|
77 |
+
}
|
78 |
+
|
79 |
+
return (
|
80 |
+
<main className="min-h-screen p-4">
|
81 |
+
{data && <LLMComparisonDashboard data={data} />}
|
82 |
+
</main>
|
83 |
+
);
|
84 |
+
}
|
leaderboard-app/components/About.jsx
ADDED
@@ -0,0 +1,741 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"use client";
|
2 |
+
|
3 |
+
import React, { useState } from "react";
|
4 |
+
import {
|
5 |
+
ChevronDown,
|
6 |
+
ChevronUp,
|
7 |
+
Info,
|
8 |
+
Book,
|
9 |
+
Calculator,
|
10 |
+
BarChart,
|
11 |
+
UserCheck,
|
12 |
+
CheckCircle,
|
13 |
+
MessageCircle,
|
14 |
+
Brain,
|
15 |
+
SlidersHorizontal,
|
16 |
+
Shield,
|
17 |
+
Smile,
|
18 |
+
Globe,
|
19 |
+
} from "lucide-react";
|
20 |
+
|
21 |
+
const AboutTab = () => {
|
22 |
+
// Task list for easier management
|
23 |
+
const tasksUsed = [
|
24 |
+
"Following Up on Job Application: Drafting a professional follow-up email",
|
25 |
+
"Planning Weekly Meals: Creating a meal plan accommodating dietary restrictions",
|
26 |
+
"Creating Travel Itinerary: Planning a European city break",
|
27 |
+
"Understanding Complex Topic: Learning about day trading concepts",
|
28 |
+
"Generating Creative Ideas: Brainstorming unique birthday gift ideas",
|
29 |
+
"Making Decisions Between Options: Comparing tech products for purchase",
|
30 |
+
];
|
31 |
+
|
32 |
+
// State for collapsible sections
|
33 |
+
const [openSections, setOpenSections] = useState({
|
34 |
+
introduction: true,
|
35 |
+
methodology: true,
|
36 |
+
metricsCalculation: true,
|
37 |
+
metricsExplained: true,
|
38 |
+
});
|
39 |
+
|
40 |
+
// State for active metric tab
|
41 |
+
const [activeMetricTab, setActiveMetricTab] = useState("helpfulness");
|
42 |
+
|
43 |
+
// Toggle section visibility
|
44 |
+
const toggleSection = (section) => {
|
45 |
+
setOpenSections({
|
46 |
+
...openSections,
|
47 |
+
[section]: !openSections[section],
|
48 |
+
});
|
49 |
+
};
|
50 |
+
|
51 |
+
// Metrics data
|
52 |
+
const metricsData = [
|
53 |
+
{
|
54 |
+
id: "helpfulness",
|
55 |
+
title: "Helpfulness",
|
56 |
+
icon: <CheckCircle size={18} />,
|
57 |
+
color: "bg-green-500",
|
58 |
+
description:
|
59 |
+
"Evaluates how well the model provides useful, practical assistance that addresses the user's needs and helps them accomplish their goals.",
|
60 |
+
metrics: [
|
61 |
+
{
|
62 |
+
name: "Effectiveness",
|
63 |
+
description:
|
64 |
+
"How effectively did the model help you accomplish your specific goal?",
|
65 |
+
},
|
66 |
+
{
|
67 |
+
name: "Comprehensiveness",
|
68 |
+
description:
|
69 |
+
"How comprehensive was the model's response in addressing all aspects of your request?",
|
70 |
+
},
|
71 |
+
{
|
72 |
+
name: "Usefulness",
|
73 |
+
description:
|
74 |
+
"How useful were the model's suggestions or solutions for your needs?",
|
75 |
+
},
|
76 |
+
],
|
77 |
+
},
|
78 |
+
{
|
79 |
+
id: "communication",
|
80 |
+
title: "Communication",
|
81 |
+
icon: <MessageCircle size={18} />,
|
82 |
+
color: "bg-blue-500",
|
83 |
+
description:
|
84 |
+
"Assesses the clarity, coherence, and appropriateness of the model's writing style, including tone and language choices.",
|
85 |
+
metrics: [
|
86 |
+
{
|
87 |
+
name: "Tone and Language Style",
|
88 |
+
description:
|
89 |
+
"How well did the model match its tone and language style to the context of your interaction?",
|
90 |
+
},
|
91 |
+
{
|
92 |
+
name: "Conversation Flow",
|
93 |
+
description:
|
94 |
+
"How natural and conversational were the model's responses?",
|
95 |
+
},
|
96 |
+
{
|
97 |
+
name: "Detail and Technical Language",
|
98 |
+
description:
|
99 |
+
"How appropriate was the level of detail and technical language for your needs?",
|
100 |
+
},
|
101 |
+
],
|
102 |
+
},
|
103 |
+
{
|
104 |
+
id: "understanding",
|
105 |
+
title: "Understanding",
|
106 |
+
icon: <Brain size={18} />,
|
107 |
+
color: "bg-purple-500",
|
108 |
+
description:
|
109 |
+
"Measures how well the model comprehends the user's requests, including implicit needs and contextual information.",
|
110 |
+
metrics: [
|
111 |
+
{
|
112 |
+
name: "Accuracy",
|
113 |
+
description:
|
114 |
+
"How accurately did the model interpret your initial request?",
|
115 |
+
},
|
116 |
+
{
|
117 |
+
name: "Context Memory",
|
118 |
+
description:
|
119 |
+
"How well did the model maintain context throughout the conversation?",
|
120 |
+
},
|
121 |
+
{
|
122 |
+
name: "Intuitiveness",
|
123 |
+
description:
|
124 |
+
"How well did the model pick up on implicit aspects of your request without requiring explicit explanation?",
|
125 |
+
},
|
126 |
+
],
|
127 |
+
},
|
128 |
+
{
|
129 |
+
id: "adaptiveness",
|
130 |
+
title: "Adaptiveness",
|
131 |
+
icon: <SlidersHorizontal size={18} />,
|
132 |
+
color: "bg-amber-500",
|
133 |
+
description:
|
134 |
+
"Measures how well the model adjusts to different user needs, contexts, and feedback throughout a conversation.",
|
135 |
+
metrics: [
|
136 |
+
{
|
137 |
+
name: "Flexibility",
|
138 |
+
description:
|
139 |
+
"How effectively did the model adjust its responses based on your feedback?",
|
140 |
+
},
|
141 |
+
{
|
142 |
+
name: "Clarity",
|
143 |
+
description:
|
144 |
+
"How well did the model clarify ambiguities or misunderstandings?",
|
145 |
+
},
|
146 |
+
{
|
147 |
+
name: "Conversation Building",
|
148 |
+
description:
|
149 |
+
"How well did the model build upon previous exchanges in the conversation?",
|
150 |
+
},
|
151 |
+
],
|
152 |
+
},
|
153 |
+
{
|
154 |
+
id: "trustworthiness",
|
155 |
+
title: "Trustworthiness",
|
156 |
+
icon: <Shield size={18} />,
|
157 |
+
color: "bg-red-500",
|
158 |
+
description:
|
159 |
+
"Evaluates transparency, citations, acknowledgment of limitations, and overall user confidence in the model's responses.",
|
160 |
+
metrics: [
|
161 |
+
{
|
162 |
+
name: "Consistency",
|
163 |
+
description:
|
164 |
+
"How consistent were the model's responses across similar questions?",
|
165 |
+
},
|
166 |
+
{
|
167 |
+
name: "Confidence",
|
168 |
+
description:
|
169 |
+
"How confident were you in the accuracy of the model's information?",
|
170 |
+
},
|
171 |
+
{
|
172 |
+
name: "Transparency",
|
173 |
+
description:
|
174 |
+
"How transparent was the model about its limitations or uncertainties?",
|
175 |
+
},
|
176 |
+
],
|
177 |
+
},
|
178 |
+
{
|
179 |
+
id: "personality",
|
180 |
+
title: "Personality",
|
181 |
+
icon: <Smile size={18} />,
|
182 |
+
color: "bg-pink-500",
|
183 |
+
description:
|
184 |
+
"Assesses consistency and definition of the model's persona, and alignment with expectations of honesty, empathy, and fairness.",
|
185 |
+
metrics: [
|
186 |
+
{
|
187 |
+
name: "Personality Consistency",
|
188 |
+
description: "How consistent was the LLM's personality?",
|
189 |
+
},
|
190 |
+
{
|
191 |
+
name: "Distinct Personality",
|
192 |
+
description: "How well-defined was the LLM's personality?",
|
193 |
+
},
|
194 |
+
{
|
195 |
+
name: "Honesty Empathy Fairness",
|
196 |
+
description:
|
197 |
+
"How much did the LLM respond in a way that aligned with your expectations of honesty, empathy, or fairness?",
|
198 |
+
},
|
199 |
+
],
|
200 |
+
},
|
201 |
+
{
|
202 |
+
id: "background",
|
203 |
+
title: "Background and Culture",
|
204 |
+
icon: <Globe size={18} />,
|
205 |
+
color: "bg-teal-500",
|
206 |
+
description:
|
207 |
+
"Evaluates cultural sensitivity, alignment, relevance, and freedom from bias.",
|
208 |
+
metrics: [
|
209 |
+
{
|
210 |
+
name: "Ethical Alignment",
|
211 |
+
description:
|
212 |
+
"How aligned with your culture, viewpoint, or values was the LLM?",
|
213 |
+
},
|
214 |
+
{
|
215 |
+
name: "Cultural Awareness",
|
216 |
+
description:
|
217 |
+
"How well did the LLM recognize when your cultural perspective was relevant?",
|
218 |
+
},
|
219 |
+
{
|
220 |
+
name: "Bias and Stereotypes",
|
221 |
+
description:
|
222 |
+
"How free from stereotypes or bias was the LLM's response?",
|
223 |
+
},
|
224 |
+
],
|
225 |
+
},
|
226 |
+
];
|
227 |
+
|
228 |
+
// Section header component
|
229 |
+
const SectionHeader = ({ title, icon, section }) => (
|
230 |
+
<div
|
231 |
+
className="px-4 py-3 bg-gray-50 border-b flex justify-between items-center cursor-pointer"
|
232 |
+
onClick={() => toggleSection(section)}
|
233 |
+
>
|
234 |
+
<div className="flex items-center gap-2">
|
235 |
+
{icon}
|
236 |
+
<h3 className="font-semibold text-gray-800">{title}</h3>
|
237 |
+
</div>
|
238 |
+
{openSections[section] ? (
|
239 |
+
<ChevronUp size={16} />
|
240 |
+
) : (
|
241 |
+
<ChevronDown size={16} />
|
242 |
+
)}
|
243 |
+
</div>
|
244 |
+
);
|
245 |
+
|
246 |
+
return (
|
247 |
+
<div className="space-y-6">
|
248 |
+
{/* Introduction */}
|
249 |
+
<div className="border rounded-lg overflow-hidden shadow-sm">
|
250 |
+
<SectionHeader
|
251 |
+
title="About HUMAINE"
|
252 |
+
icon={<Info size={18} />}
|
253 |
+
section="introduction"
|
254 |
+
/>
|
255 |
+
{openSections.introduction && (
|
256 |
+
<div className="p-4 bg-gradient-to-r from-white to-blue-50">
|
257 |
+
<div className="flex flex-col md:flex-row gap-6">
|
258 |
+
<div className="md:w-2/3">
|
259 |
+
<p className="mb-4">
|
260 |
+
<strong>HUMAINE</strong> (Human Understanding and Measurement
|
261 |
+
of AI Natural Engagement) is an evaluation benchmark that
|
262 |
+
measures language model performance through actual user
|
263 |
+
experience. While many benchmarks focus on technical
|
264 |
+
capabilities, this evaluation captures how users perceive and
|
265 |
+
rate different LLMs across common, everyday use cases.
|
266 |
+
</p>
|
267 |
+
<p className="mb-4">
|
268 |
+
This study collected ratings from 514 participants
|
269 |
+
demographically representative of the US population. Each
|
270 |
+
participant completed real-world tasks with different LLMs and
|
271 |
+
provided structured feedback on various aspects of their
|
272 |
+
experience.
|
273 |
+
</p>
|
274 |
+
<p>
|
275 |
+
The evaluation framework includes 7 high-level categories and
|
276 |
+
21 specific low-level metrics that measure aspects like
|
277 |
+
helpfulness, communication quality, understanding,
|
278 |
+
adaptiveness, trustworthiness, personality, and cultural
|
279 |
+
awareness, alongside demographic equity analysis.
|
280 |
+
</p>
|
281 |
+
</div>
|
282 |
+
<div className="md:w-1/3 bg-white p-4 rounded-lg border shadow-sm">
|
283 |
+
<h4 className="font-medium text-gray-700 mb-2 border-b pb-1">
|
284 |
+
Tasks Evaluated
|
285 |
+
</h4>
|
286 |
+
<ul className="list-disc pl-5 space-y-2 text-sm">
|
287 |
+
{tasksUsed.map((task, index) => (
|
288 |
+
<li key={index} className="text-gray-700">
|
289 |
+
{task}
|
290 |
+
</li>
|
291 |
+
))}
|
292 |
+
</ul>
|
293 |
+
</div>
|
294 |
+
</div>
|
295 |
+
</div>
|
296 |
+
)}
|
297 |
+
</div>
|
298 |
+
|
299 |
+
{/* Methodology */}
|
300 |
+
<div className="border rounded-lg overflow-hidden shadow-sm">
|
301 |
+
<SectionHeader
|
302 |
+
title="Methodology"
|
303 |
+
icon={<Book size={18} />}
|
304 |
+
section="methodology"
|
305 |
+
/>
|
306 |
+
{openSections.methodology && (
|
307 |
+
<div className="p-4">
|
308 |
+
<div className="grid md:grid-cols-1 gap-4">
|
309 |
+
{/* Study Design */}
|
310 |
+
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
|
311 |
+
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
|
312 |
+
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
|
313 |
+
1
|
314 |
+
</span>
|
315 |
+
Study Design
|
316 |
+
</h4>
|
317 |
+
<ul className="list-disc pl-5 space-y-1 text-sm">
|
318 |
+
<li>
|
319 |
+
<strong>Participants:</strong> 514 individuals representing
|
320 |
+
US demographics (stratified by age, sex, ethnicity,
|
321 |
+
political affiliation).
|
322 |
+
</li>
|
323 |
+
<li>
|
324 |
+
<strong>Task Design:</strong> Six everyday tasks spanning
|
325 |
+
creative, practical, and analytical use cases.
|
326 |
+
</li>
|
327 |
+
<li>
|
328 |
+
<strong>Process:</strong> Each participant completed all six
|
329 |
+
tasks, each with a different LLM. The assignment of tasks to
|
330 |
+
models and the order of tasks were fully randomized.
|
331 |
+
</li>
|
332 |
+
<li>
|
333 |
+
<strong>Models Evaluated:</strong> Latest o1, GPT-4o, Claude
|
334 |
+
3.7 (extended thinking), Gemini 2 Flash, LLama 3.1 405B,
|
335 |
+
Deepseek R1.
|
336 |
+
</li>
|
337 |
+
<li>
|
338 |
+
<strong>Model Access:</strong> All models were accessed via
|
339 |
+
openrouter.ai with temperature=1, min_tokens=50,
|
340 |
+
max_tokens=5,000.
|
341 |
+
</li>
|
342 |
+
<li>
|
343 |
+
<strong>Conversations:</strong> Participants were required
|
344 |
+
to exchange at least 4 messages with the models and they
|
345 |
+
could exchange more if they wished (not capped).
|
346 |
+
</li>
|
347 |
+
</ul>
|
348 |
+
</div>
|
349 |
+
{/* Evaluation Framework */}
|
350 |
+
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
|
351 |
+
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
|
352 |
+
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
|
353 |
+
2
|
354 |
+
</span>
|
355 |
+
Evaluation Framework
|
356 |
+
</h4>
|
357 |
+
<p className="mb-2 text-sm">
|
358 |
+
Our approach captures multiple aspects of user experience:
|
359 |
+
</p>
|
360 |
+
<ul className="list-disc pl-5 space-y-1 text-sm">
|
361 |
+
<li>
|
362 |
+
<strong>Multi-Dimensional Metrics:</strong> Performance is
|
363 |
+
evaluated across 7 high-level categories (rated 1-7) and 21
|
364 |
+
specific low-level metrics (rated 1-5).
|
365 |
+
</li>
|
366 |
+
<li>
|
367 |
+
<strong>Demographic Analysis:</strong> We assess performance
|
368 |
+
consistency across different demographic groups through
|
369 |
+
equity assessment.
|
370 |
+
</li>
|
371 |
+
<li>
|
372 |
+
<strong>Scale Normalization:</strong> All ratings are
|
373 |
+
converted to a 0-100 scale for easier comparison.
|
374 |
+
</li>
|
375 |
+
</ul>
|
376 |
+
</div>
|
377 |
+
|
378 |
+
{/* Data Analysis & Weighting */}
|
379 |
+
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
|
380 |
+
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
|
381 |
+
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
|
382 |
+
3
|
383 |
+
</span>
|
384 |
+
Data Analysis & Weighting
|
385 |
+
</h4>
|
386 |
+
<ul className="list-disc pl-5 space-y-1 text-sm">
|
387 |
+
<li>
|
388 |
+
<strong>MRP Methodology:</strong> Data is processed through
|
389 |
+
multilevel regression with poststratification to create
|
390 |
+
results weighted to be highly representative of the US
|
391 |
+
population.
|
392 |
+
</li>
|
393 |
+
<li>
|
394 |
+
<strong>Robust Estimation:</strong> All model estimations
|
395 |
+
were parametrically bootstrapped (N = 1000) to ensure that
|
396 |
+
any uncertainty in the estimates was accounted for.
|
397 |
+
</li>
|
398 |
+
<li>
|
399 |
+
<strong>National Level Comparisons:</strong> For the Overall
|
400 |
+
Rankings and Metrics Breakdown tabs, we use the
|
401 |
+
national-level estimates derived from MRP.
|
402 |
+
</li>
|
403 |
+
<li>
|
404 |
+
<strong>Task-Level Comparisons:</strong> For task-specific
|
405 |
+
comparisons (Task Performance tab), we use the raw
|
406 |
+
(unweighted) data due to sample size constraints.
|
407 |
+
</li>
|
408 |
+
</ul>
|
409 |
+
</div>
|
410 |
+
|
411 |
+
{/* Demographic Equity Assessment */}
|
412 |
+
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
|
413 |
+
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
|
414 |
+
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
|
415 |
+
4
|
416 |
+
</span>
|
417 |
+
Demographic Equity Assessment
|
418 |
+
</h4>
|
419 |
+
<p className="mb-2 text-sm">
|
420 |
+
The equity assessment evaluates performance consistency across
|
421 |
+
demographic groups using a standardized approach:
|
422 |
+
</p>
|
423 |
+
<div className="bg-white rounded p-3 border mb-2">
|
424 |
+
<p className="text-xs mb-2">
|
425 |
+
The <strong>Equity Gap</strong> is the score difference
|
426 |
+
between the highest and lowest scoring demographic groups
|
427 |
+
for a specific metric. For example, if a model scores 85
|
428 |
+
with users age 18-29 but 65 with users age 60+ on
|
429 |
+
helpfulness, the equity gap would be 20 points.
|
430 |
+
</p>
|
431 |
+
<p className="text-xs mb-2">
|
432 |
+
We evaluate equity gaps using both{" "}
|
433 |
+
<strong>Effect Size</strong> and{" "}
|
434 |
+
<strong>Statistical Significance</strong> to identify
|
435 |
+
meaningful performance differences:
|
436 |
+
</p>
|
437 |
+
<div className="text-xs mt-2 space-y-2">
|
438 |
+
<div>
|
439 |
+
<p className="font-medium text-gray-700">
|
440 |
+
Effect Size Calculation:
|
441 |
+
</p>
|
442 |
+
<p className="text-gray-600 ml-2">
|
443 |
+
We normalize each gap by dividing it by the category's
|
444 |
+
standard deviation:
|
445 |
+
<br />
|
446 |
+
<span className="font-mono bg-gray-100 px-1">
|
447 |
+
Effect Size = (Max Score - Min Score) / Category
|
448 |
+
Standard Deviation
|
449 |
+
</span>
|
450 |
+
</p>
|
451 |
+
<p className="text-gray-600 ml-2 mt-1">
|
452 |
+
Category Standard Deviation is calculated from all
|
453 |
+
demographic MRP scores within that specific category.
|
454 |
+
</p>
|
455 |
+
</div>
|
456 |
+
|
457 |
+
<div>
|
458 |
+
<p className="font-medium text-gray-700">
|
459 |
+
Effect Size Classification:
|
460 |
+
</p>
|
461 |
+
<div className="grid grid-cols-2 gap-x-3 gap-y-2 mt-1">
|
462 |
+
<div className="flex items-center gap-1">
|
463 |
+
<div className="w-3 h-3 rounded-full bg-red-100"></div>
|
464 |
+
<div>
|
465 |
+
<span className="font-medium text-gray-700">
|
466 |
+
Large
|
467 |
+
</span>
|
468 |
+
<p className="text-gray-500">Effect Size ≥ 0.8</p>
|
469 |
+
</div>
|
470 |
+
</div>
|
471 |
+
<div className="flex items-center gap-1">
|
472 |
+
<div className="w-3 h-3 rounded-full bg-yellow-100"></div>
|
473 |
+
<div>
|
474 |
+
<span className="font-medium text-gray-700">
|
475 |
+
Medium
|
476 |
+
</span>
|
477 |
+
<p className="text-gray-500">Effect Size 0.5-0.8</p>
|
478 |
+
</div>
|
479 |
+
</div>
|
480 |
+
<div className="flex items-center gap-1">
|
481 |
+
<div className="w-3 h-3 rounded-full bg-blue-100"></div>
|
482 |
+
<div>
|
483 |
+
<span className="font-medium text-gray-700">
|
484 |
+
Small
|
485 |
+
</span>
|
486 |
+
<p className="text-gray-500">Effect Size 0.2-0.5</p>
|
487 |
+
</div>
|
488 |
+
</div>
|
489 |
+
<div className="flex items-center gap-1">
|
490 |
+
<div className="w-3 h-3 rounded-full bg-green-100"></div>
|
491 |
+
<div>
|
492 |
+
<span className="font-medium text-gray-700">
|
493 |
+
Negligible
|
494 |
+
</span>
|
495 |
+
<p className="text-gray-500">
|
496 |
+
Effect Size < 0.2
|
497 |
+
</p>
|
498 |
+
</div>
|
499 |
+
</div>
|
500 |
+
</div>
|
501 |
+
</div>
|
502 |
+
|
503 |
+
<div>
|
504 |
+
<p className="font-medium text-gray-700">
|
505 |
+
Statistical Significance:
|
506 |
+
</p>
|
507 |
+
<p className="text-gray-600 ml-2">
|
508 |
+
We use p-values to determine if gaps are statistically
|
509 |
+
significant (p < 0.05). To account for the large
|
510 |
+
number of tests performed, p-values were adjusted using
|
511 |
+
the Benjamini-Hochberg (FDR) method. Significance
|
512 |
+
reported reflects this correction (q < 0.05).
|
513 |
+
</p>
|
514 |
+
</div>
|
515 |
+
|
516 |
+
<div>
|
517 |
+
<p className="font-medium text-gray-700">
|
518 |
+
Equity Concerns:
|
519 |
+
</p>
|
520 |
+
<p className="text-gray-600 ml-2">
|
521 |
+
A gap is flagged as an equity concern when it has both:
|
522 |
+
<br />
|
523 |
+
1. Large Effect Size (≥ 0.8)
|
524 |
+
<br />
|
525 |
+
2. Statistical Significance (p < 0.05)
|
526 |
+
</p>
|
527 |
+
</div>
|
528 |
+
</div>
|
529 |
+
<p className="text-xs text-gray-600 mt-2">
|
530 |
+
<strong>Note:</strong> This methodology allows us to
|
531 |
+
identify meaningful performance differences across
|
532 |
+
demographic groups while accounting for both the magnitude
|
533 |
+
of the gap (effect size) and its statistical reliability
|
534 |
+
(significance).
|
535 |
+
</p>
|
536 |
+
</div>
|
537 |
+
</div>
|
538 |
+
</div>
|
539 |
+
</div>
|
540 |
+
)}
|
541 |
+
</div>
|
542 |
+
|
543 |
+
{/* Metrics Calculation */}
|
544 |
+
<div className="border rounded-lg overflow-hidden shadow-sm">
|
545 |
+
<SectionHeader
|
546 |
+
title="Metrics Calculation"
|
547 |
+
icon={<Calculator size={18} />}
|
548 |
+
section="metricsCalculation"
|
549 |
+
/>
|
550 |
+
{openSections.metricsCalculation && (
|
551 |
+
<div className="p-4">
|
552 |
+
<p className="text-sm mb-4">
|
553 |
+
This section explains how the metrics in the Overview page's
|
554 |
+
ranking table are calculated.
|
555 |
+
</p>
|
556 |
+
|
557 |
+
<div className="grid md:grid-cols-2 lg:grid-cols-3 gap-3">
|
558 |
+
<div className="border rounded p-3 hover:shadow-md transition-shadow">
|
559 |
+
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
|
560 |
+
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
|
561 |
+
Overall Score
|
562 |
+
</h4>
|
563 |
+
<p className="text-xs text-gray-600">
|
564 |
+
Average score across high-level categories at the national
|
565 |
+
level (0-100). This represents overall model performance
|
566 |
+
across all evaluation dimensions.
|
567 |
+
</p>
|
568 |
+
</div>
|
569 |
+
|
570 |
+
<div className="border rounded p-3 hover:shadow-md transition-shadow">
|
571 |
+
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
|
572 |
+
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
|
573 |
+
Overall SD
|
574 |
+
</h4>
|
575 |
+
<p className="text-xs text-gray-600">
|
576 |
+
Standard Deviation across high-level categories (lower = more
|
577 |
+
consistent). Measures how consistent a model performs across
|
578 |
+
different capability areas.
|
579 |
+
</p>
|
580 |
+
</div>
|
581 |
+
|
582 |
+
<div className="border rounded p-3 hover:shadow-md transition-shadow">
|
583 |
+
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
|
584 |
+
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
|
585 |
+
Max Equity Gap
|
586 |
+
</h4>
|
587 |
+
<p className="text-xs text-gray-600">
|
588 |
+
Largest demographic score difference (hover for details).
|
589 |
+
Shows the maximum difference in scores between any two
|
590 |
+
demographic groups, with indicators for effect size and
|
591 |
+
statistical significance.
|
592 |
+
</p>
|
593 |
+
</div>
|
594 |
+
|
595 |
+
<div className="border rounded p-3 hover:shadow-md transition-shadow">
|
596 |
+
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
|
597 |
+
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
|
598 |
+
Max Gap Area
|
599 |
+
</h4>
|
600 |
+
<p className="text-xs text-gray-600">
|
601 |
+
Factor and Category where the Max Equity Gap occurs.
|
602 |
+
Identifies which demographic factor (e.g., Age, Gender) and
|
603 |
+
which category (e.g., Helpfulness, Understanding) shows the
|
604 |
+
largest performance difference.
|
605 |
+
</p>
|
606 |
+
</div>
|
607 |
+
|
608 |
+
<div className="border rounded p-3 hover:shadow-md transition-shadow">
|
609 |
+
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
|
610 |
+
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
|
611 |
+
Equity Concerns
|
612 |
+
</h4>
|
613 |
+
<p className="text-xs text-gray-600">
|
614 |
+
Percentage of demographic gaps flagged as equity concerns
|
615 |
+
(lower is better). An equity concern is defined as a gap with
|
616 |
+
both large effect size (≥0.8) and statistical significance.
|
617 |
+
</p>
|
618 |
+
</div>
|
619 |
+
|
620 |
+
<div className="border rounded p-3 hover:shadow-md transition-shadow">
|
621 |
+
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
|
622 |
+
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
|
623 |
+
User Retention
|
624 |
+
</h4>
|
625 |
+
<p className="text-xs text-gray-600">
|
626 |
+
Percentage of participants who said they would use the model
|
627 |
+
again. This is based on the "Repeat Usage" question and
|
628 |
+
indicates user satisfaction and likelihood to continue using
|
629 |
+
the model.
|
630 |
+
</p>
|
631 |
+
</div>
|
632 |
+
</div>
|
633 |
+
|
634 |
+
<div className="mt-4 bg-blue-50 border-l-4 border-blue-400 p-3 rounded">
|
635 |
+
<p className="text-xs text-blue-800">
|
636 |
+
<strong>Note:</strong> All scores shown in the dashboard are
|
637 |
+
based on MRP-adjusted (Multilevel Regression with
|
638 |
+
Poststratification) estimates to ensure they are representative
|
639 |
+
of the US population. The only exception is the Task Performance
|
640 |
+
tab, which uses raw scores due to sample size constraints at the
|
641 |
+
task level.
|
642 |
+
</p>
|
643 |
+
</div>
|
644 |
+
</div>
|
645 |
+
)}
|
646 |
+
</div>
|
647 |
+
|
648 |
+
{/* Metrics Explained */}
|
649 |
+
<div className="border rounded-lg overflow-hidden shadow-sm">
|
650 |
+
<SectionHeader
|
651 |
+
title="Metrics Explained"
|
652 |
+
icon={<BarChart size={18} />}
|
653 |
+
section="metricsExplained"
|
654 |
+
/>
|
655 |
+
{openSections.metricsExplained && (
|
656 |
+
<div className="p-4">
|
657 |
+
<p className="mb-4 text-sm">
|
658 |
+
Our evaluation uses 7 high-level categories (rated on a 1-7 Likert
|
659 |
+
scale) and 21 low-level metrics (rated on a 1-5 scale) to
|
660 |
+
comprehensively assess LLM performance from a user experience
|
661 |
+
perspective.
|
662 |
+
</p>
|
663 |
+
|
664 |
+
{/* Metric selector tabs */}
|
665 |
+
<div className="flex flex-wrap gap-1 mb-4 border-b">
|
666 |
+
{metricsData.map((metric) => (
|
667 |
+
<button
|
668 |
+
key={metric.id}
|
669 |
+
className={`px-3 py-2 text-sm rounded-t-lg flex items-center gap-1 ${
|
670 |
+
activeMetricTab === metric.id
|
671 |
+
? "bg-gray-100 font-medium border-t border-l border-r"
|
672 |
+
: "bg-white hover:bg-gray-50"
|
673 |
+
}`}
|
674 |
+
onClick={() => setActiveMetricTab(metric.id)}
|
675 |
+
>
|
676 |
+
<span
|
677 |
+
className={`w-2 h-2 rounded-full ${metric.color}`}
|
678 |
+
></span>
|
679 |
+
{metric.title}
|
680 |
+
</button>
|
681 |
+
))}
|
682 |
+
</div>
|
683 |
+
|
684 |
+
{/* Active metric content */}
|
685 |
+
{metricsData.map(
|
686 |
+
(metric) =>
|
687 |
+
activeMetricTab === metric.id && (
|
688 |
+
<div
|
689 |
+
key={metric.id}
|
690 |
+
className="border rounded-lg overflow-hidden"
|
691 |
+
>
|
692 |
+
<div className="px-4 py-3 bg-gray-50 border-b flex items-center gap-2">
|
693 |
+
<div className={`rounded-full`}>
|
694 |
+
{React.cloneElement(metric.icon, {
|
695 |
+
className: `text-gray-700 w-5 h-5`,
|
696 |
+
})}
|
697 |
+
</div>
|
698 |
+
<h4 className="font-medium text-gray-800">
|
699 |
+
{metric.title}{" "}
|
700 |
+
<span className="text-sm font-normal text-gray-600">
|
701 |
+
(1-7 scale)
|
702 |
+
</span>
|
703 |
+
</h4>
|
704 |
+
</div>
|
705 |
+
<div className="p-4">
|
706 |
+
<p className="text-sm mb-4">{metric.description}</p>
|
707 |
+
|
708 |
+
{metric.metrics.length > 0 && (
|
709 |
+
<>
|
710 |
+
<h5 className="text-sm font-medium mb-3 text-gray-700">
|
711 |
+
Specific Metrics (1-5 scale)
|
712 |
+
</h5>
|
713 |
+
<div className="grid md:grid-cols-3 gap-3">
|
714 |
+
{metric.metrics.map((subMetric, idx) => (
|
715 |
+
<div
|
716 |
+
key={idx}
|
717 |
+
className="border rounded p-3 hover:shadow-sm transition-shadow"
|
718 |
+
>
|
719 |
+
<p className="text-sm font-medium">
|
720 |
+
{subMetric.name}
|
721 |
+
</p>
|
722 |
+
<p className="text-xs text-gray-600 mt-1">
|
723 |
+
{subMetric.description}
|
724 |
+
</p>
|
725 |
+
</div>
|
726 |
+
))}
|
727 |
+
</div>
|
728 |
+
</>
|
729 |
+
)}
|
730 |
+
</div>
|
731 |
+
</div>
|
732 |
+
)
|
733 |
+
)}
|
734 |
+
</div>
|
735 |
+
)}
|
736 |
+
</div>
|
737 |
+
</div>
|
738 |
+
);
|
739 |
+
};
|
740 |
+
|
741 |
+
export default AboutTab;
|
leaderboard-app/components/DemographicAnalysis.jsx
ADDED
@@ -0,0 +1,925 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// components/DemographicAnalysis.jsx - Complete Updated File
|
2 |
+
|
3 |
+
"use client";
|
4 |
+
|
5 |
+
import React, { useState, useMemo, useEffect, useRef } from "react";
|
6 |
+
import {
|
7 |
+
BarChart,
|
8 |
+
Bar,
|
9 |
+
XAxis,
|
10 |
+
YAxis,
|
11 |
+
CartesianGrid,
|
12 |
+
Tooltip as RechartsTooltip,
|
13 |
+
Legend,
|
14 |
+
ResponsiveContainer,
|
15 |
+
Cell,
|
16 |
+
LabelList,
|
17 |
+
} from "recharts";
|
18 |
+
import {
|
19 |
+
getSignificanceIndicator,
|
20 |
+
formatDisplayKey,
|
21 |
+
getMetricTooltip,
|
22 |
+
} from "../lib/utils"; // Adjust path as needed
|
23 |
+
import { Tooltip } from "./Tooltip"; // Your custom Tooltip component
|
24 |
+
|
25 |
+
// Helper component for info tooltips with fixed positioning
|
26 |
+
const InfoTooltip = ({ text }) => {
|
27 |
+
const [isVisible, setIsVisible] = useState(false);
|
28 |
+
const [position, setPosition] = useState({ top: 0, left: 0 });
|
29 |
+
const buttonRef = useRef(null);
|
30 |
+
|
31 |
+
// Update position when tooltip becomes visible
|
32 |
+
useEffect(() => {
|
33 |
+
if (isVisible && buttonRef.current) {
|
34 |
+
const rect = buttonRef.current.getBoundingClientRect();
|
35 |
+
setPosition({
|
36 |
+
top: rect.top - 10, // Position above the icon with a small gap
|
37 |
+
left: rect.left + 12, // Center with the icon
|
38 |
+
});
|
39 |
+
}
|
40 |
+
}, [isVisible]);
|
41 |
+
|
42 |
+
return (
|
43 |
+
<div className="relative inline-block ml-1 align-middle">
|
44 |
+
<button
|
45 |
+
ref={buttonRef}
|
46 |
+
className="text-gray-400 hover:text-gray-600 focus:outline-none"
|
47 |
+
onMouseEnter={() => setIsVisible(true)}
|
48 |
+
onMouseLeave={() => setIsVisible(false)}
|
49 |
+
onClick={(e) => {
|
50 |
+
e.stopPropagation();
|
51 |
+
setIsVisible(!isVisible);
|
52 |
+
}}
|
53 |
+
aria-label="Info"
|
54 |
+
>
|
55 |
+
<svg
|
56 |
+
xmlns="http://www.w3.org/2000/svg"
|
57 |
+
className="h-4 w-4"
|
58 |
+
viewBox="0 0 20 20"
|
59 |
+
fill="currentColor"
|
60 |
+
>
|
61 |
+
<path
|
62 |
+
fillRule="evenodd"
|
63 |
+
d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z"
|
64 |
+
clipRule="evenodd"
|
65 |
+
/>
|
66 |
+
</svg>
|
67 |
+
</button>
|
68 |
+
{isVisible && (
|
69 |
+
<div
|
70 |
+
className="fixed p-2 bg-white border-1 rounded shadow-xl text-xs text-gray-700 whitespace-pre-wrap"
|
71 |
+
style={{
|
72 |
+
top: `${position.top}px`,
|
73 |
+
left: `${position.left}px`,
|
74 |
+
zIndex: 9999,
|
75 |
+
maxWidth: "250px",
|
76 |
+
transform: "translate(-50%, -100%)",
|
77 |
+
}}
|
78 |
+
>
|
79 |
+
{text}
|
80 |
+
</div>
|
81 |
+
)}
|
82 |
+
</div>
|
83 |
+
);
|
84 |
+
};
|
85 |
+
|
86 |
+
// Custom tooltip for DEMOGRAPHIC chart (shows scores per model for a level)
|
87 |
+
const CustomDemographicTooltip = ({ active, payload, label }) => {
|
88 |
+
if (active && payload && payload.length) {
|
89 |
+
const sortedPayload = [...payload].sort(
|
90 |
+
(a, b) => (b.value || 0) - (a.value || 0)
|
91 |
+
);
|
92 |
+
return (
|
93 |
+
<div className="bg-white p-3 border rounded shadow-lg max-w-xs">
|
94 |
+
<p className="font-medium text-sm mb-1">{label}</p>
|
95 |
+
{sortedPayload.map((entry, index) => (
|
96 |
+
<div key={`item-${index}`} className="flex items-center mt-1">
|
97 |
+
<div
|
98 |
+
className="w-3 h-3 mr-2 rounded-full flex-shrink-0"
|
99 |
+
style={{
|
100 |
+
backgroundColor:
|
101 |
+
entry.payload[`${entry.dataKey}_color`] ||
|
102 |
+
entry.color ||
|
103 |
+
"#999",
|
104 |
+
}}
|
105 |
+
></div>
|
106 |
+
<span className="text-xs flex-grow pr-2">{entry.name}: </span>
|
107 |
+
<span className="text-xs font-medium ml-1 whitespace-nowrap">
|
108 |
+
{typeof entry.value === "number" ? entry.value.toFixed(1) : "N/A"}
|
109 |
+
</span>
|
110 |
+
</div>
|
111 |
+
))}
|
112 |
+
</div>
|
113 |
+
);
|
114 |
+
}
|
115 |
+
return null;
|
116 |
+
};
|
117 |
+
|
118 |
+
// Custom tooltip for EQUITY GAP chart - UPDATED
|
119 |
+
const EquityGapTooltip = ({ active, payload }) => {
|
120 |
+
if (active && payload && payload.length > 0) {
|
121 |
+
const data = payload[0].payload; // data here IS an item from equityGapChartData (derived from all_equity_gaps)
|
122 |
+
|
123 |
+
if (!data || typeof data !== "object") return null;
|
124 |
+
|
125 |
+
// Get significance indicator parts
|
126 |
+
const significanceInfo = getSignificanceIndicator(
|
127 |
+
data.is_statistically_significant,
|
128 |
+
data.p_value
|
129 |
+
);
|
130 |
+
const ciLower = data.gap_confidence_interval_95_lower;
|
131 |
+
const ciUpper = data.gap_confidence_interval_95_upper;
|
132 |
+
|
133 |
+
return (
|
134 |
+
<div className="bg-white p-3 border rounded shadow-lg text-xs max-w-xs">
|
135 |
+
<p className="font-medium text-sm mb-2">{data.model}</p>
|
136 |
+
<div className="space-y-1">
|
137 |
+
<div className="flex justify-between">
|
138 |
+
<span className="font-semibold">Equity Gap:</span>
|
139 |
+
{/* 'gap' key is used in chart data */}
|
140 |
+
<span>{data.gap?.toFixed(1) ?? "N/A"} pts</span>
|
141 |
+
</div>
|
142 |
+
{data.effect_size !== undefined && data.effect_size !== null && (
|
143 |
+
<div className="flex justify-between">
|
144 |
+
<span className="font-semibold">Effect Size:</span>
|
145 |
+
<span>
|
146 |
+
{data.effect_size?.toFixed(2) ?? "N/A"} (
|
147 |
+
{data.effect_size_class || "N/A"})
|
148 |
+
</span>
|
149 |
+
</div>
|
150 |
+
)}
|
151 |
+
{/* Show Significance */}
|
152 |
+
<div className="flex justify-between items-center">
|
153 |
+
<span className="font-semibold">Significance:</span>
|
154 |
+
<span className={`flex items-center ${significanceInfo.className}`}>
|
155 |
+
{significanceInfo.tooltip.replace(/Statistically /g, "")}{" "}
|
156 |
+
{/* Shorten text */}
|
157 |
+
<span className="ml-1 font-bold">{significanceInfo.symbol}</span>
|
158 |
+
</span>
|
159 |
+
</div>
|
160 |
+
{/* Show Confidence Interval */}
|
161 |
+
<div className="flex justify-between">
|
162 |
+
<span className="font-semibold">95% CI:</span>
|
163 |
+
<span>
|
164 |
+
{typeof ciLower === "number" && typeof ciUpper === "number"
|
165 |
+
? `[${ciLower.toFixed(1)}, ${ciUpper.toFixed(1)}]`
|
166 |
+
: "N/A"}
|
167 |
+
</span>
|
168 |
+
</div>
|
169 |
+
{/* Show Concern Flag */}
|
170 |
+
{data.is_equity_concern !== undefined && (
|
171 |
+
<div className="flex justify-between">
|
172 |
+
<span className="font-semibold">Concern Flag:</span>
|
173 |
+
<span
|
174 |
+
className={
|
175 |
+
data.is_equity_concern
|
176 |
+
? "font-bold text-red-600"
|
177 |
+
: "text-gray-600"
|
178 |
+
}
|
179 |
+
>
|
180 |
+
{data.is_equity_concern ? "Yes" : "No"}
|
181 |
+
</span>
|
182 |
+
</div>
|
183 |
+
)}
|
184 |
+
{/* Show Min/Max Groups */}
|
185 |
+
<div className="flex justify-between">
|
186 |
+
<span className="font-semibold">Lowest Group:</span>
|
187 |
+
<span>
|
188 |
+
{data.min_level || "N/A"} ({data.min_score?.toFixed(1) ?? "-"})
|
189 |
+
</span>
|
190 |
+
</div>
|
191 |
+
<div className="flex justify-between">
|
192 |
+
<span className="font-semibold">Highest Group:</span>
|
193 |
+
<span>
|
194 |
+
{data.max_level || "N/A"} ({data.max_score?.toFixed(1) ?? "-"})
|
195 |
+
</span>
|
196 |
+
</div>
|
197 |
+
</div>
|
198 |
+
</div>
|
199 |
+
);
|
200 |
+
}
|
201 |
+
return null;
|
202 |
+
};
|
203 |
+
|
204 |
+
// New helper functions for styling consistency
|
205 |
+
|
206 |
+
// New helper function to get badge color for effect size
|
207 |
+
/**
 * Returns the Tailwind background/text classes for an effect-size badge.
 *
 * @param {string} effectSizeClass - "Large", "Medium", "Small" or "Negligible"
 *   (matched exactly, case-sensitive).
 * @returns {string} Badge classes; a neutral gray pair for any other value.
 */
const getEffectSizeBadgeStyle = (effectSizeClass) => {
  if (effectSizeClass === "Large") {
    return "bg-red-100 text-red-800";
  }
  if (effectSizeClass === "Medium") {
    return "bg-yellow-100 text-yellow-800";
  }
  if (effectSizeClass === "Small") {
    return "bg-blue-100 text-blue-800";
  }
  if (effectSizeClass === "Negligible") {
    return "bg-green-100 text-green-800";
  }
  // Unknown or missing class: neutral styling.
  return "bg-gray-100 text-gray-800";
};
|
221 |
+
|
222 |
+
// New helper function to get badge color for significance
|
223 |
+
/**
 * Picks the Tailwind badge classes for a statistical-significance flag.
 *
 * @param {boolean|null|undefined} isSignificant - Significance flag; null/undefined means "unknown".
 * @returns {string} Blue badge when truthy, muted gray when falsy, neutral gray when unknown.
 */
const getSignificanceBadgeStyle = (isSignificant) => {
  const isUnknown = isSignificant === null || isSignificant === undefined;
  if (isUnknown) {
    return "bg-gray-100 text-gray-800";
  }
  if (isSignificant) {
    return "bg-blue-100 text-blue-800";
  }
  return "bg-gray-100 text-gray-600";
};
|
230 |
+
|
231 |
+
// New helper function to get badge color for concern
|
232 |
+
/**
 * Picks the Tailwind badge classes for an equity-concern flag.
 *
 * @param {boolean|null|undefined} isConcern - Concern flag; null/undefined means "unknown".
 * @returns {string} Red badge when truthy, green when falsy, neutral gray when unknown.
 */
const getConcernBadgeStyle = (isConcern) => {
  switch (isConcern) {
    case null:
    case undefined:
      return "bg-gray-100 text-gray-800";
    default:
      if (isConcern) {
        return "bg-red-100 text-red-800";
      }
      return "bg-green-100 text-green-800";
  }
};
|
237 |
+
|
238 |
+
// New helper function to format p-value
|
239 |
+
/**
 * Formats a p-value for display next to the significance badge.
 *
 * @param {number|string|null|undefined} pValue - Raw p-value. Numeric strings are parsed;
 *   anything that is not a finite number (null, undefined, NaN, placeholder strings such
 *   as "N/A") is treated as missing.
 * @returns {string} "N/A" when missing, otherwise e.g. "p=0.032 < 0.05" or "p=0.500 ≥ 0.05".
 */
const formatPValue = (pValue) => {
  if (pValue === null || pValue === undefined) return "N/A";
  // Data files may carry placeholder strings instead of numbers; calling
  // .toFixed() on those would throw and take the whole table down with it.
  const numericP =
    typeof pValue === "number" ? pValue : Number.parseFloat(pValue);
  if (!Number.isFinite(numericP)) return "N/A";
  const comparison = numericP < 0.05 ? " < 0.05" : " ≥ 0.05";
  return `p=${numericP.toFixed(3)}${comparison}`;
};
|
243 |
+
|
244 |
+
// New helper function to create effect size tooltip content
|
245 |
+
/**
 * Builds the multi-line tooltip text explaining an effect-size value.
 *
 * @param {number|string|null|undefined} effectSize - Normalized effect size. Numeric strings
 *   are parsed; values that are not finite numbers are shown as "N/A" instead of throwing.
 * @returns {string} Tooltip text: the value (2 decimals), the formula, and the class thresholds.
 */
const getEffectSizeTooltip = (effectSize) => {
  // Guard against non-numeric input: .toFixed() only exists on numbers.
  const numericEffectSize =
    typeof effectSize === "number" ? effectSize : Number.parseFloat(effectSize);
  const formattedEffectSize = Number.isFinite(numericEffectSize)
    ? numericEffectSize.toFixed(2)
    : "N/A";
  return `Effect Size: ${formattedEffectSize}

Calculation: Normalized Effect Size = (Max Score - Min Score) / Category Standard Deviation

Category Standard Deviation: The standard deviation of all demographic scores within this specific category.

Thresholds:
• ≥ 0.8: "Large"
• ≥ 0.5 and < 0.8: "Medium"
• ≥ 0.2 and < 0.5: "Small"
• < 0.2: "Negligible"`;
};
|
258 |
+
|
259 |
+
// Main component
|
260 |
+
const DemographicAnalysis = ({
|
261 |
+
rawData = { demographicOptions: {}, mrpDemographics: {} }, // Expect camelCase keys here, snake_case inside mrpDemographics
|
262 |
+
modelsMeta = [], // Expect camelCase keys
|
263 |
+
metricsData = { highLevelCategories: {}, lowLevelMetrics: {} }, // Expect Title Case keys, contains internalMetricKey
|
264 |
+
equityAnalysis = { all_equity_gaps: [], universal_issues: [] }, // Expect snake_case keys
|
265 |
+
}) => {
|
266 |
+
// Use Title Case metric keys for state and dropdowns
|
267 |
+
const highLevelMetricDisplayKeys = Object.keys(
|
268 |
+
metricsData?.highLevelCategories || {}
|
269 |
+
).sort();
|
270 |
+
const lowLevelMetricDisplayKeys = Object.keys(
|
271 |
+
metricsData?.lowLevelMetrics || {}
|
272 |
+
).sort();
|
273 |
+
|
274 |
+
const [selectedDemographicFactor, setSelectedDemographicFactor] =
|
275 |
+
useState(null);
|
276 |
+
const [selectedMetricDisplayKey, setSelectedMetricDisplayKey] =
|
277 |
+
useState(null); // State holds Title Case
|
278 |
+
const [metricLevel, setMetricLevel] = useState("high");
|
279 |
+
|
280 |
+
const currentMetricDisplayKeys = useMemo(
|
281 |
+
() =>
|
282 |
+
metricLevel === "high"
|
283 |
+
? highLevelMetricDisplayKeys
|
284 |
+
: lowLevelMetricDisplayKeys,
|
285 |
+
[metricLevel, highLevelMetricDisplayKeys, lowLevelMetricDisplayKeys]
|
286 |
+
);
|
287 |
+
|
288 |
+
const getModelColor = (modelName) =>
|
289 |
+
modelsMeta.find((m) => m.model === modelName)?.color || "#999999";
|
290 |
+
|
291 |
+
// Set default factor
|
292 |
+
useEffect(() => {
|
293 |
+
const factors = Object.keys(rawData.demographicOptions || {});
|
294 |
+
if (!selectedDemographicFactor && factors.length > 0) {
|
295 |
+
const defaultFactor = factors.includes("Age") ? "Age" : factors.sort()[0];
|
296 |
+
setSelectedDemographicFactor(defaultFactor);
|
297 |
+
}
|
298 |
+
}, [rawData.demographicOptions, selectedDemographicFactor]);
|
299 |
+
|
300 |
+
// Set default metric when list available
|
301 |
+
useEffect(() => {
|
302 |
+
if (!selectedMetricDisplayKey && currentMetricDisplayKeys.length > 0) {
|
303 |
+
// Default logic might need adjustment if "Overall" isn't a key
|
304 |
+
const defaultMetric = currentMetricDisplayKeys.includes("Overall Score")
|
305 |
+
? "Overall Score"
|
306 |
+
: currentMetricDisplayKeys[0];
|
307 |
+
setSelectedMetricDisplayKey(defaultMetric);
|
308 |
+
} else if (
|
309 |
+
selectedMetricDisplayKey &&
|
310 |
+
!currentMetricDisplayKeys.includes(selectedMetricDisplayKey)
|
311 |
+
) {
|
312 |
+
setSelectedMetricDisplayKey(
|
313 |
+
currentMetricDisplayKeys.length > 0 ? currentMetricDisplayKeys[0] : null
|
314 |
+
);
|
315 |
+
}
|
316 |
+
}, [currentMetricDisplayKeys, selectedMetricDisplayKey, metricLevel]);
|
317 |
+
|
318 |
+
// Get the internal snake_case key for filtering equity gaps
|
319 |
+
const internalMetricKey = useMemo(() => {
|
320 |
+
if (!selectedMetricDisplayKey) return null;
|
321 |
+
const allMetrics = {
|
322 |
+
...(metricsData?.highLevelCategories || {}),
|
323 |
+
...(metricsData?.lowLevelMetrics || {}),
|
324 |
+
};
|
325 |
+
// Look up using Title Case display key
|
326 |
+
return allMetrics[selectedMetricDisplayKey]?.internalMetricKey ?? null;
|
327 |
+
}, [selectedMetricDisplayKey, metricsData]);
|
328 |
+
|
329 |
+
// Filter equity gaps based on internal key and factor
|
330 |
+
const filteredEquityGaps = useMemo(() => {
|
331 |
+
// Use internalMetricKey (snake_case) and selectedDemographicFactor
|
332 |
+
if (
|
333 |
+
!internalMetricKey ||
|
334 |
+
!selectedDemographicFactor ||
|
335 |
+
!equityAnalysis?.all_equity_gaps ||
|
336 |
+
!Array.isArray(equityAnalysis.all_equity_gaps)
|
337 |
+
) {
|
338 |
+
return [];
|
339 |
+
}
|
340 |
+
// Filter all_equity_gaps (which has snake_case keys)
|
341 |
+
return equityAnalysis.all_equity_gaps.filter(
|
342 |
+
(gap) =>
|
343 |
+
gap.category === internalMetricKey &&
|
344 |
+
gap.demographic_factor === selectedDemographicFactor
|
345 |
+
);
|
346 |
+
}, [
|
347 |
+
internalMetricKey,
|
348 |
+
selectedDemographicFactor,
|
349 |
+
equityAnalysis?.all_equity_gaps,
|
350 |
+
]);
|
351 |
+
|
352 |
+
// Prepare data for Equity Gap Chart - uses snake_case keys from filteredEquityGaps
|
353 |
+
const equityGapChartData = useMemo(() => {
|
354 |
+
return filteredEquityGaps
|
355 |
+
.map((gap) => ({
|
356 |
+
// Pass all original snake_case keys needed by tooltip/table
|
357 |
+
// These keys match the fields expected by EquityGapTooltip
|
358 |
+
model: gap.model,
|
359 |
+
gap: gap.score_range ?? 0, // Rename score_range to gap for chart dataKey
|
360 |
+
score_range: gap.score_range,
|
361 |
+
effect_size: gap.effect_size,
|
362 |
+
effect_size_class: gap.effect_size_class,
|
363 |
+
is_statistically_significant: gap.is_statistically_significant,
|
364 |
+
p_value: gap.p_value,
|
365 |
+
gap_confidence_interval_95_lower: gap.gap_confidence_interval_95_lower,
|
366 |
+
gap_confidence_interval_95_upper: gap.gap_confidence_interval_95_upper,
|
367 |
+
is_equity_concern: gap.is_equity_concern,
|
368 |
+
min_level: gap.min_level,
|
369 |
+
min_score: gap.min_score,
|
370 |
+
max_level: gap.max_level,
|
371 |
+
max_score: gap.max_score,
|
372 |
+
|
373 |
+
// Add derived properties
|
374 |
+
color: getModelColor(gap.model),
|
375 |
+
}))
|
376 |
+
.sort((a, b) => (a.gap ?? 0) - (b.gap ?? 0)) // Sort by gap size ascending
|
377 |
+
.map((item, index) => ({ ...item, rank: index + 1 })); // Add rank based on gap size
|
378 |
+
}, [filteredEquityGaps]); // Depend only on filteredEquityGaps
|
379 |
+
|
380 |
+
// Prepare data for Demographic Breakdown Chart
|
381 |
+
const demographicChartData = useMemo(() => {
|
382 |
+
// selectedMetricDisplayKey is Title Case, matching keys in mrpDemographics
|
383 |
+
if (
|
384 |
+
!selectedDemographicFactor ||
|
385 |
+
!selectedMetricDisplayKey ||
|
386 |
+
!rawData.mrpDemographics
|
387 |
+
)
|
388 |
+
return [];
|
389 |
+
const metricKeyInData = selectedMetricDisplayKey; // Use Title Case key
|
390 |
+
const levels = rawData.demographicOptions[selectedDemographicFactor] || [];
|
391 |
+
if (levels.length === 0) return [];
|
392 |
+
|
393 |
+
const chartData = levels.map((level) => {
|
394 |
+
const entry = { level };
|
395 |
+
modelsMeta.forEach((model) => {
|
396 |
+
// Access mrpDemographics using Title Case metric key
|
397 |
+
const score =
|
398 |
+
rawData.mrpDemographics[model.model]?.[selectedDemographicFactor]?.[
|
399 |
+
level
|
400 |
+
]?.[metricKeyInData];
|
401 |
+
entry[model.model] =
|
402 |
+
score !== undefined && score !== null && score !== "N/A"
|
403 |
+
? parseFloat(score)
|
404 |
+
: null;
|
405 |
+
entry[`${model.model}_color`] = model.color;
|
406 |
+
});
|
407 |
+
return entry;
|
408 |
+
});
|
409 |
+
return chartData.sort((a, b) => {
|
410 |
+
if (a.level === "N/A") return 1;
|
411 |
+
if (b.level === "N/A") return -1;
|
412 |
+
return a.level.localeCompare(b.level);
|
413 |
+
});
|
414 |
+
}, [
|
415 |
+
selectedDemographicFactor,
|
416 |
+
selectedMetricDisplayKey,
|
417 |
+
rawData.mrpDemographics,
|
418 |
+
rawData.demographicOptions,
|
419 |
+
modelsMeta,
|
420 |
+
]);
|
421 |
+
|
422 |
+
const modelsWithDemoData = useMemo(
|
423 |
+
() =>
|
424 |
+
modelsMeta
|
425 |
+
.map((m) => m.model)
|
426 |
+
.filter((modelName) =>
|
427 |
+
demographicChartData.some(
|
428 |
+
(d) => d[modelName] !== null && d[modelName] !== undefined
|
429 |
+
)
|
430 |
+
),
|
431 |
+
[modelsMeta, demographicChartData]
|
432 |
+
);
|
433 |
+
|
434 |
+
return (
|
435 |
+
<div>
|
436 |
+
{/* Controls Panel */}
|
437 |
+
<div className="border rounded-lg overflow-hidden mb-6 shadow-sm">
|
438 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
439 |
+
<h3 className="font-semibold text-gray-800">
|
440 |
+
Demographic Analysis Controls
|
441 |
+
</h3>
|
442 |
+
</div>
|
443 |
+
<div className="p-4 grid grid-cols-1 md:grid-cols-3 gap-4">
|
444 |
+
{/* Factor Selector */}
|
445 |
+
<div>
|
446 |
+
<label
|
447 |
+
htmlFor="factorSelect"
|
448 |
+
className="block text-sm font-medium text-gray-700 mb-1"
|
449 |
+
>
|
450 |
+
Demographic Factor
|
451 |
+
</label>
|
452 |
+
<select
|
453 |
+
id="factorSelect"
|
454 |
+
className="w-full border rounded-md px-3 py-2 bg-white shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
|
455 |
+
value={selectedDemographicFactor || ""}
|
456 |
+
onChange={(e) => setSelectedDemographicFactor(e.target.value)}
|
457 |
+
>
|
458 |
+
<option value="" disabled>
|
459 |
+
Select factor
|
460 |
+
</option>
|
461 |
+
{Object.keys(rawData.demographicOptions || {})
|
462 |
+
.sort()
|
463 |
+
.map((factor) => (
|
464 |
+
<option key={factor} value={factor}>
|
465 |
+
{formatDisplayKey(factor)}
|
466 |
+
</option>
|
467 |
+
))}
|
468 |
+
</select>
|
469 |
+
</div>
|
470 |
+
{/* Level Toggle */}
|
471 |
+
<div>
|
472 |
+
<label className="block text-sm font-medium text-gray-700 mb-1">
|
473 |
+
Metric Level
|
474 |
+
</label>
|
475 |
+
<div className="flex">
|
476 |
+
<button
|
477 |
+
className={`px-3 py-2 text-sm font-medium border ${
|
478 |
+
metricLevel === "high"
|
479 |
+
? "bg-blue-100 text-blue-800 border-blue-300"
|
480 |
+
: "bg-white text-gray-700 border-gray-300 hover:bg-gray-50"
|
481 |
+
} rounded-l-md flex-1`}
|
482 |
+
onClick={() => setMetricLevel("high")}
|
483 |
+
>
|
484 |
+
High-Level
|
485 |
+
</button>
|
486 |
+
<button
|
487 |
+
className={`px-3 py-2 text-sm font-medium border-t border-b border-r ${
|
488 |
+
metricLevel === "low"
|
489 |
+
? "bg-blue-100 text-blue-800 border-blue-300"
|
490 |
+
: "bg-white text-gray-700 border-gray-300 hover:bg-gray-50"
|
491 |
+
} rounded-r-md flex-1`}
|
492 |
+
onClick={() => setMetricLevel("low")}
|
493 |
+
>
|
494 |
+
Low-Level
|
495 |
+
</button>
|
496 |
+
</div>
|
497 |
+
</div>
|
498 |
+
{/* Metric Selector - Uses Title Case keys */}
|
499 |
+
<div>
|
500 |
+
<label
|
501 |
+
htmlFor="metricSelect"
|
502 |
+
className="block text-sm font-medium text-gray-700 mb-1"
|
503 |
+
>
|
504 |
+
<Tooltip content={getMetricTooltip(selectedMetricDisplayKey)}>
|
505 |
+
<span>
|
506 |
+
{metricLevel === "high"
|
507 |
+
? "High-Level Category"
|
508 |
+
: "Low-Level Metric"}
|
509 |
+
</span>
|
510 |
+
</Tooltip>
|
511 |
+
</label>
|
512 |
+
<select
|
513 |
+
id="metricSelect"
|
514 |
+
className="w-full border rounded-md px-3 py-2 bg-white shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
|
515 |
+
value={selectedMetricDisplayKey || ""}
|
516 |
+
onChange={(e) => setSelectedMetricDisplayKey(e.target.value)}
|
517 |
+
disabled={currentMetricDisplayKeys.length === 0}
|
518 |
+
>
|
519 |
+
<option value="" disabled>
|
520 |
+
Select metric
|
521 |
+
</option>
|
522 |
+
{/* Iterate through Title Case keys */}
|
523 |
+
{currentMetricDisplayKeys.map((displayKey) => (
|
524 |
+
<option key={displayKey} value={displayKey}>
|
525 |
+
{displayKey}
|
526 |
+
</option>
|
527 |
+
))}
|
528 |
+
</select>
|
529 |
+
{!selectedMetricDisplayKey &&
|
530 |
+
currentMetricDisplayKeys.length > 0 && (
|
531 |
+
<p className="mt-1 text-xs text-gray-500">
|
532 |
+
Select a metric to view analysis.
|
533 |
+
</p>
|
534 |
+
)}
|
535 |
+
{currentMetricDisplayKeys.length === 0 && (
|
536 |
+
<p className="mt-1 text-xs text-amber-600">
|
537 |
+
No {metricLevel} metrics available.
|
538 |
+
</p>
|
539 |
+
)}
|
540 |
+
</div>
|
541 |
+
</div>
|
542 |
+
</div>
|
543 |
+
|
544 |
+
{/* Demographic Breakdown Chart */}
|
545 |
+
<div className="border rounded-lg overflow-hidden mb-6 shadow-sm">
|
546 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
547 |
+
<h3 className="font-semibold text-gray-800">
|
548 |
+
{selectedMetricDisplayKey || "Metric"} Scores across{" "}
|
549 |
+
{formatDisplayKey(selectedDemographicFactor) || "Groups"}
|
550 |
+
<InfoTooltip
|
551 |
+
text={`Shows the average score (0-100) for each model within each subgroup of ${formatDisplayKey(
|
552 |
+
selectedDemographicFactor
|
553 |
+
)}. Higher scores are better.`}
|
554 |
+
/>
|
555 |
+
</h3>
|
556 |
+
</div>
|
557 |
+
<div className="p-4">
|
558 |
+
{demographicChartData.length > 0 && modelsWithDemoData.length > 0 ? (
|
559 |
+
<div className="h-80">
|
560 |
+
<ResponsiveContainer width="100%" height="100%">
|
561 |
+
<BarChart
|
562 |
+
data={demographicChartData}
|
563 |
+
margin={{ top: 5, right: 5, left: 0, bottom: 60 }}
|
564 |
+
>
|
565 |
+
<CartesianGrid strokeDasharray="3 3" vertical={false} />
|
566 |
+
<XAxis
|
567 |
+
dataKey="level"
|
568 |
+
angle={-45}
|
569 |
+
textAnchor="end"
|
570 |
+
tick={{ fontSize: 11 }}
|
571 |
+
interval={0}
|
572 |
+
height={70}
|
573 |
+
/>
|
574 |
+
<YAxis domain={[0, 100]} tick={{ fontSize: 11 }} width={40} />
|
575 |
+
<RechartsTooltip
|
576 |
+
content={<CustomDemographicTooltip />}
|
577 |
+
wrapperStyle={{ zIndex: 10 }}
|
578 |
+
/>
|
579 |
+
<Legend
|
580 |
+
layout="horizontal"
|
581 |
+
verticalAlign="bottom"
|
582 |
+
align="center"
|
583 |
+
wrapperStyle={{ paddingTop: 30 }}
|
584 |
+
iconSize={10}
|
585 |
+
/>
|
586 |
+
{modelsWithDemoData.map((modelName) => (
|
587 |
+
<Bar
|
588 |
+
key={modelName}
|
589 |
+
dataKey={modelName}
|
590 |
+
name={modelName}
|
591 |
+
fill={getModelColor(modelName)}
|
592 |
+
/>
|
593 |
+
))}
|
594 |
+
</BarChart>
|
595 |
+
</ResponsiveContainer>
|
596 |
+
</div>
|
597 |
+
) : (
|
598 |
+
<div className="flex items-center justify-center h-60 bg-gray-50 rounded">
|
599 |
+
<div className="text-center p-4">
|
600 |
+
<svg
|
601 |
+
xmlns="http://www.w3.org/2000/svg"
|
602 |
+
className="h-10 w-10 mx-auto text-gray-400 mb-3"
|
603 |
+
fill="none"
|
604 |
+
viewBox="0 0 24 24"
|
605 |
+
stroke="currentColor"
|
606 |
+
>
|
607 |
+
<path
|
608 |
+
strokeLinecap="round"
|
609 |
+
strokeLinejoin="round"
|
610 |
+
strokeWidth={2}
|
611 |
+
d="M9 17v-2m3 2v-4m3 4v-6m2 10H7a2 2 0 01-2-2V7a2 2 0 012-2h2l2-3h6l2 3h2a2 2 0 012 2v10a2 2 0 01-2 2h-1"
|
612 |
+
/>
|
613 |
+
</svg>
|
614 |
+
<h3 className="text-lg font-medium text-gray-900 mb-1">
|
615 |
+
No Data Available
|
616 |
+
</h3>
|
617 |
+
<p className="text-sm text-gray-600">
|
618 |
+
{!selectedDemographicFactor
|
619 |
+
? "Please select a demographic factor."
|
620 |
+
: !selectedMetricDisplayKey
|
621 |
+
? "Please select a metric."
|
622 |
+
: "No score data found."}
|
623 |
+
</p>
|
624 |
+
</div>
|
625 |
+
</div>
|
626 |
+
)}
|
627 |
+
</div>
|
628 |
+
</div>
|
629 |
+
|
630 |
+
{/* Equity Gap Comparison Chart */}
|
631 |
+
<div className="border rounded-lg overflow-hidden mb-6 shadow-sm">
|
632 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
633 |
+
<h3 className="font-semibold text-gray-800">
|
634 |
+
Equity Gap Comparison for {selectedMetricDisplayKey || "Metric"}
|
635 |
+
<InfoTooltip
|
636 |
+
text={`Compares the maximum score difference observed between ${formatDisplayKey(
|
637 |
+
selectedDemographicFactor
|
638 |
+
)} groups for each model. Lower gaps indicate better equity.`}
|
639 |
+
/>
|
640 |
+
</h3>
|
641 |
+
</div>
|
642 |
+
<div className="p-4">
|
643 |
+
{equityGapChartData.length > 0 ? (
|
644 |
+
<div className="h-72">
|
645 |
+
<ResponsiveContainer width="100%" height="100%">
|
646 |
+
<BarChart
|
647 |
+
data={equityGapChartData}
|
648 |
+
margin={{ top: 5, right: 30, left: 5, bottom: 5 }}
|
649 |
+
layout="vertical"
|
650 |
+
>
|
651 |
+
<CartesianGrid
|
652 |
+
strokeDasharray="3 3"
|
653 |
+
horizontal={true}
|
654 |
+
vertical={false}
|
655 |
+
/>
|
656 |
+
<XAxis
|
657 |
+
type="number"
|
658 |
+
dataKey="gap"
|
659 |
+
domain={[0, "auto"]}
|
660 |
+
tick={{ fontSize: 11 }}
|
661 |
+
allowDecimals={false}
|
662 |
+
/>
|
663 |
+
<YAxis
|
664 |
+
dataKey="model"
|
665 |
+
type="category"
|
666 |
+
width={130}
|
667 |
+
tick={{ fontSize: 11 }}
|
668 |
+
/>
|
669 |
+
<RechartsTooltip
|
670 |
+
content={<EquityGapTooltip />}
|
671 |
+
wrapperStyle={{ zIndex: 10 }}
|
672 |
+
/>
|
673 |
+
<Bar
|
674 |
+
dataKey="gap"
|
675 |
+
name="Equity Gap"
|
676 |
+
barSize={20}
|
677 |
+
radius={[0, 4, 4, 0]}
|
678 |
+
>
|
679 |
+
{equityGapChartData.map((entry, index) => (
|
680 |
+
<Cell
|
681 |
+
key={`cell-${index}`}
|
682 |
+
fill={entry.color}
|
683 |
+
fillOpacity={0.8}
|
684 |
+
/>
|
685 |
+
))}
|
686 |
+
<LabelList
|
687 |
+
dataKey="gap"
|
688 |
+
position="right"
|
689 |
+
formatter={(value) => value?.toFixed(1) ?? ""}
|
690 |
+
style={{ fontSize: 11, fill: "#6b7280" }}
|
691 |
+
/>
|
692 |
+
</Bar>
|
693 |
+
</BarChart>
|
694 |
+
</ResponsiveContainer>
|
695 |
+
</div>
|
696 |
+
) : (
|
697 |
+
<div className="flex items-center justify-center h-60 bg-gray-50 rounded">
|
698 |
+
<div className="text-center p-4">
|
699 |
+
<svg
|
700 |
+
xmlns="http://www.w3.org/2000/svg"
|
701 |
+
className="h-10 w-10 mx-auto text-gray-400 mb-3"
|
702 |
+
fill="none"
|
703 |
+
viewBox="0 0 24 24"
|
704 |
+
stroke="currentColor"
|
705 |
+
>
|
706 |
+
<path
|
707 |
+
strokeLinecap="round"
|
708 |
+
strokeLinejoin="round"
|
709 |
+
strokeWidth={2}
|
710 |
+
d="M9 17v-2m3 2v-4m3 4v-6m2 10H7a2 2 0 01-2-2V7a2 2 0 012-2h2l2-3h6l2 3h2a2 2 0 012 2v10a2 2 0 01-2 2h-1"
|
711 |
+
/>
|
712 |
+
</svg>
|
713 |
+
<h3 className="text-lg font-medium text-gray-900 mb-1">
|
714 |
+
No Equity Gap Data
|
715 |
+
</h3>
|
716 |
+
<p className="text-sm text-gray-600">
|
717 |
+
{!selectedDemographicFactor
|
718 |
+
? "Select factor."
|
719 |
+
: !selectedMetricDisplayKey
|
720 |
+
? "Select metric."
|
721 |
+
: "No equity gaps found."}
|
722 |
+
</p>
|
723 |
+
</div>
|
724 |
+
</div>
|
725 |
+
)}
|
726 |
+
{equityGapChartData.length > 0 && (
|
727 |
+
<p className="mt-3 text-xs text-gray-500">
|
728 |
+
Chart ranks models by equity gap size (lower is better).
|
729 |
+
</p>
|
730 |
+
)}
|
731 |
+
</div>
|
732 |
+
</div>
|
733 |
+
|
734 |
+
{/* Equity Gap Details Table - IMPROVED */}
|
735 |
+
{equityGapChartData.length > 0 && (
|
736 |
+
<div className="border rounded-lg overflow-hidden mb-6 shadow-sm">
|
737 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
738 |
+
<h3 className="font-semibold text-gray-800">
|
739 |
+
Detailed Equity Gaps: {selectedMetricDisplayKey || "Metric"} by{" "}
|
740 |
+
{formatDisplayKey(selectedDemographicFactor) || "Factor"}
|
741 |
+
</h3>
|
742 |
+
</div>
|
743 |
+
<div className="p-4 overflow-x-auto">
|
744 |
+
<table className="min-w-full divide-y divide-gray-200">
|
745 |
+
<thead className="bg-gray-50">
|
746 |
+
<tr>
|
747 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
|
748 |
+
Rank
|
749 |
+
</th>
|
750 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
|
751 |
+
Model
|
752 |
+
</th>
|
753 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
|
754 |
+
Equity Gap
|
755 |
+
</th>
|
756 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
|
757 |
+
Effect Size
|
758 |
+
</th>
|
759 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
|
760 |
+
Significance
|
761 |
+
</th>
|
762 |
+
<th className="px-3 py-2 text-center text-xs font-medium text-gray-500 uppercase tracking-wider">
|
763 |
+
Concern?
|
764 |
+
</th>
|
765 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
|
766 |
+
Lowest Group (Score)
|
767 |
+
</th>
|
768 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider">
|
769 |
+
Highest Group (Score)
|
770 |
+
</th>
|
771 |
+
</tr>
|
772 |
+
</thead>
|
773 |
+
<tbody className="bg-white divide-y divide-gray-200">
|
774 |
+
{equityGapChartData.map((gap) => {
|
775 |
+
const minScoreDisplay =
|
776 |
+
typeof gap.min_score === "number"
|
777 |
+
? gap.min_score.toFixed(1)
|
778 |
+
: "-";
|
779 |
+
const maxScoreDisplay =
|
780 |
+
typeof gap.max_score === "number"
|
781 |
+
? gap.max_score.toFixed(1)
|
782 |
+
: "-";
|
783 |
+
|
784 |
+
return (
|
785 |
+
<tr
|
786 |
+
key={gap.model}
|
787 |
+
className={`hover:bg-gray-50 ${
|
788 |
+
gap.is_equity_concern ? "bg-red-50" : ""
|
789 |
+
}`}
|
790 |
+
>
|
791 |
+
<td className="px-3 py-2 whitespace-nowrap text-sm text-gray-500">
|
792 |
+
{gap.rank}
|
793 |
+
</td>
|
794 |
+
<td className="px-3 py-2 whitespace-nowrap">
|
795 |
+
<div className="flex items-center">
|
796 |
+
<div
|
797 |
+
className="w-3 h-3 rounded-full mr-2 flex-shrink-0"
|
798 |
+
style={{ backgroundColor: gap.color }}
|
799 |
+
></div>
|
800 |
+
<span className="text-sm font-medium text-gray-900">
|
801 |
+
{gap.model}
|
802 |
+
</span>
|
803 |
+
</div>
|
804 |
+
</td>
|
805 |
+
<td className="px-3 py-2 whitespace-nowrap text-sm font-medium">
|
806 |
+
{/* Equity Gap as plain text */}
|
807 |
+
{gap.gap !== undefined && gap.gap !== null
|
808 |
+
? gap.gap.toFixed(1)
|
809 |
+
: "N/A"}
|
810 |
+
</td>
|
811 |
+
<td className="px-3 py-2 whitespace-nowrap text-sm">
|
812 |
+
{gap.effect_size !== undefined &&
|
813 |
+
gap.effect_size !== null ? (
|
814 |
+
<div className="flex items-center">
|
815 |
+
<span
|
816 |
+
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getEffectSizeBadgeStyle(
|
817 |
+
gap.effect_size_class
|
818 |
+
)}`}
|
819 |
+
>
|
820 |
+
{gap.effect_size_class || "N/A"}
|
821 |
+
</span>
|
822 |
+
<InfoTooltip
|
823 |
+
text={getEffectSizeTooltip(gap.effect_size)}
|
824 |
+
/>
|
825 |
+
</div>
|
826 |
+
) : (
|
827 |
+
<span className="text-gray-500">N/A</span>
|
828 |
+
)}
|
829 |
+
</td>
|
830 |
+
<td className="px-3 py-2 whitespace-nowrap text-sm">
|
831 |
+
<div className="flex flex-col">
|
832 |
+
<div className="flex items-center">
|
833 |
+
<span
|
834 |
+
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getSignificanceBadgeStyle(
|
835 |
+
gap.is_statistically_significant
|
836 |
+
)}`}
|
837 |
+
>
|
838 |
+
{gap.is_statistically_significant ? (
|
839 |
+
<span>Significant ✔</span>
|
840 |
+
) : (
|
841 |
+
<span>Not Significant ✘</span>
|
842 |
+
)}
|
843 |
+
</span>
|
844 |
+
</div>
|
845 |
+
<div className="text-xs text-gray-500 mt-1">
|
846 |
+
{gap.p_value !== undefined && gap.p_value !== null
|
847 |
+
? formatPValue(gap.p_value)
|
848 |
+
: ""}
|
849 |
+
</div>
|
850 |
+
</div>
|
851 |
+
</td>
|
852 |
+
<td className="px-3 py-2 whitespace-nowrap text-sm text-center">
|
853 |
+
<span
|
854 |
+
className={`inline-block px-2 py-0.5 rounded-full text-xs font-medium ${getConcernBadgeStyle(
|
855 |
+
gap.is_equity_concern
|
856 |
+
)}`}
|
857 |
+
>
|
858 |
+
{gap.is_equity_concern ? "Yes" : "No"}
|
859 |
+
</span>
|
860 |
+
</td>
|
861 |
+
<td className="px-3 py-2 whitespace-nowrap text-sm">
|
862 |
+
{gap.min_level ? (
|
863 |
+
<div className="flex flex-col">
|
864 |
+
<span className="font-medium">{gap.min_level}</span>
|
865 |
+
<span className="text-gray-500">
|
866 |
+
{minScoreDisplay}
|
867 |
+
</span>
|
868 |
+
</div>
|
869 |
+
) : (
|
870 |
+
<span className="text-gray-500">-</span>
|
871 |
+
)}
|
872 |
+
</td>
|
873 |
+
<td className="px-3 py-2 whitespace-nowrap text-sm">
|
874 |
+
{gap.max_level ? (
|
875 |
+
<div className="flex flex-col">
|
876 |
+
<span className="font-medium">{gap.max_level}</span>
|
877 |
+
<span className="text-gray-500">
|
878 |
+
{maxScoreDisplay}
|
879 |
+
</span>
|
880 |
+
</div>
|
881 |
+
) : (
|
882 |
+
<span className="text-gray-500">-</span>
|
883 |
+
)}
|
884 |
+
</td>
|
885 |
+
</tr>
|
886 |
+
);
|
887 |
+
})}
|
888 |
+
</tbody>
|
889 |
+
</table>
|
890 |
+
</div>
|
891 |
+
{/* Table Footer/Explanation - IMPROVED */}
|
892 |
+
<div className="px-4 pb-4 pt-2 text-xs text-gray-600">
|
893 |
+
<div className="space-y-1">
|
894 |
+
<p>
|
895 |
+
<span className="font-semibold">Rank:</span> Based on lowest
|
896 |
+
Equity Gap value for this metric/factor
|
897 |
+
</p>
|
898 |
+
<p>
|
899 |
+
<span className="font-semibold">Equity Gap:</span> Score
|
900 |
+
difference (0-100 points) between highest and lowest scoring
|
901 |
+
groups
|
902 |
+
</p>
|
903 |
+
<p>
|
904 |
+
<span className="font-semibold">Effect Size:</span> Gap
|
905 |
+
magnitude relative to score variation (hover for details)
|
906 |
+
</p>
|
907 |
+
<p>
|
908 |
+
<span className="font-semibold">Significance:</span>Whether the
|
909 |
+
gap is statistically significant after adjusting for multiple
|
910 |
+
tests (Benjamini-Hochberg FDR correction, q<0.05)
|
911 |
+
</p>
|
912 |
+
<p>
|
913 |
+
<span className="font-semibold">Concern?:</span> 'Yes' flags
|
914 |
+
potential equity concerns (Large Effect Size AND Statistically
|
915 |
+
Significant)
|
916 |
+
</p>
|
917 |
+
</div>
|
918 |
+
</div>
|
919 |
+
</div>
|
920 |
+
)}
|
921 |
+
</div>
|
922 |
+
);
|
923 |
+
};
|
924 |
+
|
925 |
+
export default DemographicAnalysis;
|
leaderboard-app/components/LLMComparisonDashboard.jsx
ADDED
@@ -0,0 +1,639 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// components/LLMComparisonDashboard.jsx
|
2 |
+
|
3 |
+
"use client";
|
4 |
+
|
5 |
+
import React, { useState, useMemo } from "react";
|
6 |
+
import {
|
7 |
+
getScoreBadgeColor,
|
8 |
+
formatDisplayKey, // Use this for displaying snake_case keys nicely
|
9 |
+
getMetricTooltip,
|
10 |
+
getEquityIndicatorStyle, // Use this for Max Equity Gap status
|
11 |
+
} from "../lib/utils"; // Adjust path as needed
|
12 |
+
import TaskPerformance from "./TaskPerformance";
|
13 |
+
import DemographicAnalysis from "./DemographicAnalysis";
|
14 |
+
import MetricsBreakdown from "./MetricsBreakdown";
|
15 |
+
import About from "./About";
|
16 |
+
import { Tooltip } from "./Tooltip"; // Assuming this is your Tooltip component
|
17 |
+
|
18 |
+
// Helper component for info tooltips (assuming it exists and works)
|
19 |
+
const InfoTooltip = ({ text }) => {
|
20 |
+
const [isVisible, setIsVisible] = useState(false);
|
21 |
+
return (
|
22 |
+
<div className="relative inline-block ml-1 align-middle">
|
23 |
+
<button
|
24 |
+
className="text-gray-400 hover:text-gray-600 focus:outline-none"
|
25 |
+
onMouseEnter={() => setIsVisible(true)}
|
26 |
+
onMouseLeave={() => setIsVisible(false)}
|
27 |
+
onClick={(e) => {
|
28 |
+
e.stopPropagation();
|
29 |
+
setIsVisible(!isVisible);
|
30 |
+
}}
|
31 |
+
aria-label="Info"
|
32 |
+
>
|
33 |
+
<svg
|
34 |
+
xmlns="http://www.w3.org/2000/svg"
|
35 |
+
className="h-4 w-4"
|
36 |
+
viewBox="0 0 20 20"
|
37 |
+
fill="currentColor"
|
38 |
+
>
|
39 |
+
<path
|
40 |
+
fillRule="evenodd"
|
41 |
+
d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z"
|
42 |
+
clipRule="evenodd"
|
43 |
+
/>
|
44 |
+
</svg>
|
45 |
+
</button>
|
46 |
+
{isVisible && (
|
47 |
+
<div className="absolute z-10 w-64 p-2 bg-white border rounded shadow-lg text-xs text-gray-700 -translate-x-1/2 left-1/2 mt-1 normal-case">
|
48 |
+
{text}
|
49 |
+
</div>
|
50 |
+
)}
|
51 |
+
</div>
|
52 |
+
);
|
53 |
+
};
|
54 |
+
|
55 |
+
// Main dashboard component
|
56 |
+
const LLMComparisonDashboard = ({ data: processedData }) => {
|
57 |
+
const [activeTab, setActiveTab] = useState("overview");
|
58 |
+
const [topPerformersView, setTopPerformersView] = useState("high-level");
|
59 |
+
|
60 |
+
// Destructure data - top-level keys are camelCase
|
61 |
+
// Nested rawData and equityAnalysis retain original snake_case keys
|
62 |
+
const {
|
63 |
+
models: rankedModels = [], // This is overallRankingProcessed with camelCase keys
|
64 |
+
metricsData = { highLevelCategories: {}, lowLevelMetrics: {} }, // Title Case keys inside
|
65 |
+
radarData = [],
|
66 |
+
overviewCardData = {}, // camelCase keys inside expected
|
67 |
+
rawData = {
|
68 |
+
// camelCase keys for objects, snake_case keys inside those objects
|
69 |
+
taskLevelPerformance: {},
|
70 |
+
mrpDemographics: {},
|
71 |
+
demographicOptions: {},
|
72 |
+
availableMetrics: [], // Title Case
|
73 |
+
tasks: [],
|
74 |
+
taskCategories: {},
|
75 |
+
taskMetrics: [], // Title Case
|
76 |
+
taskMetricsSnake: [], // snake_case
|
77 |
+
taskCategoryMap: {},
|
78 |
+
},
|
79 |
+
bestPerCategory = {}, // Title Case keys
|
80 |
+
bestPerMetric = {}, // Title Case keys
|
81 |
+
equityAnalysis = {
|
82 |
+
// Original snake_case keys
|
83 |
+
all_equity_gaps: [],
|
84 |
+
model_max_effect_gaps: {},
|
85 |
+
universal_issues: [],
|
86 |
+
assessment_method: {},
|
87 |
+
demographic_variation_stats: {},
|
88 |
+
},
|
89 |
+
metadata = {}, // Original keys
|
90 |
+
} = processedData || {};
|
91 |
+
|
92 |
+
// Helper: badge color classes for the Max Equity Gap bubble
|
93 |
+
/**
 * Picks the background/text color classes for a model's Max Equity Gap badge.
 *
 * Precedence:
 *   - "Large" effect size that is also flagged as a concern and significant -> red
 *   - any other "Large" effect size                                        -> yellow
 *   - significant, but effect size is not "Large"                          -> blue
 *   - everything else                                                      -> gray
 *
 * @param {object} model - Ranked model entry (camelCase keys); reads
 *   `maxEffectConcernFlag`, `maxEffectSignificant` and `maxEffectSizeClass`.
 * @returns {string} Space-separated CSS class names for the badge.
 */
const getEquityGapBadgeColor = (model) => {
  const {
    maxEffectConcernFlag: flagged,
    maxEffectSignificant: significant,
    maxEffectSizeClass: sizeClass,
  } = model;

  if (sizeClass === "Large") {
    // Large effects are red only when they are also a flagged, significant concern.
    return flagged && significant
      ? "bg-red-100 text-red-800" // Equity Concern
      : "bg-yellow-100 text-yellow-800"; // Large Effect
  }

  return significant
    ? "bg-blue-100 text-blue-800" // Significant
    : "bg-gray-100 text-gray-800"; // No concern
};
|
110 |
+
|
111 |
+
// Renders the Max Equity Gap column cell as a colored bubble with a hover tooltip
|
112 |
+
const renderMaxEquityGapCell = (model) => {
|
113 |
+
// model object has camelCase keys
|
114 |
+
const gapValue = model.maxEffectGap;
|
115 |
+
const isConcern = model.maxEffectConcernFlag;
|
116 |
+
const significanceStatus = model.maxEffectSignificant;
|
117 |
+
const pValue = model.maxEffectPValue;
|
118 |
+
const effectSizeClass = model.maxEffectSizeClass;
|
119 |
+
const isLargeEffect = effectSizeClass === "Large";
|
120 |
+
// Access nested details using original snake_case keys
|
121 |
+
const gapDetails = model.maxEffectGapDetails || {};
|
122 |
+
const ciLower = gapDetails.gap_confidence_interval_95_lower;
|
123 |
+
const ciUpper = gapDetails.gap_confidence_interval_95_upper;
|
124 |
+
|
125 |
+
const displayValue =
|
126 |
+
typeof gapValue === "number" ? gapValue.toFixed(1) : "N/A";
|
127 |
+
if (displayValue === "N/A")
|
128 |
+
return <span className="text-xs text-gray-500">N/A</span>;
|
129 |
+
|
130 |
+
const indicator = getEquityIndicatorStyle(
|
131 |
+
isConcern,
|
132 |
+
isLargeEffect,
|
133 |
+
significanceStatus,
|
134 |
+
pValue,
|
135 |
+
effectSizeClass
|
136 |
+
);
|
137 |
+
let fullTooltipContent = indicator.tooltip;
|
138 |
+
if (typeof ciLower === "number" && typeof ciUpper === "number") {
|
139 |
+
fullTooltipContent += `\n95% CI: [${ciLower.toFixed(
|
140 |
+
1
|
141 |
+
)}, ${ciUpper.toFixed(1)}]`;
|
142 |
+
} else {
|
143 |
+
fullTooltipContent += `\n95% CI: N/A`;
|
144 |
+
}
|
145 |
+
|
146 |
+
return (
|
147 |
+
<Tooltip
|
148 |
+
content={
|
149 |
+
<div className="whitespace-pre-line">{fullTooltipContent}</div>
|
150 |
+
}
|
151 |
+
>
|
152 |
+
<span
|
153 |
+
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getEquityGapBadgeColor(
|
154 |
+
model
|
155 |
+
)}`}
|
156 |
+
>
|
157 |
+
{displayValue}
|
158 |
+
</span>
|
159 |
+
</Tooltip>
|
160 |
+
);
|
161 |
+
};
|
162 |
+
|
163 |
+
// Helper: badge color classes for the equity-concerns percentage
|
164 |
+
/**
 * Maps an equity-concern percentage to badge background/text color classes.
 *
 * Buckets:
 *   - missing or non-numeric value -> gray
 *   - exactly 0                    -> green
 *   - up to 2.5                    -> blue
 *   - up to 5                      -> yellow
 *   - above 5                      -> red
 *
 * @param {number|null|undefined} percentage - Share of demographic gaps
 *   flagged as concerns.
 * @returns {string} Space-separated CSS class names for the badge.
 */
const getEquityConcernBadgeColor = (percentage) => {
  // A value that does not coerce to a number (NaN, or a placeholder string such
  // as "N/A") fails every comparison below and would otherwise fall through to
  // the red bucket; treat it like a missing value instead.
  const isMissing =
    percentage === null ||
    percentage === undefined ||
    Number.isNaN(Number(percentage));
  if (isMissing) return "bg-gray-100 text-gray-800";

  if (percentage === 0) return "bg-green-100 text-green-800";
  if (percentage <= 2.5) return "bg-blue-100 text-blue-800";
  if (percentage <= 5) return "bg-yellow-100 text-yellow-800";
  return "bg-red-100 text-red-800";
};
|
172 |
+
|
173 |
+
return (
|
174 |
+
<div className="max-w-7xl mx-auto p-4 bg-white">
|
175 |
+
{/* Header */}
|
176 |
+
<div className="relative mb-6 overflow-hidden">
|
177 |
+
<div className="absolute inset-0 bg-gradient-to-br from-blue-50 to-sky-50 opacity-70"></div>
|
178 |
+
<div className="relative max-w-5xl mx-auto px-6 py-6">
|
179 |
+
<div className="text-center">
|
180 |
+
<h1 className="text-4xl font-bold mb-2 tracking-narrow text-blue-700">
|
181 |
+
Prolific's AI User Experience Leaderboard
|
182 |
+
</h1>
|
183 |
+
|
184 |
+
<p className="text-gray-600 max-w-4xl mx-auto">
|
185 |
+
A benchmark assessing how well language models handle real-world
|
186 |
+
tasks based on user experiences.
|
187 |
+
</p>
|
188 |
+
</div>
|
189 |
+
</div>
|
190 |
+
</div>
|
191 |
+
{/* Tab Buttons */}
|
192 |
+
<div className="flex flex-wrap mb-6 border-b">
|
193 |
+
{[
|
194 |
+
"overview",
|
195 |
+
"metrics-breakdown",
|
196 |
+
"task-performance",
|
197 |
+
"demographic-analysis",
|
198 |
+
"about",
|
199 |
+
].map((tab) => (
|
200 |
+
<button
|
201 |
+
key={tab}
|
202 |
+
className={`px-4 py-2 font-medium capitalize ${
|
203 |
+
activeTab === tab
|
204 |
+
? "text-blue-600 border-b-2 border-blue-600"
|
205 |
+
: "text-gray-500 hover:text-gray-700"
|
206 |
+
}`}
|
207 |
+
onClick={() => setActiveTab(tab)}
|
208 |
+
>
|
209 |
+
{" "}
|
210 |
+
{tab.replace("-", " ")}{" "}
|
211 |
+
</button>
|
212 |
+
))}
|
213 |
+
</div>
|
214 |
+
{/* Overview Tab */}
|
215 |
+
{activeTab === "overview" && (
|
216 |
+
<div>
|
217 |
+
{/* Overall Rankings Card */}
|
218 |
+
<div className="mb-6 border rounded-lg overflow-hidden shadow-sm">
|
219 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
220 |
+
<h2 className="text-xl font-semibold text-gray-800">
|
221 |
+
Overall Model Rankings
|
222 |
+
</h2>
|
223 |
+
</div>
|
224 |
+
<div className="p-4">
|
225 |
+
<div className="overflow-x-auto">
|
226 |
+
<table className="w-full min-w-[850px] table-auto divide-y divide-gray-200">
|
227 |
+
<thead>
|
228 |
+
<tr className="bg-gray-50">
|
229 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-12">
|
230 |
+
Rank
|
231 |
+
</th>
|
232 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-48">
|
233 |
+
Model
|
234 |
+
</th>
|
235 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-28">
|
236 |
+
<span>Overall Score</span>
|
237 |
+
</th>
|
238 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-24">
|
239 |
+
<span>Overall SD</span>
|
240 |
+
</th>
|
241 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-32">
|
242 |
+
<span>Max Equity Gap</span>
|
243 |
+
</th>
|
244 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-38">
|
245 |
+
<span>Max Gap Area</span>
|
246 |
+
</th>
|
247 |
+
<th className="px-3 py-2 text-center text-xs font-medium text-gray-500 uppercase tracking-wider w-36">
|
248 |
+
<span>Equity Concerns</span>
|
249 |
+
</th>
|
250 |
+
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider w-32">
|
251 |
+
<span>User Retention</span>
|
252 |
+
</th>
|
253 |
+
</tr>
|
254 |
+
</thead>
|
255 |
+
<tbody className="divide-y divide-gray-200">
|
256 |
+
{/* Use camelCase model object from rankedModels */}
|
257 |
+
{rankedModels.map((model) => (
|
258 |
+
<tr key={model.model} className="hover:bg-gray-50">
|
259 |
+
<td className="px-3 py-3 text-sm font-medium text-gray-900">
|
260 |
+
{model.rank}
|
261 |
+
</td>
|
262 |
+
<td className="px-3 py-3">
|
263 |
+
<div className="flex items-center">
|
264 |
+
<div
|
265 |
+
className="w-3 h-3 rounded-full mr-2 flex-shrink-0"
|
266 |
+
style={{ backgroundColor: model.color }}
|
267 |
+
></div>
|
268 |
+
<span className="text-sm font-medium text-gray-900">
|
269 |
+
{model.model}
|
270 |
+
</span>
|
271 |
+
</div>
|
272 |
+
</td>
|
273 |
+
<td className="px-3 py-3 text-sm font-semibold text-gray-800">
|
274 |
+
{model.overallScore !== null
|
275 |
+
? model.overallScore.toFixed(1)
|
276 |
+
: "N/A"}
|
277 |
+
</td>
|
278 |
+
<td className="px-3 py-3 text-sm text-gray-600">
|
279 |
+
{model.stdDevAcrossCats !== "N/A" &&
|
280 |
+
model.stdDevAcrossCats !== null
|
281 |
+
? `± ${Number(model.stdDevAcrossCats).toFixed(1)}`
|
282 |
+
: "N/A"}
|
283 |
+
</td>
|
284 |
+
<td className="px-3 py-3 text-sm">
|
285 |
+
{renderMaxEquityGapCell(model)}
|
286 |
+
</td>
|
287 |
+
<td className="px-3 py-3">
|
288 |
+
{model.maxEffectFactor &&
|
289 |
+
model.maxEffectFactor !== "N/A" ? (
|
290 |
+
<div className="flex flex-col">
|
291 |
+
<span className="text-xs font-medium text-gray-900">
|
292 |
+
{formatDisplayKey(model.maxEffectFactor)}
|
293 |
+
</span>
|
294 |
+
<span className="text-xs text-gray-500">
|
295 |
+
{formatDisplayKey(model.maxEffectCategory)}
|
296 |
+
</span>
|
297 |
+
</div>
|
298 |
+
) : (
|
299 |
+
<span className="text-xs text-gray-500">N/A</span>
|
300 |
+
)}
|
301 |
+
</td>
|
302 |
+
<td className="px-3 py-3 text-sm text-center">
|
303 |
+
{model.equityConcernPercentage !== null ? (
|
304 |
+
<span>
|
305 |
+
{model.equityConcernPercentage.toFixed(1)}%
|
306 |
+
</span>
|
307 |
+
) : (
|
308 |
+
<span className="text-xs text-gray-500">N/A</span>
|
309 |
+
)}
|
310 |
+
</td>
|
311 |
+
<td className="px-3 py-3 text-sm">
|
312 |
+
{model.repeatUsageScore !== null ? (
|
313 |
+
<span
|
314 |
+
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getScoreBadgeColor(
|
315 |
+
model.repeatUsageScore
|
316 |
+
)}`}
|
317 |
+
>
|
318 |
+
{model.repeatUsageScore.toFixed(1)}%
|
319 |
+
</span>
|
320 |
+
) : (
|
321 |
+
<span className="text-xs text-gray-500">N/A</span>
|
322 |
+
)}
|
323 |
+
</td>
|
324 |
+
</tr>
|
325 |
+
))}
|
326 |
+
</tbody>
|
327 |
+
</table>
|
328 |
+
</div>
|
329 |
+
{/* UPDATED: Vertical list for column descriptions with detailed info */}
|
330 |
+
<div className="mt-4 pt-3 border-t border-gray-200 text-xs text-gray-600">
|
331 |
+
{/* Column descriptions in vertical list */}
|
332 |
+
<div className="mb-2">
|
333 |
+
<div>
|
334 |
+
<span className="font-semibold">Overall Score:</span> Avg.
|
335 |
+
score across high-level categories
|
336 |
+
</div>
|
337 |
+
<div>
|
338 |
+
<span className="font-semibold">Overall SD:</span> Standard
|
339 |
+
deviation across high-level categories (lower = more
|
340 |
+
consistent)
|
341 |
+
</div>
|
342 |
+
<div>
|
343 |
+
<span className="font-semibold">Max Equity Gap:</span>{" "}
|
344 |
+
Largest demographic score difference (hover for details on
|
345 |
+
significance and effect size)
|
346 |
+
</div>
|
347 |
+
<div>
|
348 |
+
<span className="font-semibold">Max Gap Area:</span>{" "}
|
349 |
+
Demographic group and Category where the Max Equity Gap
|
350 |
+
occurs
|
351 |
+
</div>
|
352 |
+
<div>
|
353 |
+
<span className="font-semibold">Equity Concerns:</span>{" "}
|
354 |
+
Percentage of demographic gaps flagged as concerns (large
|
355 |
+
effect & statistically significant)
|
356 |
+
</div>
|
357 |
+
<div>
|
358 |
+
<span className="font-semibold">User Retention:</span>{" "}
|
359 |
+
Percentage of participants who said they would use the model
|
360 |
+
again
|
361 |
+
</div>
|
362 |
+
</div>
|
363 |
+
|
364 |
+
{/* Color key on a single line */}
|
365 |
+
<div className="mt-2 pt-2 border-t border-gray-100 flex flex-wrap items-center gap-x-4 gap-y-2">
|
366 |
+
<span className="font-semibold whitespace-nowrap">
|
367 |
+
Color Key:
|
368 |
+
</span>
|
369 |
+
<div className="flex items-center">
|
370 |
+
<span className="inline-block w-4 h-4 rounded-full bg-red-100 mr-1"></span>
|
371 |
+
<span>
|
372 |
+
Equity Concern (Large Effect & Statistically Significant)
|
373 |
+
</span>
|
374 |
+
</div>
|
375 |
+
<div className="flex items-center">
|
376 |
+
<span className="inline-block w-4 h-4 rounded-full bg-yellow-100 mr-1"></span>
|
377 |
+
<span>Large Effect (Not Statistically Significant)</span>
|
378 |
+
</div>
|
379 |
+
</div>
|
380 |
+
</div>
|
381 |
+
</div>
|
382 |
+
</div>
|
383 |
+
|
384 |
+
{/* Top Performers Section */}
|
385 |
+
<div className="mb-4 flex items-center">
|
386 |
+
<h3 className="font-semibold text-xl mr-4">
|
387 |
+
Top Performers by Category
|
388 |
+
</h3>
|
389 |
+
<div className="flex space-x-1 p-1 bg-gray-200 rounded-lg">
|
390 |
+
<button
|
391 |
+
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-colors duration-150 ${
|
392 |
+
topPerformersView === "high-level"
|
393 |
+
? "bg-white shadow text-blue-600"
|
394 |
+
: "text-gray-600 hover:text-gray-800"
|
395 |
+
}`}
|
396 |
+
onClick={() => setTopPerformersView("high-level")}
|
397 |
+
>
|
398 |
+
{" "}
|
399 |
+
High-Level Categories{" "}
|
400 |
+
</button>
|
401 |
+
<button
|
402 |
+
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-colors duration-150 ${
|
403 |
+
topPerformersView === "low-level"
|
404 |
+
? "bg-white shadow text-blue-600"
|
405 |
+
: "text-gray-600 hover:text-gray-800"
|
406 |
+
}`}
|
407 |
+
onClick={() => setTopPerformersView("low-level")}
|
408 |
+
>
|
409 |
+
{" "}
|
410 |
+
Low-Level Metrics{" "}
|
411 |
+
</button>
|
412 |
+
</div>
|
413 |
+
</div>
|
414 |
+
{/* Top Performers Tables - Access using Title Case keys */}
|
415 |
+
{topPerformersView === "high-level" && (
|
416 |
+
<div className="border rounded-lg overflow-hidden shadow-sm mb-6">
|
417 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
418 |
+
<h3 className="font-semibold text-gray-800">
|
419 |
+
Top Performers by High-Level Category
|
420 |
+
</h3>
|
421 |
+
</div>
|
422 |
+
<div className="p-4">
|
423 |
+
{Object.entries(bestPerCategory || {}).length > 0 ? (
|
424 |
+
<table className="min-w-full divide-y divide-gray-200">
|
425 |
+
<thead>
|
426 |
+
<tr>
|
427 |
+
<th
|
428 |
+
scope="col"
|
429 |
+
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider"
|
430 |
+
>
|
431 |
+
Category
|
432 |
+
</th>
|
433 |
+
<th
|
434 |
+
scope="col"
|
435 |
+
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider"
|
436 |
+
>
|
437 |
+
Best Model
|
438 |
+
</th>
|
439 |
+
<th
|
440 |
+
scope="col"
|
441 |
+
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider"
|
442 |
+
>
|
443 |
+
Score
|
444 |
+
</th>
|
445 |
+
</tr>
|
446 |
+
</thead>
|
447 |
+
<tbody className="bg-white divide-y divide-gray-200">
|
448 |
+
{Object.entries(bestPerCategory)
|
449 |
+
.sort(([a], [b]) => a.localeCompare(b))
|
450 |
+
.map(([catDisplayKey, bestInfo], idx) => (
|
451 |
+
<tr
|
452 |
+
key={catDisplayKey}
|
453 |
+
className={
|
454 |
+
idx % 2 === 0 ? "bg-white" : "bg-gray-50"
|
455 |
+
}
|
456 |
+
>
|
457 |
+
<td className="px-3 py-2 font-medium text-sm text-gray-900">
|
458 |
+
<Tooltip
|
459 |
+
content={getMetricTooltip(catDisplayKey)}
|
460 |
+
>
|
461 |
+
<span>{catDisplayKey}</span>
|
462 |
+
</Tooltip>
|
463 |
+
</td>
|
464 |
+
<td className="px-3 py-2">
|
465 |
+
{bestInfo.model !== "N/A" ? (
|
466 |
+
<div className="flex items-center">
|
467 |
+
<div
|
468 |
+
className="w-3 h-3 rounded-full mr-2 shrink-0"
|
469 |
+
style={{ backgroundColor: bestInfo.color }}
|
470 |
+
></div>
|
471 |
+
<span className="text-sm">
|
472 |
+
{bestInfo.model}
|
473 |
+
</span>
|
474 |
+
</div>
|
475 |
+
) : (
|
476 |
+
<span className="text-sm text-gray-500">
|
477 |
+
N/A
|
478 |
+
</span>
|
479 |
+
)}
|
480 |
+
</td>
|
481 |
+
<td className="px-3 py-2">
|
482 |
+
{bestInfo.score !== null ? (
|
483 |
+
<span
|
484 |
+
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getScoreBadgeColor(
|
485 |
+
bestInfo.score
|
486 |
+
)}`}
|
487 |
+
>
|
488 |
+
{bestInfo.score.toFixed(1)}
|
489 |
+
</span>
|
490 |
+
) : (
|
491 |
+
<span className="text-sm text-gray-500">
|
492 |
+
N/A
|
493 |
+
</span>
|
494 |
+
)}
|
495 |
+
</td>
|
496 |
+
</tr>
|
497 |
+
))}
|
498 |
+
</tbody>
|
499 |
+
</table>
|
500 |
+
) : (
|
501 |
+
<p className="text-center text-gray-500 py-4">
|
502 |
+
Top performer data not available.
|
503 |
+
</p>
|
504 |
+
)}
|
505 |
+
<p className="text-xs text-gray-500 mt-2">
|
506 |
+
Scores based on user ratings, normalized to 0-100.
|
507 |
+
</p>
|
508 |
+
</div>
|
509 |
+
</div>
|
510 |
+
)}
|
511 |
+
{topPerformersView === "low-level" && (
|
512 |
+
<div className="border rounded-lg overflow-hidden shadow-sm mb-6">
|
513 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
514 |
+
<h3 className="font-semibold text-gray-800">
|
515 |
+
Top Performers by Low-Level Metric
|
516 |
+
</h3>
|
517 |
+
</div>
|
518 |
+
<div className="p-4">
|
519 |
+
{Object.entries(bestPerMetric || {}).length > 0 ? (
|
520 |
+
<table className="min-w-full divide-y divide-gray-200">
|
521 |
+
<thead>
|
522 |
+
<tr>
|
523 |
+
<th
|
524 |
+
scope="col"
|
525 |
+
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider"
|
526 |
+
>
|
527 |
+
Metric
|
528 |
+
</th>
|
529 |
+
<th
|
530 |
+
scope="col"
|
531 |
+
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider"
|
532 |
+
>
|
533 |
+
Best Model
|
534 |
+
</th>
|
535 |
+
<th
|
536 |
+
scope="col"
|
537 |
+
className="px-3 py-2 text-left text-xs font-medium text-gray-500 uppercase tracking-wider"
|
538 |
+
>
|
539 |
+
Score
|
540 |
+
</th>
|
541 |
+
</tr>
|
542 |
+
</thead>
|
543 |
+
<tbody className="bg-white divide-y divide-gray-200">
|
544 |
+
{Object.entries(bestPerMetric)
|
545 |
+
.sort(([a], [b]) => a.localeCompare(b))
|
546 |
+
.map(([metricDisplayKey, bestInfo], idx) => (
|
547 |
+
<tr
|
548 |
+
key={metricDisplayKey}
|
549 |
+
className={
|
550 |
+
idx % 2 === 0 ? "bg-white" : "bg-gray-50"
|
551 |
+
}
|
552 |
+
>
|
553 |
+
<td className="px-3 py-2 font-medium text-sm text-gray-900">
|
554 |
+
<Tooltip
|
555 |
+
content={getMetricTooltip(metricDisplayKey)}
|
556 |
+
>
|
557 |
+
<span>{metricDisplayKey}</span>
|
558 |
+
</Tooltip>
|
559 |
+
</td>
|
560 |
+
<td className="px-3 py-2">
|
561 |
+
{bestInfo.model !== "N/A" ? (
|
562 |
+
<div className="flex items-center">
|
563 |
+
<div
|
564 |
+
className="w-3 h-3 rounded-full mr-2 shrink-0"
|
565 |
+
style={{ backgroundColor: bestInfo.color }}
|
566 |
+
></div>
|
567 |
+
<span className="text-sm">
|
568 |
+
{bestInfo.model}
|
569 |
+
</span>
|
570 |
+
</div>
|
571 |
+
) : (
|
572 |
+
<span className="text-sm text-gray-500">
|
573 |
+
N/A
|
574 |
+
</span>
|
575 |
+
)}
|
576 |
+
</td>
|
577 |
+
<td className="px-3 py-2">
|
578 |
+
{bestInfo.score !== null ? (
|
579 |
+
<span
|
580 |
+
className={`px-2 py-0.5 rounded-full text-xs font-medium ${getScoreBadgeColor(
|
581 |
+
bestInfo.score
|
582 |
+
)}`}
|
583 |
+
>
|
584 |
+
{bestInfo.score.toFixed(1)}
|
585 |
+
</span>
|
586 |
+
) : (
|
587 |
+
<span className="text-sm text-gray-500">
|
588 |
+
N/A
|
589 |
+
</span>
|
590 |
+
)}
|
591 |
+
</td>
|
592 |
+
</tr>
|
593 |
+
))}
|
594 |
+
</tbody>
|
595 |
+
</table>
|
596 |
+
) : (
|
597 |
+
<p className="text-center text-gray-500 py-4">
|
598 |
+
Low-level metric top performer data not available.
|
599 |
+
</p>
|
600 |
+
)}
|
601 |
+
<p className="text-xs text-gray-500 mt-2">
|
602 |
+
Scores based on user ratings, normalized to 0-100.
|
603 |
+
</p>
|
604 |
+
</div>
|
605 |
+
</div>
|
606 |
+
)}
|
607 |
+
</div>
|
608 |
+
)}{" "}
|
609 |
+
{/* End Overview Tab */}
|
610 |
+
{/* Other Tabs - Pass Correct Props */}
|
611 |
+
{activeTab === "metrics-breakdown" && (
|
612 |
+
<MetricsBreakdown
|
613 |
+
metricsData={metricsData} // Title Case keys inside, plus internalMetricKey
|
614 |
+
modelsMeta={rankedModels} // camelCase keys inside
|
615 |
+
radarData={radarData}
|
616 |
+
/>
|
617 |
+
)}
|
618 |
+
{activeTab === "task-performance" && (
|
619 |
+
<TaskPerformance
|
620 |
+
rawData={rawData} // Contains camelCase top-level, snake_case nested
|
621 |
+
modelsMeta={rankedModels}
|
622 |
+
metricsData={metricsData} // Title Case keys inside, plus internalMetricKey
|
623 |
+
overviewCardData={overviewCardData}
|
624 |
+
/>
|
625 |
+
)}
|
626 |
+
{activeTab === "demographic-analysis" && (
|
627 |
+
<DemographicAnalysis
|
628 |
+
rawData={rawData} // Contains camelCase top-level, snake_case/Title Case nested
|
629 |
+
modelsMeta={rankedModels}
|
630 |
+
metricsData={metricsData} // Title Case keys inside, plus internalMetricKey
|
631 |
+
equityAnalysis={equityAnalysis} // Original snake_case structure
|
632 |
+
/>
|
633 |
+
)}
|
634 |
+
{activeTab === "about" && <About metadata={metadata} />}
|
635 |
+
</div>
|
636 |
+
);
|
637 |
+
};
|
638 |
+
|
639 |
+
export default LLMComparisonDashboard;
|
leaderboard-app/components/MetricsBreakdown.jsx
ADDED
@@ -0,0 +1,447 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// components/MetricsBreakdown.jsx
|
2 |
+
|
3 |
+
"use client";
|
4 |
+
|
5 |
+
import React, { useState, useEffect, useMemo } from "react";
|
6 |
+
import {
|
7 |
+
RadarChart,
|
8 |
+
PolarGrid,
|
9 |
+
PolarAngleAxis,
|
10 |
+
PolarRadiusAxis,
|
11 |
+
Radar,
|
12 |
+
Tooltip as RechartsTooltip, // Renamed to avoid conflict with local Tooltip
|
13 |
+
Legend,
|
14 |
+
ResponsiveContainer,
|
15 |
+
} from "recharts";
|
16 |
+
import { getScoreColor, getMetricTooltip } from "../lib/utils";
|
17 |
+
import { Tooltip } from "./Tooltip"; // Your custom Tooltip component for headers etc.
|
18 |
+
|
19 |
+
// Component receives processed metrics data, model metadata, and category radar data
|
20 |
+
const MetricsBreakdown = ({
|
21 |
+
metricsData,
|
22 |
+
modelsMeta,
|
23 |
+
radarData: categoryRadarDataProp, // Already processed radar data for categories
|
24 |
+
}) => {
|
25 |
+
const [subTab, setSubTab] = useState("categories"); // 'categories' or 'metrics'
|
26 |
+
const [selectedModels, setSelectedModels] = useState([]);
|
27 |
+
|
28 |
+
// console.log("Metrics Data in Breakdown:", metricsData); // For debugging
|
29 |
+
// console.log("Models Meta in Breakdown:", modelsMeta);
|
30 |
+
// console.log("Category Radar Data Prop:", categoryRadarDataProp);
|
31 |
+
|
32 |
+
// Extract data from props with defaults
|
33 |
+
const { highLevelCategories, lowLevelMetrics } = metricsData || {
|
34 |
+
highLevelCategories: {},
|
35 |
+
lowLevelMetrics: {},
|
36 |
+
};
|
37 |
+
// Use modelsMeta directly for clarity, aliasing if preferred
|
38 |
+
const models = modelsMeta || [];
|
39 |
+
|
40 |
+
// Get sorted lists of category and metric names
|
41 |
+
const sortedCategoryNames = useMemo(
|
42 |
+
() =>
|
43 |
+
Object.keys(highLevelCategories || {}).sort((a, b) => a.localeCompare(b)),
|
44 |
+
[highLevelCategories]
|
45 |
+
);
|
46 |
+
const sortedMetricNames = useMemo(
|
47 |
+
() => Object.keys(lowLevelMetrics || {}).sort((a, b) => a.localeCompare(b)),
|
48 |
+
[lowLevelMetrics]
|
49 |
+
);
|
50 |
+
|
51 |
+
// Initialize selections
|
52 |
+
useEffect(() => {
|
53 |
+
if (selectedModels.length === 0 && models.length > 0) {
|
54 |
+
setSelectedModels(models.map((m) => m.model));
|
55 |
+
}
|
56 |
+
// eslint-disable-next-line react-hooks/exhaustive-deps
|
57 |
+
}, [models]); // Only depends on models changing/loading
|
58 |
+
|
59 |
+
// --- Memoized data generation functions ---
|
60 |
+
|
61 |
+
// Radar data for LL Metrics (used when subTab === 'metrics') - CORRECTED ACCESSORS
|
62 |
+
const metricRadarData = useMemo(() => {
|
63 |
+
if (
|
64 |
+
!lowLevelMetrics ||
|
65 |
+
models.length === 0 ||
|
66 |
+
sortedMetricNames.length === 0
|
67 |
+
)
|
68 |
+
return [];
|
69 |
+
return sortedMetricNames.map((metricName) => {
|
70 |
+
const entry = { category: metricName }; // Use metric name as the axis category
|
71 |
+
const metricData = lowLevelMetrics[metricName];
|
72 |
+
if (metricData) {
|
73 |
+
models
|
74 |
+
.filter((m) => selectedModels.includes(m.model))
|
75 |
+
.forEach((model) => {
|
76 |
+
// Use correct camelCase keys
|
77 |
+
entry[model.model] =
|
78 |
+
Number(metricData.modelScores?.[model.model]?.nationalScore) || 0;
|
79 |
+
// Standard deviation per metric is NOT available, so we don't add it here
|
80 |
+
});
|
81 |
+
}
|
82 |
+
return entry;
|
83 |
+
});
|
84 |
+
}, [lowLevelMetrics, models, selectedModels, sortedMetricNames]);
|
85 |
+
|
86 |
+
// Custom tooltip (common for both radar charts) - CORRECTED (removed std dev logic)
|
87 |
+
const CustomRadarTooltip = ({ active, payload, label }) => {
|
88 |
+
if (active && payload && payload.length) {
|
89 |
+
return (
|
90 |
+
<div className="bg-white p-3 border rounded shadow-lg max-w-xs opacity-95">
|
91 |
+
<p className="font-medium mb-1 text-gray-800">{label}</p>
|
92 |
+
{/* Get tooltip description for the category/metric itself */}
|
93 |
+
<p className="text-xs mb-3 text-gray-600 border-b pb-2">
|
94 |
+
{getMetricTooltip(label)}
|
95 |
+
</p>
|
96 |
+
<div className="space-y-1">
|
97 |
+
{payload
|
98 |
+
// Sort models by score within tooltip
|
99 |
+
.sort((a, b) => (b.value || 0) - (a.value || 0))
|
100 |
+
.map((entry) => (
|
101 |
+
<div
|
102 |
+
key={entry.dataKey} // dataKey is the model name here
|
103 |
+
className="flex items-center text-sm"
|
104 |
+
>
|
105 |
+
<div
|
106 |
+
className="w-2.5 h-2.5 rounded-full mr-2 flex-shrink-0"
|
107 |
+
style={{ backgroundColor: entry.color || "#8884d8" }}
|
108 |
+
></div>
|
109 |
+
<span className="mr-1 truncate flex-grow text-gray-700">
|
110 |
+
{entry.name}: {/* name is also the model name */}
|
111 |
+
</span>
|
112 |
+
<span className="font-medium flex-shrink-0 text-gray-900">
|
113 |
+
{/* Ensure value exists and format */}
|
114 |
+
{entry.value !== null && entry.value !== undefined
|
115 |
+
? Number(entry.value).toFixed(1)
|
116 |
+
: "N/A"}
|
117 |
+
{/* Removed standard deviation display */}
|
118 |
+
</span>
|
119 |
+
</div>
|
120 |
+
))}
|
121 |
+
</div>
|
122 |
+
</div>
|
123 |
+
);
|
124 |
+
}
|
125 |
+
return null;
|
126 |
+
};
|
127 |
+
|
128 |
+
// Use the radar data passed via prop for categories view, filtered by selected models - CORRECTED (removed std dev logic)
|
129 |
+
const filteredCategoryRadarData = useMemo(() => {
|
130 |
+
if (!categoryRadarDataProp || models.length === 0) return [];
|
131 |
+
// Filter based on selected models, removing std dev keys
|
132 |
+
return categoryRadarDataProp.map((item) => {
|
133 |
+
const newItem = { category: item.category };
|
134 |
+
models
|
135 |
+
.filter((m) => selectedModels.includes(m.model))
|
136 |
+
.forEach((model) => {
|
137 |
+
// We only need the model score itself for the radar data
|
138 |
+
newItem[model.model] = item[model.model] ?? 0; // Use nullish coalescing for default
|
139 |
+
});
|
140 |
+
return newItem;
|
141 |
+
});
|
142 |
+
}, [categoryRadarDataProp, models, selectedModels]);
|
143 |
+
|
144 |
+
return (
|
145 |
+
<>
|
146 |
+
{/* Top Controls: Model Selector & Sub-Tab Pills (No changes needed) */}
|
147 |
+
<div className="mb-6 flex flex-col md:flex-row justify-between items-center gap-4">
|
148 |
+
{/* Sub-Tab Pills */}
|
149 |
+
<div className="flex space-x-1 p-1 bg-gray-200 rounded-lg">
|
150 |
+
{" "}
|
151 |
+
<button
|
152 |
+
aria-pressed={subTab === "categories"}
|
153 |
+
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-colors duration-150 ${
|
154 |
+
subTab === "categories"
|
155 |
+
? "bg-white shadow text-blue-600"
|
156 |
+
: "text-gray-600 hover:text-gray-800"
|
157 |
+
}`}
|
158 |
+
onClick={() => setSubTab("categories")}
|
159 |
+
>
|
160 |
+
{" "}
|
161 |
+
High-Level Categories{" "}
|
162 |
+
</button>{" "}
|
163 |
+
<button
|
164 |
+
aria-pressed={subTab === "metrics"}
|
165 |
+
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-colors duration-150 ${
|
166 |
+
subTab === "metrics"
|
167 |
+
? "bg-white shadow text-blue-600"
|
168 |
+
: "text-gray-600 hover:text-gray-800"
|
169 |
+
}`}
|
170 |
+
onClick={() => setSubTab("metrics")}
|
171 |
+
>
|
172 |
+
{" "}
|
173 |
+
Low-Level Metrics{" "}
|
174 |
+
</button>{" "}
|
175 |
+
</div>
|
176 |
+
{/* Model Selector */}
|
177 |
+
<div className="flex items-center flex-wrap gap-1">
|
178 |
+
{" "}
|
179 |
+
<span className="text-sm text-gray-500 mr-2">Models:</span>{" "}
|
180 |
+
{models?.map((model) => (
|
181 |
+
<button
|
182 |
+
key={model.model}
|
183 |
+
className={`px-2 py-0.5 text-xs rounded border ${
|
184 |
+
selectedModels.includes(model.model)
|
185 |
+
? "bg-sky-100 text-sky-800 border-sky-300 font-medium"
|
186 |
+
: "bg-gray-100 text-gray-600 border-gray-300 hover:bg-gray-200"
|
187 |
+
}`}
|
188 |
+
onClick={() => {
|
189 |
+
if (selectedModels.includes(model.model)) {
|
190 |
+
if (selectedModels.length > 1) {
|
191 |
+
setSelectedModels(
|
192 |
+
selectedModels.filter((m) => m !== model.model)
|
193 |
+
);
|
194 |
+
}
|
195 |
+
} else {
|
196 |
+
setSelectedModels([...selectedModels, model.model]);
|
197 |
+
}
|
198 |
+
}}
|
199 |
+
>
|
200 |
+
{" "}
|
201 |
+
{model.model}{" "}
|
202 |
+
</button>
|
203 |
+
))}{" "}
|
204 |
+
</div>
|
205 |
+
</div>
|
206 |
+
|
207 |
+
{/* Conditional content based on sub-tab */}
|
208 |
+
{subTab === "categories" && (
|
209 |
+
<div className="space-y-6">
|
210 |
+
{/* CATEGORIES VIEW */}
|
211 |
+
{/* Summary Table: Models as Rows, Categories as Columns - CORRECTED ACCESSORS */}
|
212 |
+
<div className="border rounded-lg overflow-hidden shadow-sm">
|
213 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
214 |
+
<h3 className="font-semibold text-gray-800">
|
215 |
+
Category Performance Summary
|
216 |
+
</h3>
|
217 |
+
</div>
|
218 |
+
<div className="p-4 overflow-x-auto">
|
219 |
+
{sortedCategoryNames.length > 0 ? (
|
220 |
+
<table className="min-w-full divide-y divide-gray-200 border border-gray-200">
|
221 |
+
<thead>
|
222 |
+
<tr className="bg-gray-100">
|
223 |
+
<th
|
224 |
+
scope="col"
|
225 |
+
className="sticky left-0 bg-gray-100 px-3 py-2 text-left text-xs font-semibold text-gray-600 uppercase tracking-wider z-10"
|
226 |
+
>
|
227 |
+
Model
|
228 |
+
</th>
|
229 |
+
{sortedCategoryNames.map((catName) => (
|
230 |
+
<th
|
231 |
+
key={catName}
|
232 |
+
scope="col"
|
233 |
+
className="px-3 py-2 text-left text-xs font-semibold text-gray-600 uppercase tracking-wider whitespace-nowrap"
|
234 |
+
>
|
235 |
+
{catName}
|
236 |
+
</th>
|
237 |
+
))}
|
238 |
+
</tr>
|
239 |
+
</thead>
|
240 |
+
<tbody className="bg-white divide-y divide-gray-200">
|
241 |
+
{models
|
242 |
+
?.filter((m) => selectedModels.includes(m.model))
|
243 |
+
.map((model, idx) => (
|
244 |
+
<tr
|
245 |
+
key={model.model}
|
246 |
+
className={
|
247 |
+
idx % 2 === 0
|
248 |
+
? "bg-white hover:bg-gray-50"
|
249 |
+
: "bg-gray-50 hover:bg-gray-100"
|
250 |
+
}
|
251 |
+
>
|
252 |
+
<td className="sticky left-0 bg-inherit px-3 py-2 whitespace-nowrap z-10 text-left">
|
253 |
+
{" "}
|
254 |
+
{/* Keep sticky styles */}
|
255 |
+
<div className="flex items-center">
|
256 |
+
<div
|
257 |
+
className="w-3 h-3 rounded-full mr-2 shrink-0"
|
258 |
+
style={{ backgroundColor: model.color }}
|
259 |
+
></div>
|
260 |
+
<span className="text-sm font-medium">
|
261 |
+
{model.model}
|
262 |
+
</span>
|
263 |
+
</div>
|
264 |
+
</td>
|
265 |
+
{sortedCategoryNames.map((catName) => {
|
266 |
+
// Use correct camelCase keys
|
267 |
+
const scoreData =
|
268 |
+
highLevelCategories[catName]?.modelScores?.[
|
269 |
+
model.model
|
270 |
+
];
|
271 |
+
const score = scoreData?.nationalScore; // Access camelCase key
|
272 |
+
const displayScore =
|
273 |
+
score !== null && score !== undefined
|
274 |
+
? Number(score).toFixed(1)
|
275 |
+
: "N/A";
|
276 |
+
return (
|
277 |
+
<td
|
278 |
+
key={catName}
|
279 |
+
className="px-3 py-2 whitespace-nowrap text-center"
|
280 |
+
>
|
281 |
+
<div
|
282 |
+
className={`text-sm ${
|
283 |
+
displayScore === "N/A"
|
284 |
+
? "text-gray-400"
|
285 |
+
: getScoreColor(score)
|
286 |
+
}`}
|
287 |
+
>
|
288 |
+
{displayScore}
|
289 |
+
</div>
|
290 |
+
</td>
|
291 |
+
);
|
292 |
+
})}
|
293 |
+
</tr>
|
294 |
+
))}
|
295 |
+
</tbody>
|
296 |
+
</table>
|
297 |
+
) : (
|
298 |
+
<p className="text-center text-gray-500 py-4">
|
299 |
+
No category data available.
|
300 |
+
</p>
|
301 |
+
)}
|
302 |
+
</div>
|
303 |
+
</div>
|
304 |
+
|
305 |
+
{/* Radar Chart for Categories (Uses filteredCategoryRadarData) */}
|
306 |
+
<div className="border rounded-lg overflow-hidden shadow-sm">
|
307 |
+
<div className="px-4 py-3 bg-gray-50 border-b flex justify-between items-center">
|
308 |
+
<h3 className="font-semibold text-gray-800">
|
309 |
+
Performance Across Categories
|
310 |
+
</h3>
|
311 |
+
<div className="text-xs text-gray-500">
|
312 |
+
National Average Scores
|
313 |
+
</div>
|
314 |
+
</div>
|
315 |
+
<div className="p-4">
|
316 |
+
{filteredCategoryRadarData &&
|
317 |
+
filteredCategoryRadarData.length > 0 ? (
|
318 |
+
<div className="h-96 md:h-[450px]">
|
319 |
+
<ResponsiveContainer width="100%" height="100%">
|
320 |
+
<RadarChart
|
321 |
+
outerRadius="80%"
|
322 |
+
data={filteredCategoryRadarData}
|
323 |
+
>
|
324 |
+
<PolarGrid gridType="polygon" stroke="#e5e7eb" />
|
325 |
+
<PolarAngleAxis
|
326 |
+
dataKey="category"
|
327 |
+
tick={{ fill: "#4b5563", fontSize: 12 }}
|
328 |
+
/>
|
329 |
+
<PolarRadiusAxis
|
330 |
+
angle={90}
|
331 |
+
domain={[0, 100]}
|
332 |
+
axisLine={false}
|
333 |
+
tick={{ fill: "#6b7280", fontSize: 10 }}
|
334 |
+
/>
|
335 |
+
{models
|
336 |
+
?.filter((m) => selectedModels.includes(m.model))
|
337 |
+
.map((model) => (
|
338 |
+
<Radar
|
339 |
+
key={model.model}
|
340 |
+
name={model.model}
|
341 |
+
dataKey={model.model}
|
342 |
+
stroke={model.color}
|
343 |
+
fill={model.color}
|
344 |
+
fillOpacity={0.1}
|
345 |
+
strokeWidth={2}
|
346 |
+
/>
|
347 |
+
))}
|
348 |
+
{/* Use the corrected CustomRadarTooltip */}
|
349 |
+
<RechartsTooltip content={<CustomRadarTooltip />} />
|
350 |
+
<Legend
|
351 |
+
iconSize={10}
|
352 |
+
wrapperStyle={{ fontSize: "12px", paddingTop: "20px" }}
|
353 |
+
/>
|
354 |
+
</RadarChart>
|
355 |
+
</ResponsiveContainer>
|
356 |
+
</div>
|
357 |
+
) : (
|
358 |
+
<p className="text-center text-gray-500 py-4">
|
359 |
+
Radar data not available.
|
360 |
+
</p>
|
361 |
+
)}
|
362 |
+
<p className="text-xs text-gray-500 mt-4">
|
363 |
+
This radar chart visualizes how each model performs across
|
364 |
+
different high-level evaluation categories. The further out on
|
365 |
+
each axis, the better the performance on that category.
|
366 |
+
</p>
|
367 |
+
</div>
|
368 |
+
</div>
|
369 |
+
</div>
|
370 |
+
)}
|
371 |
+
|
372 |
+
{subTab === "metrics" && (
|
373 |
+
<div className="space-y-6">
|
374 |
+
{/* METRICS VIEW */}
|
375 |
+
{/* Radar Chart for Metrics (Uses metricRadarData) */}
|
376 |
+
<div className="border rounded-lg overflow-hidden shadow-sm">
|
377 |
+
<div className="px-4 py-3 bg-gray-50 border-b flex justify-between items-center">
|
378 |
+
<h3 className="font-semibold text-gray-800">
|
379 |
+
Performance Across All Metrics
|
380 |
+
</h3>
|
381 |
+
<div className="text-xs text-gray-500">
|
382 |
+
National Average Scores
|
383 |
+
</div>
|
384 |
+
</div>
|
385 |
+
<div className="p-4">
|
386 |
+
{metricRadarData.length > 0 ? (
|
387 |
+
<div className="h-96 md:h-[600px]">
|
388 |
+
{" "}
|
389 |
+
{/* Increased height */}
|
390 |
+
<ResponsiveContainer width="100%" height="100%">
|
391 |
+
<RadarChart outerRadius="80%" data={metricRadarData}>
|
392 |
+
{" "}
|
393 |
+
{/* Use metricRadarData */}
|
394 |
+
<PolarGrid gridType="polygon" stroke="#e5e7eb" />
|
395 |
+
<PolarAngleAxis
|
396 |
+
dataKey="category"
|
397 |
+
tick={{ fill: "#4b5563", fontSize: 10 }}
|
398 |
+
/>{" "}
|
399 |
+
{/* Adjusted font size */}
|
400 |
+
<PolarRadiusAxis
|
401 |
+
angle={90}
|
402 |
+
domain={[0, 100]}
|
403 |
+
axisLine={false}
|
404 |
+
tick={{ fill: "#6b7280", fontSize: 10 }}
|
405 |
+
/>
|
406 |
+
{models
|
407 |
+
?.filter((m) => selectedModels.includes(m.model))
|
408 |
+
.map((model) => (
|
409 |
+
<Radar
|
410 |
+
key={model.model}
|
411 |
+
name={model.model}
|
412 |
+
dataKey={model.model}
|
413 |
+
stroke={model.color}
|
414 |
+
fill={model.color}
|
415 |
+
fillOpacity={0.1}
|
416 |
+
strokeWidth={2}
|
417 |
+
/>
|
418 |
+
))}
|
419 |
+
{/* Use the corrected CustomRadarTooltip */}
|
420 |
+
<RechartsTooltip content={<CustomRadarTooltip />} />
|
421 |
+
<Legend
|
422 |
+
iconSize={10}
|
423 |
+
wrapperStyle={{ fontSize: "12px", paddingTop: "20px" }}
|
424 |
+
/>
|
425 |
+
</RadarChart>
|
426 |
+
</ResponsiveContainer>
|
427 |
+
</div>
|
428 |
+
) : (
|
429 |
+
<p className="text-center text-gray-500 py-4">
|
430 |
+
Metric data not available for radar chart.
|
431 |
+
</p>
|
432 |
+
)}
|
433 |
+
<p className="text-xs text-gray-500 mt-4">
|
434 |
+
This radar chart visualizes how each model performs across
|
435 |
+
different low-level metrics. The further out on each axis, the
|
436 |
+
better the performance on that metric.
|
437 |
+
</p>
|
438 |
+
</div>
|
439 |
+
</div>
|
440 |
+
{/* Optional: Add a table summary for low-level metrics similar to the categories one if desired */}
|
441 |
+
</div>
|
442 |
+
)}
|
443 |
+
</>
|
444 |
+
);
|
445 |
+
};
|
446 |
+
|
447 |
+
export default MetricsBreakdown;
|
leaderboard-app/components/TaskPerformance.jsx
ADDED
@@ -0,0 +1,756 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// components/TaskPerformance.jsx
|
2 |
+
|
3 |
+
"use client";
|
4 |
+
|
5 |
+
import React, { useState, useMemo, useEffect } from "react";
|
6 |
+
import {
|
7 |
+
BarChart,
|
8 |
+
Bar,
|
9 |
+
XAxis,
|
10 |
+
YAxis,
|
11 |
+
CartesianGrid,
|
12 |
+
Tooltip as RechartsTooltip,
|
13 |
+
ResponsiveContainer,
|
14 |
+
Cell,
|
15 |
+
} from "recharts";
|
16 |
+
import {
|
17 |
+
getMetricTooltip,
|
18 |
+
getScoreBadgeColor,
|
19 |
+
formatDisplayKey,
|
20 |
+
camelToTitle,
|
21 |
+
} from "../lib/utils"; // Import formatDisplayKey
|
22 |
+
|
23 |
+
// Helper component for info tooltips
|
24 |
+
const InfoTooltip = ({ text }) => {
|
25 |
+
/* ... (no change) ... */
|
26 |
+
const [isVisible, setIsVisible] = useState(false);
|
27 |
+
return (
|
28 |
+
<div className="relative inline-block ml-1 align-middle">
|
29 |
+
<button
|
30 |
+
className="text-gray-400 hover:text-gray-600 focus:outline-none"
|
31 |
+
onMouseEnter={() => setIsVisible(true)}
|
32 |
+
onMouseLeave={() => setIsVisible(false)}
|
33 |
+
onClick={(e) => {
|
34 |
+
e.stopPropagation();
|
35 |
+
setIsVisible(!isVisible);
|
36 |
+
}}
|
37 |
+
aria-label="Info"
|
38 |
+
>
|
39 |
+
<svg
|
40 |
+
xmlns="http://www.w3.org/2000/svg"
|
41 |
+
className="h-4 w-4"
|
42 |
+
viewBox="0 0 20 20"
|
43 |
+
fill="currentColor"
|
44 |
+
>
|
45 |
+
<path
|
46 |
+
fillRule="evenodd"
|
47 |
+
d="M18 10a8 8 0 11-16 0 8 8 0 0116 0zm-7-4a1 1 0 11-2 0 1 1 0 012 0zM9 9a1 1 0 000 2v3a1 1 0 001 1h1a1 1 0 100-2v-3a1 1 0 00-1-1H9z"
|
48 |
+
clipRule="evenodd"
|
49 |
+
/>
|
50 |
+
</svg>{" "}
|
51 |
+
</button>{" "}
|
52 |
+
{isVisible && (
|
53 |
+
<div className="absolute z-10 w-64 p-2 bg-white border rounded shadow-lg text-xs text-gray-700 -translate-x-1/2 left-1/2 mt-1">
|
54 |
+
{text}
|
55 |
+
</div>
|
56 |
+
)}{" "}
|
57 |
+
</div>
|
58 |
+
);
|
59 |
+
};
|
60 |
+
|
61 |
+
// Custom tooltip for charts
|
62 |
+
const CustomTooltip = ({ active, payload, label }) => {
|
63 |
+
/* ... (no change needed) ... */
|
64 |
+
if (active && payload && payload.length) {
|
65 |
+
const sortedPayload = [...payload].sort(
|
66 |
+
(a, b) => (b.value || 0) - (a.value || 0)
|
67 |
+
);
|
68 |
+
return (
|
69 |
+
<div className="bg-white p-3 border rounded shadow-lg max-w-xs">
|
70 |
+
<p className="font-medium text-sm">{label}</p>{" "}
|
71 |
+
{sortedPayload.map((entry, index) => (
|
72 |
+
<div key={`item-${index}`} className="flex items-center mt-1">
|
73 |
+
<div
|
74 |
+
className="w-3 h-3 mr-2 rounded-full flex-shrink-0"
|
75 |
+
style={{
|
76 |
+
backgroundColor:
|
77 |
+
entry.payload?.color || entry.color || "#8884d8",
|
78 |
+
}}
|
79 |
+
></div>{" "}
|
80 |
+
<span className="text-xs flex-grow pr-2">{entry.name}: </span>{" "}
|
81 |
+
<span className="text-xs font-medium ml-1 whitespace-nowrap">
|
82 |
+
{typeof entry.value === "number" ? entry.value.toFixed(1) : "N/A"}
|
83 |
+
</span>{" "}
|
84 |
+
</div>
|
85 |
+
))}{" "}
|
86 |
+
</div>
|
87 |
+
);
|
88 |
+
}
|
89 |
+
return null;
|
90 |
+
};
|
91 |
+
|
92 |
+
// Tab component
|
93 |
+
const TabButton = ({ active, onClick, children }) => (
|
94 |
+
<button
|
95 |
+
aria-pressed={active}
|
96 |
+
className={`px-4 py-1.5 text-sm font-medium rounded-md transition-colors duration-150 ${
|
97 |
+
active
|
98 |
+
? "bg-white shadow text-blue-600"
|
99 |
+
: "text-gray-600 hover:text-gray-800"
|
100 |
+
}`}
|
101 |
+
onClick={onClick}
|
102 |
+
>
|
103 |
+
{children}{" "}
|
104 |
+
</button>
|
105 |
+
);
|
106 |
+
|
107 |
+
// Main component
|
108 |
+
const TaskPerformance = ({
|
109 |
+
rawData,
|
110 |
+
modelsMeta,
|
111 |
+
metricsData, // Expects Title Case keys (e.g., Context Memory) containing internalMetricKey
|
112 |
+
overviewCardData,
|
113 |
+
}) => {
|
114 |
+
const [activeTab, setActiveTab] = useState("top-performers");
|
115 |
+
|
116 |
+
// *** Use Title Case metric keys from processed metricsData ***
|
117 |
+
const highLevelMetricDisplayKeys = useMemo(
|
118 |
+
() => Object.keys(metricsData?.highLevelCategories || {}).sort(),
|
119 |
+
[metricsData?.highLevelCategories]
|
120 |
+
);
|
121 |
+
const lowLevelMetricDisplayKeys = useMemo(
|
122 |
+
() => Object.keys(metricsData?.lowLevelMetrics || {}).sort(),
|
123 |
+
[metricsData?.lowLevelMetrics]
|
124 |
+
);
|
125 |
+
// **************************************************************
|
126 |
+
|
127 |
+
// Access original snake_case keys from rawData
|
128 |
+
const { taskLevelPerformance = {}, tasks = [] } = rawData || {};
|
129 |
+
const { bestModelPerTask = {} } = overviewCardData || {};
|
130 |
+
const models = modelsMeta || [];
|
131 |
+
|
132 |
+
// State for 'Model Performance' tab
|
133 |
+
const [selectedTask, setSelectedTask] = useState(
|
134 |
+
tasks.length > 0 ? tasks[0] : "all"
|
135 |
+
);
|
136 |
+
const [selectedMetricType, setSelectedMetricType] = useState("high");
|
137 |
+
// *** selectedMetric now stores the Title Case display key ***
|
138 |
+
const [selectedMetricDisplayKey, setSelectedMetricDisplayKey] = useState("");
|
139 |
+
// ***********************************************************
|
140 |
+
const [selectedModels, setSelectedModels] = useState([]);
|
141 |
+
|
142 |
+
// Determine current metrics list (Title Case display keys)
|
143 |
+
const currentMetricDisplayKeysList = useMemo(
|
144 |
+
() =>
|
145 |
+
selectedMetricType === "high"
|
146 |
+
? highLevelMetricDisplayKeys
|
147 |
+
: lowLevelMetricDisplayKeys,
|
148 |
+
[selectedMetricType, highLevelMetricDisplayKeys, lowLevelMetricDisplayKeys]
|
149 |
+
);
|
150 |
+
|
151 |
+
// Load models on mount
|
152 |
+
useEffect(() => {
|
153 |
+
if (models.length > 0 && selectedModels.length === 0) {
|
154 |
+
setSelectedModels(models.map((m) => m.model));
|
155 |
+
}
|
156 |
+
}, [models, selectedModels.length]);
|
157 |
+
|
158 |
+
// Set default metric display key when the list or type changes
|
159 |
+
useEffect(() => {
|
160 |
+
if (currentMetricDisplayKeysList.length > 0) {
|
161 |
+
if (
|
162 |
+
!selectedMetricDisplayKey ||
|
163 |
+
!currentMetricDisplayKeysList.includes(selectedMetricDisplayKey)
|
164 |
+
) {
|
165 |
+
setSelectedMetricDisplayKey(currentMetricDisplayKeysList[0]); // Set to the first Title Case key
|
166 |
+
}
|
167 |
+
} else {
|
168 |
+
setSelectedMetricDisplayKey("");
|
169 |
+
}
|
170 |
+
}, [currentMetricDisplayKeysList, selectedMetricDisplayKey]);
|
171 |
+
|
172 |
+
// Prep chart data - *** UPDATED to use internalMetricKey looked up via selectedMetricDisplayKey ***
|
173 |
+
const chartData = useMemo(() => {
|
174 |
+
if (
|
175 |
+
!taskLevelPerformance ||
|
176 |
+
!selectedMetricDisplayKey ||
|
177 |
+
selectedModels.length === 0
|
178 |
+
)
|
179 |
+
return [];
|
180 |
+
|
181 |
+
// Find the internal snake_case key using the selected Title Case display name
|
182 |
+
const allMetricsProcessed = {
|
183 |
+
...(metricsData?.highLevelCategories || {}),
|
184 |
+
...(metricsData?.lowLevelMetrics || {}),
|
185 |
+
};
|
186 |
+
const metricInfo = allMetricsProcessed[selectedMetricDisplayKey]; // Look up using Title Case key
|
187 |
+
const internalMetricKey = metricInfo?.internalMetricKey; // Access the stored snake_case key
|
188 |
+
|
189 |
+
if (!internalMetricKey) {
|
190 |
+
console.warn(
|
191 |
+
`Could not find internal key for selected metric: ${selectedMetricDisplayKey}`
|
192 |
+
);
|
193 |
+
return [];
|
194 |
+
}
|
195 |
+
|
196 |
+
let data = [];
|
197 |
+
if (selectedTask === "all") {
|
198 |
+
const modelAggregates = {};
|
199 |
+
tasks.forEach((task) => {
|
200 |
+
if (taskLevelPerformance[task]) {
|
201 |
+
Object.entries(taskLevelPerformance[task]).forEach(
|
202 |
+
([model, metrics]) => {
|
203 |
+
if (selectedModels.includes(model)) {
|
204 |
+
// *** Use the FOUND snake_case internalMetricKey ***
|
205 |
+
const score = metrics?.[internalMetricKey];
|
206 |
+
if (score !== undefined && score !== null && score !== "N/A") {
|
207 |
+
const numScore = parseFloat(score);
|
208 |
+
if (!isNaN(numScore)) {
|
209 |
+
if (!modelAggregates[model])
|
210 |
+
modelAggregates[model] = { sum: 0, count: 0 };
|
211 |
+
modelAggregates[model].sum += numScore;
|
212 |
+
modelAggregates[model].count++;
|
213 |
+
}
|
214 |
+
}
|
215 |
+
}
|
216 |
+
}
|
217 |
+
);
|
218 |
+
}
|
219 |
+
});
|
220 |
+
data = Object.entries(modelAggregates).map(([model, aggregates]) => {
|
221 |
+
const modelMeta = models.find((m) => m.model === model) || {};
|
222 |
+
return {
|
223 |
+
model: model,
|
224 |
+
score:
|
225 |
+
aggregates.count > 0 ? aggregates.sum / aggregates.count : null,
|
226 |
+
color: modelMeta.color || "#999999",
|
227 |
+
};
|
228 |
+
});
|
229 |
+
} else if (taskLevelPerformance[selectedTask]) {
|
230 |
+
data = Object.entries(taskLevelPerformance[selectedTask])
|
231 |
+
.filter(([model, _metrics]) => selectedModels.includes(model))
|
232 |
+
.map(([model, metrics]) => {
|
233 |
+
// *** Use the FOUND snake_case internalMetricKey ***
|
234 |
+
const score = metrics?.[internalMetricKey];
|
235 |
+
const modelMeta = models.find((m) => m.model === model) || {};
|
236 |
+
return {
|
237 |
+
model: model,
|
238 |
+
score:
|
239 |
+
score !== undefined && score !== null && score !== "N/A"
|
240 |
+
? parseFloat(score)
|
241 |
+
: null,
|
242 |
+
color: modelMeta.color || "#999999",
|
243 |
+
};
|
244 |
+
});
|
245 |
+
}
|
246 |
+
|
247 |
+
return data
|
248 |
+
.filter((item) => item.score !== null && !isNaN(item.score))
|
249 |
+
.sort((a, b) => b.score - a.score);
|
250 |
+
// Update dependencies
|
251 |
+
}, [
|
252 |
+
selectedTask,
|
253 |
+
selectedMetricDisplayKey,
|
254 |
+
selectedModels,
|
255 |
+
taskLevelPerformance,
|
256 |
+
models,
|
257 |
+
metricsData,
|
258 |
+
tasks,
|
259 |
+
]);
|
260 |
+
|
261 |
+
// Task definitions
|
262 |
+
const featuredTasks = useMemo(
|
263 |
+
() => [
|
264 |
+
/* ... (keep task definitions array) ... */ {
|
265 |
+
id: "Generating a Creative Idea",
|
266 |
+
title: "Generating Creative Ideas",
|
267 |
+
description: "Brainstorming unique birthday gift ideas.",
|
268 |
+
icon: (color) => (
|
269 |
+
<svg
|
270 |
+
style={{ color: color || "#6b7280" }}
|
271 |
+
className="h-8 w-8"
|
272 |
+
fill="none"
|
273 |
+
viewBox="0 0 24 24"
|
274 |
+
stroke="currentColor"
|
275 |
+
>
|
276 |
+
<path
|
277 |
+
strokeLinecap="round"
|
278 |
+
strokeLinejoin="round"
|
279 |
+
strokeWidth={2}
|
280 |
+
d="M9.663 17h4.673M12 3v1m6.364 1.636l-.707.707M21 12h-1M4 12H3m3.343-5.657l-.707-.707m2.828 9.9a5 5 0 117.072 0l-.548.547A3.374 3.374 0 0014 18.469V19a2 2 0 11-4 0v-.531c0-.895-.356-1.754-.988-2.386l-.548-.547z"
|
281 |
+
/>
|
282 |
+
</svg>
|
283 |
+
),
|
284 |
+
},
|
285 |
+
{
|
286 |
+
id: "Creating a Travel Itinerary",
|
287 |
+
title: "Creating Travel Itinerary",
|
288 |
+
description: "Planning a European city break.",
|
289 |
+
icon: (color) => (
|
290 |
+
<svg
|
291 |
+
style={{ color: color || "#6b7280" }}
|
292 |
+
className="h-8 w-8"
|
293 |
+
fill="none"
|
294 |
+
viewBox="0 0 24 24"
|
295 |
+
stroke="currentColor"
|
296 |
+
>
|
297 |
+
<path
|
298 |
+
strokeLinecap="round"
|
299 |
+
strokeLinejoin="round"
|
300 |
+
strokeWidth={2}
|
301 |
+
d="M17.657 16.657L13.414 20.9a1.998 1.998 0 01-2.827 0l-4.244-4.243a8 8 0 1111.314 0z"
|
302 |
+
/>
|
303 |
+
<path
|
304 |
+
strokeLinecap="round"
|
305 |
+
strokeLinejoin="round"
|
306 |
+
strokeWidth={2}
|
307 |
+
d="M15 11a3 3 0 11-6 0 3 3 0 016 0z"
|
308 |
+
/>
|
309 |
+
</svg>
|
310 |
+
),
|
311 |
+
},
|
312 |
+
{
|
313 |
+
id: "Following Up on a Job Application",
|
314 |
+
title: "Following Up on Job App",
|
315 |
+
description: "Drafting a professional follow-up email.",
|
316 |
+
icon: (color) => (
|
317 |
+
<svg
|
318 |
+
style={{ color: color || "#6b7280" }}
|
319 |
+
className="h-8 w-8"
|
320 |
+
fill="none"
|
321 |
+
viewBox="0 0 24 24"
|
322 |
+
stroke="currentColor"
|
323 |
+
>
|
324 |
+
<path
|
325 |
+
strokeLinecap="round"
|
326 |
+
strokeLinejoin="round"
|
327 |
+
strokeWidth={2}
|
328 |
+
d="M3 8l7.89 5.26a2 2 0 002.22 0L21 8M5 19h14a2 2 0 002-2V7a2 2 0 00-2-2H5a2 2 0 00-2 2v10a2 2 0 002 2z"
|
329 |
+
/>
|
330 |
+
</svg>
|
331 |
+
),
|
332 |
+
},
|
333 |
+
{
|
334 |
+
id: "Planning Your Weekly Meals",
|
335 |
+
title: "Planning Weekly Meals",
|
336 |
+
description: "Creating a meal plan accommodating dietary restrictions.",
|
337 |
+
icon: (color) => (
|
338 |
+
<svg
|
339 |
+
style={{ color: color || "#6b7280" }}
|
340 |
+
className="h-8 w-8"
|
341 |
+
fill="none"
|
342 |
+
viewBox="0 0 24 24"
|
343 |
+
stroke="currentColor"
|
344 |
+
>
|
345 |
+
<path
|
346 |
+
strokeLinecap="round"
|
347 |
+
strokeLinejoin="round"
|
348 |
+
strokeWidth={2}
|
349 |
+
d="M9 5H7a2 2 0 00-2 2v12a2 2 0 002 2h10a2 2 0 002-2V7a2 2 0 00-2-2h-2M9 5a2 2 0 002 2h2a2 2 0 002-2M9 5a2 2 0 012-2h2a2 2 0 012 2"
|
350 |
+
/>
|
351 |
+
</svg>
|
352 |
+
),
|
353 |
+
},
|
354 |
+
{
|
355 |
+
id: "Making a Decision Between Options",
|
356 |
+
title: "Making a Decision",
|
357 |
+
description: "Comparing tech products for purchase.",
|
358 |
+
icon: (color) => (
|
359 |
+
<svg
|
360 |
+
style={{ color: color || "#6b7280" }}
|
361 |
+
className="h-8 w-8"
|
362 |
+
fill="none"
|
363 |
+
viewBox="0 0 24 24"
|
364 |
+
stroke="currentColor"
|
365 |
+
strokeWidth={2}
|
366 |
+
>
|
367 |
+
<path
|
368 |
+
strokeLinecap="round"
|
369 |
+
strokeLinejoin="round"
|
370 |
+
d="M14 5l7 7m0 0l-7 7m7-7H3"
|
371 |
+
/>{" "}
|
372 |
+
<path
|
373 |
+
strokeLinecap="round"
|
374 |
+
strokeLinejoin="round"
|
375 |
+
d="M10 19l-7-7m0 0l7-7m-7 7h17"
|
376 |
+
/>
|
377 |
+
</svg>
|
378 |
+
),
|
379 |
+
},
|
380 |
+
{
|
381 |
+
id: "Understanding a Complex Topic",
|
382 |
+
title: "Understanding a Complex Topic",
|
383 |
+
description: "Learning about day trading concepts.",
|
384 |
+
icon: (color) => (
|
385 |
+
<svg
|
386 |
+
style={{ color: color || "#6b7280" }}
|
387 |
+
className="h-8 w-8"
|
388 |
+
fill="none"
|
389 |
+
viewBox="0 0 24 24"
|
390 |
+
stroke="currentColor"
|
391 |
+
>
|
392 |
+
<path
|
393 |
+
strokeLinecap="round"
|
394 |
+
strokeLinejoin="round"
|
395 |
+
strokeWidth={2}
|
396 |
+
d="M12 6.253v13m0-13C10.832 5.477 9.246 5 7.5 5S4.168 5.477 3 6.253v13C4.168 18.477 5.754 18 7.5 18s3.332.477 4.5 1.253m0-13C13.168 5.477 14.754 5 16.5 5c1.747 0 3.332.477 4.5 1.253v13C19.832 18.477 18.247 18 16.5 18c-1.746 0-3.332.477-4.5 1.253"
|
397 |
+
/>
|
398 |
+
</svg>
|
399 |
+
),
|
400 |
+
},
|
401 |
+
],
|
402 |
+
[]
|
403 |
+
);
|
404 |
+
const tasksToDisplay = useMemo(() => {
|
405 |
+
const availableTaskKeys = bestModelPerTask
|
406 |
+
? Object.keys(bestModelPerTask)
|
407 |
+
: [];
|
408 |
+
return featuredTasks.filter((ft) => availableTaskKeys.includes(ft.id));
|
409 |
+
}, [bestModelPerTask, featuredTasks]);
|
410 |
+
const taskRankings = useMemo(() => {
|
411 |
+
const rankings = {};
|
412 |
+
tasksToDisplay.forEach((task) => {
|
413 |
+
const taskId = task.id;
|
414 |
+
if (!taskLevelPerformance[taskId]) {
|
415 |
+
rankings[taskId] = [];
|
416 |
+
return;
|
417 |
+
}
|
418 |
+
const taskScores = models
|
419 |
+
.map((modelMeta) => {
|
420 |
+
const modelData = taskLevelPerformance[taskId][modelMeta.model];
|
421 |
+
if (!modelData) return null;
|
422 |
+
const scores = Object.values(modelData)
|
423 |
+
.map((s) => parseFloat(s))
|
424 |
+
.filter((s) => !isNaN(s));
|
425 |
+
if (scores.length === 0) return null;
|
426 |
+
const avgScore =
|
427 |
+
scores.reduce((sum, score) => sum + score, 0) / scores.length;
|
428 |
+
return {
|
429 |
+
model: modelMeta.model,
|
430 |
+
taskAvgScore: avgScore,
|
431 |
+
color: modelMeta.color || "#999999",
|
432 |
+
};
|
433 |
+
})
|
434 |
+
.filter((item) => item !== null)
|
435 |
+
.sort((a, b) => b.taskAvgScore - a.taskAvgScore);
|
436 |
+
rankings[taskId] = taskScores;
|
437 |
+
});
|
438 |
+
return rankings;
|
439 |
+
}, [tasksToDisplay, taskLevelPerformance, models]);
|
440 |
+
|
441 |
+
const renderTopPerformersTab = () => (
|
442 |
+
<div className="mb-6">
|
443 |
+
<div className="grid grid-cols-1 md:grid-cols-2 lg:grid-cols-3 gap-6">
|
444 |
+
{tasksToDisplay.length === 0 && (
|
445 |
+
<p className="col-span-full text-center text-gray-500 py-8">
|
446 |
+
No task performance data available.
|
447 |
+
</p>
|
448 |
+
)}
|
449 |
+
{tasksToDisplay.map((task) => {
|
450 |
+
const bestModelInfo = bestModelPerTask?.[task.id];
|
451 |
+
const topModelsForTask = taskRankings[task.id] || [];
|
452 |
+
if (!bestModelInfo || bestModelInfo.model === "N/A") return null;
|
453 |
+
const modelColor = bestModelInfo.color || "#6b7280";
|
454 |
+
return (
|
455 |
+
<div
|
456 |
+
key={task.id}
|
457 |
+
className="border rounded-lg overflow-hidden shadow-sm bg-white flex flex-col"
|
458 |
+
>
|
459 |
+
<div className="px-4 py-2 bg-gray-50 border-b flex items-center flex-shrink-0">
|
460 |
+
<h3
|
461 |
+
className="font-semibold text-sm flex-grow truncate pr-2"
|
462 |
+
title={task.title}
|
463 |
+
>
|
464 |
+
{task.title}
|
465 |
+
</h3>
|
466 |
+
<div
|
467 |
+
className="ml-1 w-2 h-2 rounded-full flex-shrink-0"
|
468 |
+
style={{ backgroundColor: modelColor }}
|
469 |
+
aria-hidden="true"
|
470 |
+
></div>
|
471 |
+
</div>
|
472 |
+
<div className="p-4 flex-grow flex flex-col">
|
473 |
+
<div className="flex items-center mb-4 flex-shrink-0">
|
474 |
+
<div
|
475 |
+
className="p-2 rounded-full flex-shrink-0"
|
476 |
+
style={{ backgroundColor: `${modelColor}20` }}
|
477 |
+
>
|
478 |
+
{task.icon(modelColor)}
|
479 |
+
</div>
|
480 |
+
<div className="ml-4 overflow-hidden">
|
481 |
+
<h4
|
482 |
+
className="text-lg font-semibold truncate"
|
483 |
+
title={bestModelInfo.model}
|
484 |
+
>
|
485 |
+
{bestModelInfo.model}
|
486 |
+
</h4>
|
487 |
+
<p className="text-sm text-gray-600">
|
488 |
+
Avg. Score: {bestModelInfo.score?.toFixed(1) ?? "N/A"}
|
489 |
+
</p>
|
490 |
+
</div>
|
491 |
+
</div>
|
492 |
+
<div className="mb-4 flex-grow">
|
493 |
+
<h5 className="text-sm font-semibold mb-2">Task Ranking</h5>
|
494 |
+
{topModelsForTask.length > 0 ? (
|
495 |
+
<ol className="space-y-1.5 list-none pl-0">
|
496 |
+
{topModelsForTask.map((rankedModel, index) => (
|
497 |
+
<li
|
498 |
+
key={rankedModel.model}
|
499 |
+
className="text-sm flex items-center justify-between"
|
500 |
+
>
|
501 |
+
<div className="flex items-center truncate mr-2">
|
502 |
+
<span className="font-medium w-4 mr-1.5 text-gray-500">
|
503 |
+
{index + 1}.
|
504 |
+
</span>
|
505 |
+
<div
|
506 |
+
className="w-2.5 h-2.5 rounded-full mr-1.5 flex-shrink-0"
|
507 |
+
style={{ backgroundColor: rankedModel.color }}
|
508 |
+
></div>
|
509 |
+
<span
|
510 |
+
className="truncate"
|
511 |
+
title={rankedModel.model}
|
512 |
+
>
|
513 |
+
{rankedModel.model}
|
514 |
+
</span>
|
515 |
+
</div>
|
516 |
+
<span
|
517 |
+
className={`font-medium flex-shrink-0 px-1.5 py-0.5 text-xs rounded ${getScoreBadgeColor(
|
518 |
+
rankedModel.taskAvgScore
|
519 |
+
)}`}
|
520 |
+
>
|
521 |
+
{rankedModel.taskAvgScore?.toFixed(1) ?? "N/A"}
|
522 |
+
</span>
|
523 |
+
</li>
|
524 |
+
))}
|
525 |
+
</ol>
|
526 |
+
) : (
|
527 |
+
<p className="text-xs text-gray-500 italic">
|
528 |
+
Ranking data not available.
|
529 |
+
</p>
|
530 |
+
)}
|
531 |
+
</div>
|
532 |
+
<p className="text-xs text-gray-600 mt-auto pt-2 flex-shrink-0">
|
533 |
+
Task Example: {task.description}
|
534 |
+
</p>
|
535 |
+
</div>
|
536 |
+
</div>
|
537 |
+
);
|
538 |
+
})}
|
539 |
+
</div>
|
540 |
+
</div>
|
541 |
+
);
|
542 |
+
|
543 |
+
// Render the model performance analysis tab - *** UPDATED SELECTOR & LABELS ***
|
544 |
+
const renderModelPerformanceTab = () => (
|
545 |
+
<div>
|
546 |
+
{/* Controls Panel */}
|
547 |
+
<div className="border rounded-lg overflow-hidden mb-6 shadow-sm">
|
548 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
549 |
+
<h3 className="font-semibold text-gray-800">
|
550 |
+
Task Analysis Controls
|
551 |
+
</h3>
|
552 |
+
</div>
|
553 |
+
<div className="p-4 flex flex-wrap items-center gap-4">
|
554 |
+
{/* Task Selector */}
|
555 |
+
<div className="w-full sm:w-auto">
|
556 |
+
<label
|
557 |
+
htmlFor="taskSelect"
|
558 |
+
className="block text-sm font-medium text-gray-700 mb-1"
|
559 |
+
>
|
560 |
+
Task
|
561 |
+
</label>
|
562 |
+
<select
|
563 |
+
id="taskSelect"
|
564 |
+
className="w-full sm:w-64 border rounded-md px-3 py-2 bg-white shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
|
565 |
+
value={selectedTask}
|
566 |
+
onChange={(e) => setSelectedTask(e.target.value)}
|
567 |
+
>
|
568 |
+
<option value="all">All Tasks (Average)</option>
|
569 |
+
{tasks.sort().map((task) => (
|
570 |
+
<option key={task} value={task}>
|
571 |
+
{task}
|
572 |
+
</option>
|
573 |
+
))}
|
574 |
+
</select>
|
575 |
+
</div>
|
576 |
+
{/* Metric Type Selector Pills */}
|
577 |
+
<div className="flex flex-col">
|
578 |
+
<label className="block text-sm font-medium text-gray-700 mb-1">
|
579 |
+
Metric Type
|
580 |
+
</label>
|
581 |
+
<div className="flex space-x-1 p-1 bg-gray-200 rounded-lg">
|
582 |
+
<TabButton
|
583 |
+
active={selectedMetricType === "high"}
|
584 |
+
onClick={() => setSelectedMetricType("high")}
|
585 |
+
>
|
586 |
+
High-Level
|
587 |
+
</TabButton>
|
588 |
+
<TabButton
|
589 |
+
active={selectedMetricType === "low"}
|
590 |
+
onClick={() => setSelectedMetricType("low")}
|
591 |
+
>
|
592 |
+
Low-Level
|
593 |
+
</TabButton>
|
594 |
+
</div>
|
595 |
+
</div>
|
596 |
+
{/* Metric Selector - VALUE is Title Case key, displays Title Case */}
|
597 |
+
<div className="w-full sm:w-auto">
|
598 |
+
<label
|
599 |
+
htmlFor="metricSelect"
|
600 |
+
className="block text-sm font-medium text-gray-700 mb-1"
|
601 |
+
>
|
602 |
+
{selectedMetricType === "high"
|
603 |
+
? "High-Level Metric"
|
604 |
+
: "Low-Level Metric"}
|
605 |
+
</label>
|
606 |
+
<select
|
607 |
+
id="metricSelect"
|
608 |
+
className="w-full sm:w-48 border rounded-md px-3 py-2 bg-white shadow-sm focus:outline-none focus:ring-2 focus:ring-blue-500"
|
609 |
+
value={selectedMetricDisplayKey} // VALUE is the Title Case key
|
610 |
+
onChange={(e) => setSelectedMetricDisplayKey(e.target.value)} // Store Title Case key
|
611 |
+
disabled={currentMetricDisplayKeysList.length === 0}
|
612 |
+
>
|
613 |
+
{currentMetricDisplayKeysList.length === 0 && (
|
614 |
+
<option value="">No metrics</option>
|
615 |
+
)}
|
616 |
+
{/* Iterate through Title Case keys, display Title Case */}
|
617 |
+
{currentMetricDisplayKeysList.map((displayKey) => (
|
618 |
+
<option key={displayKey} value={displayKey}>
|
619 |
+
{displayKey}
|
620 |
+
</option>
|
621 |
+
))}
|
622 |
+
</select>
|
623 |
+
</div>
|
624 |
+
</div>
|
625 |
+
</div>
|
626 |
+
|
627 |
+
{/* Chart Visualization */}
|
628 |
+
<div className="border rounded-lg overflow-hidden mb-6 shadow-sm">
|
629 |
+
{/* Use selectedMetricDisplayKey for title */}
|
630 |
+
<div className="px-4 py-3 bg-gray-50 border-b">
|
631 |
+
<h3 className="font-semibold text-gray-800">
|
632 |
+
{`${selectedMetricDisplayKey || "Selected Metric"} Comparison for `}
|
633 |
+
<span className="font-normal">
|
634 |
+
{selectedTask === "all"
|
635 |
+
? "All Tasks (Average)"
|
636 |
+
: `"${selectedTask}"`}
|
637 |
+
</span>
|
638 |
+
</h3>
|
639 |
+
</div>
|
640 |
+
<div className="p-4">
|
641 |
+
{chartData.length > 0 ? (
|
642 |
+
<div className="h-80">
|
643 |
+
<ResponsiveContainer width="100%" height="100%">
|
644 |
+
<BarChart
|
645 |
+
data={chartData}
|
646 |
+
margin={{ top: 5, right: 5, left: 0, bottom: 5 }}
|
647 |
+
barCategoryGap="20%"
|
648 |
+
>
|
649 |
+
<CartesianGrid strokeDasharray="3 3" vertical={false} />
|
650 |
+
<XAxis dataKey="model" hide />
|
651 |
+
<YAxis domain={[0, 100]} width={30} tick={{ fontSize: 11 }} />
|
652 |
+
<RechartsTooltip
|
653 |
+
content={<CustomTooltip />}
|
654 |
+
wrapperStyle={{ zIndex: 10 }}
|
655 |
+
/>
|
656 |
+
{/* Use Title Case key for Bar name */}
|
657 |
+
<Bar
|
658 |
+
dataKey="score"
|
659 |
+
name={selectedMetricDisplayKey || "Score"}
|
660 |
+
radius={[4, 4, 0, 0]}
|
661 |
+
>
|
662 |
+
{chartData.map((entry, index) => (
|
663 |
+
<Cell key={`cell-${index}`} fill={entry.color} />
|
664 |
+
))}
|
665 |
+
</Bar>
|
666 |
+
</BarChart>
|
667 |
+
</ResponsiveContainer>
|
668 |
+
<div className="flex flex-wrap justify-center gap-x-4 gap-y-1 mt-4 text-xs">
|
669 |
+
{chartData.map((entry) => (
|
670 |
+
<div key={entry.model} className="flex items-center">
|
671 |
+
<div
|
672 |
+
className="w-2.5 h-2.5 rounded-full mr-1.5"
|
673 |
+
style={{ backgroundColor: entry.color }}
|
674 |
+
></div>
|
675 |
+
<span>{entry.model}</span>
|
676 |
+
</div>
|
677 |
+
))}
|
678 |
+
</div>
|
679 |
+
</div>
|
680 |
+
) : (
|
681 |
+
<div className="flex items-center justify-center h-60 bg-gray-50 rounded">
|
682 |
+
<div className="text-center p-4">
|
683 |
+
<svg
|
684 |
+
xmlns="http://www.w3.org/2000/svg"
|
685 |
+
className="h-10 w-10 mx-auto text-gray-400 mb-3"
|
686 |
+
fill="none"
|
687 |
+
viewBox="0 0 24 24"
|
688 |
+
stroke="currentColor"
|
689 |
+
>
|
690 |
+
<path
|
691 |
+
strokeLinecap="round"
|
692 |
+
strokeLinejoin="round"
|
693 |
+
strokeWidth={2}
|
694 |
+
d="M9 17v-2m3 2v-4m3 4v-6m2 10H7a2 2 0 01-2-2V7a2 2 0 012-2h2l2-3h6l2 3h2a2 2 0 012 2v10a2 2 0 01-2 2h-1"
|
695 |
+
/>
|
696 |
+
</svg>
|
697 |
+
<h3 className="text-lg font-medium text-gray-900 mb-1">
|
698 |
+
No Data Available
|
699 |
+
</h3>
|
700 |
+
<p className="text-sm text-gray-600">
|
701 |
+
No data available for the selected task, metric, and models.
|
702 |
+
</p>
|
703 |
+
</div>
|
704 |
+
</div>
|
705 |
+
)}
|
706 |
+
<div className="mt-15 text-xs text-gray-500">
|
707 |
+
{/* Corrected margin-top */}
|
708 |
+
{/* Use Title Case key for display and lookup */}
|
709 |
+
<p>
|
710 |
+
This chart shows{" "}
|
711 |
+
<strong>
|
712 |
+
{selectedMetricDisplayKey || "the selected metric"}
|
713 |
+
</strong>{" "}
|
714 |
+
scores (0-100, higher is better) for models on
|
715 |
+
{selectedTask === "all"
|
716 |
+
? "average across all tasks"
|
717 |
+
: `the "${selectedTask}" task`}
|
718 |
+
.
|
719 |
+
{selectedMetricDisplayKey &&
|
720 |
+
` Metric definition: ${getMetricTooltip(
|
721 |
+
selectedMetricDisplayKey
|
722 |
+
)}`}
|
723 |
+
</p>
|
724 |
+
</div>
|
725 |
+
</div>
|
726 |
+
</div>
|
727 |
+
</div>
|
728 |
+
);
|
729 |
+
|
730 |
+
// Main return with tabs
|
731 |
+
return (
|
732 |
+
<div>
|
733 |
+
<div className="mb-6 flex flex-col md:flex-row justify-between items-center gap-4">
|
734 |
+
<div className="flex space-x-1 p-1 bg-gray-200 rounded-lg">
|
735 |
+
<TabButton
|
736 |
+
active={activeTab === "top-performers"}
|
737 |
+
onClick={() => setActiveTab("top-performers")}
|
738 |
+
>
|
739 |
+
Top Performing Models by Task
|
740 |
+
</TabButton>{" "}
|
741 |
+
<TabButton
|
742 |
+
active={activeTab === "model-performance"}
|
743 |
+
onClick={() => setActiveTab("model-performance")}
|
744 |
+
>
|
745 |
+
Model Performance Comparison
|
746 |
+
</TabButton>{" "}
|
747 |
+
</div>{" "}
|
748 |
+
</div>
|
749 |
+
{activeTab === "top-performers"
|
750 |
+
? renderTopPerformersTab()
|
751 |
+
: renderModelPerformanceTab()}
|
752 |
+
</div>
|
753 |
+
);
|
754 |
+
};
|
755 |
+
|
756 |
+
export default TaskPerformance;
|
leaderboard-app/components/Tooltip.jsx
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"use client";
|
2 |
+
|
3 |
+
import React, { useState, useRef, useEffect } from "react";
|
4 |
+
|
5 |
+
export const Tooltip = ({
|
6 |
+
content,
|
7 |
+
children,
|
8 |
+
position = "top",
|
9 |
+
showIcon = true,
|
10 |
+
iconClassName = "",
|
11 |
+
}) => {
|
12 |
+
const [isVisible, setIsVisible] = useState(false);
|
13 |
+
const [tooltipStyle, setTooltipStyle] = useState({});
|
14 |
+
const tooltipRef = useRef(null);
|
15 |
+
const iconRef = useRef(null);
|
16 |
+
|
17 |
+
const showTooltip = () => setIsVisible(true);
|
18 |
+
const hideTooltip = () => setIsVisible(false);
|
19 |
+
|
20 |
+
// Position the tooltip when it becomes visible
|
21 |
+
useEffect(() => {
|
22 |
+
if (isVisible && iconRef.current && tooltipRef.current) {
|
23 |
+
const triggerRect = iconRef.current.getBoundingClientRect();
|
24 |
+
const tooltipRect = tooltipRef.current.getBoundingClientRect();
|
25 |
+
const spacing = 8; // Space between trigger and tooltip
|
26 |
+
|
27 |
+
let style = {};
|
28 |
+
|
29 |
+
switch (position) {
|
30 |
+
case "top":
|
31 |
+
style = {
|
32 |
+
left:
|
33 |
+
triggerRect.left + triggerRect.width / 2 - tooltipRect.width / 2,
|
34 |
+
top: triggerRect.top - tooltipRect.height - spacing,
|
35 |
+
};
|
36 |
+
break;
|
37 |
+
case "bottom":
|
38 |
+
style = {
|
39 |
+
left:
|
40 |
+
triggerRect.left + triggerRect.width / 2 - tooltipRect.width / 2,
|
41 |
+
top: triggerRect.bottom + spacing,
|
42 |
+
};
|
43 |
+
break;
|
44 |
+
case "left":
|
45 |
+
style = {
|
46 |
+
left: triggerRect.left - tooltipRect.width - spacing,
|
47 |
+
top:
|
48 |
+
triggerRect.top + triggerRect.height / 2 - tooltipRect.height / 2,
|
49 |
+
};
|
50 |
+
break;
|
51 |
+
case "right":
|
52 |
+
style = {
|
53 |
+
left: triggerRect.right + spacing,
|
54 |
+
top:
|
55 |
+
triggerRect.top + triggerRect.height / 2 - tooltipRect.height / 2,
|
56 |
+
};
|
57 |
+
break;
|
58 |
+
}
|
59 |
+
|
60 |
+
// Adjust if tooltip would go off-screen
|
61 |
+
const viewportWidth = window.innerWidth;
|
62 |
+
const viewportHeight = window.innerHeight;
|
63 |
+
|
64 |
+
if (style.left < 10) style.left = 10;
|
65 |
+
if (style.left + tooltipRect.width > viewportWidth - 10) {
|
66 |
+
style.left = viewportWidth - tooltipRect.width - 10;
|
67 |
+
}
|
68 |
+
|
69 |
+
if (style.top < 10) style.top = 10;
|
70 |
+
if (style.top + tooltipRect.height > viewportHeight - 10) {
|
71 |
+
style.top = viewportHeight - tooltipRect.height - 10;
|
72 |
+
}
|
73 |
+
|
74 |
+
// Convert to fixed position
|
75 |
+
style.position = "fixed";
|
76 |
+
style.left = `${style.left}px`;
|
77 |
+
style.top = `${style.top}px`;
|
78 |
+
|
79 |
+
setTooltipStyle(style);
|
80 |
+
}
|
81 |
+
}, [isVisible, position]);
|
82 |
+
|
83 |
+
return (
|
84 |
+
<div className="inline-flex items-center relative">
|
85 |
+
{children}
|
86 |
+
|
87 |
+
{showIcon && (
|
88 |
+
<div
|
89 |
+
ref={iconRef}
|
90 |
+
className={`inline-flex items-center justify-center ml-1 cursor-help ${iconClassName}`}
|
91 |
+
onMouseEnter={showTooltip}
|
92 |
+
onMouseLeave={hideTooltip}
|
93 |
+
>
|
94 |
+
<svg
|
95 |
+
xmlns="http://www.w3.org/2000/svg"
|
96 |
+
className="h-4 w-4 text-gray-400 hover:text-gray-500"
|
97 |
+
fill="none"
|
98 |
+
viewBox="0 0 24 24"
|
99 |
+
stroke="currentColor"
|
100 |
+
>
|
101 |
+
<path
|
102 |
+
strokeLinecap="round"
|
103 |
+
strokeLinejoin="round"
|
104 |
+
strokeWidth={2}
|
105 |
+
d="M13 16h-1v-4h-1m1-4h.01M21 12a9 9 0 11-18 0 9 9 0 0118 0z"
|
106 |
+
/>
|
107 |
+
</svg>
|
108 |
+
</div>
|
109 |
+
)}
|
110 |
+
|
111 |
+
{isVisible && (
|
112 |
+
<div
|
113 |
+
ref={tooltipRef}
|
114 |
+
className="z-50 bg-gray-800 text-white text-xs rounded py-1 px-2 max-w-xs shadow-lg pointer-events-none"
|
115 |
+
style={{
|
116 |
+
...tooltipStyle,
|
117 |
+
}}
|
118 |
+
>
|
119 |
+
{content}
|
120 |
+
<div
|
121 |
+
className={`absolute w-2 h-2 bg-gray-800 transform rotate-45 ${
|
122 |
+
position === "top"
|
123 |
+
? "bottom-0 translate-y-1/2"
|
124 |
+
: position === "bottom"
|
125 |
+
? "top-0 -translate-y-1/2"
|
126 |
+
: position === "left"
|
127 |
+
? "right-0 translate-x-1/2"
|
128 |
+
: "left-0 -translate-x-1/2"
|
129 |
+
}`}
|
130 |
+
style={{
|
131 |
+
left:
|
132 |
+
position === "top" || position === "bottom"
|
133 |
+
? "calc(50% - 4px)"
|
134 |
+
: "",
|
135 |
+
top:
|
136 |
+
position === "left" || position === "right"
|
137 |
+
? "calc(50% - 4px)"
|
138 |
+
: "",
|
139 |
+
}}
|
140 |
+
/>
|
141 |
+
</div>
|
142 |
+
)}
|
143 |
+
</div>
|
144 |
+
);
|
145 |
+
};
|
leaderboard-app/eslint.config.mjs
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import { dirname } from "path";
|
2 |
+
import { fileURLToPath } from "url";
|
3 |
+
import { FlatCompat } from "@eslint/eslintrc";
|
4 |
+
|
5 |
+
// ES modules have no built-in __dirname, so derive this file's directory
// from its module URL.
const configDirectory = dirname(fileURLToPath(import.meta.url));

// Compatibility helper, resolved relative to this config file's directory.
const legacyCompat = new FlatCompat({
  baseDirectory: configDirectory,
});

// The exported config is exactly what the "next/core-web-vitals" preset yields.
const nextCoreWebVitals = legacyCompat.extends("next/core-web-vitals");
const eslintConfig = [...nextCoreWebVitals];

export default eslintConfig;
|
leaderboard-app/jsconfig.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"compilerOptions": {
|
3 |
+
"paths": {
|
4 |
+
"@/*": ["./*"]
|
5 |
+
}
|
6 |
+
}
|
7 |
+
}
|
leaderboard-app/lib/utils.js
ADDED
@@ -0,0 +1,708 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
// lib/utils.js
|
2 |
+
|
3 |
+
/**
 * Chart/badge colour for each known model, keyed by model identifier.
 * Callers in this file fall back to "#999999" for models not listed here.
 * NOTE(review): the hex values look like the Okabe-Ito colour-blind-safe
 * palette - confirm before adding new entries.
 */
const MODEL_COLORS = {
  "gpt-4o": "#0072B2", // Strong blue
  "claude-3.7-sonnet": "#D55E00", // Vermillion/orange-red
  "deepseek-r1": "#F0E442", // Yellow
  "o1": "#CC79A7", // Pink
  "gemini-2.0-flash-001": "#009E73", // Bluish green
  "llama-3.1-405b-instruct": "#56B4E9", // Light blue
};
|
14 |
+
|
15 |
+
// --- Helper Functions ---
|
16 |
+
|
17 |
+
/**
 * Converts camelCase (or PascalCase) to space-separated Title Case,
 * e.g. "repeatUsage" -> "Repeat Usage".
 * @param {string} str Input string.
 * @returns {string} Title Case string; falsy input is returned unchanged.
 */
export const camelToTitle = (str) => {
  if (!str) return str;
  // Trim BEFORE capitalising: a leading capital ("HelloWorld") puts a space at
  // index 0, which previously ended up as a leading space in the result.
  const spaced = str.replace(/([A-Z])/g, " $1").trim();
  return spaced.charAt(0).toUpperCase() + spaced.slice(1);
};
|
27 |
+
|
28 |
+
/**
 * Formats a snake_case or kebab-case metric/factor key as a Title Case label,
 * e.g. "tone_and_language_style" -> "Tone And Language Style".
 * Existing capital letters are left as they are (nothing is lower-cased).
 * @param {string} key Raw key.
 * @returns {string} Display label, or "N/A" for empty / non-string / "N/A" input.
 */
export const formatDisplayKey = (key) => {
  if (!key || typeof key !== "string" || key === "N/A") return "N/A";
  return (
    key
      // Runs of separators collapse to ONE space ("a__b" -> "a b").
      .replace(/[_\-\s]+/g, " ")
      .trim()
      // Capitalise the first letter of each word, but not a letter that
      // follows an apostrophe ("model's" must not become "Model'S").
      .replace(
        /(^|[^\w'’])(\w)/g,
        (match, prefix, letter) => prefix + letter.toUpperCase()
      )
  );
};
|
42 |
+
|
43 |
+
/**
 * Builds the symbol, CSS class and tooltip text for a significance flag.
 * @param {boolean|*} isSignificant `true`, `false`, or anything else (= undetermined).
 * @param {number} pValue p-value; shown with 3 decimals, or "N/A" if not a number.
 * @param {number} [alpha=0.05] Threshold quoted in the tooltip text.
 * @returns {{symbol: string, className: string, tooltip: string}}
 */
export function getSignificanceIndicator(isSignificant, pValue, alpha = 0.05) {
  // Anything that is not strictly a boolean counts as "undetermined".
  if (isSignificant !== true && isSignificant !== false) {
    return {
      symbol: "?",
      className: "text-gray-400",
      tooltip: "Significance Undetermined",
    };
  }

  const hasNumericP = typeof pValue === "number" && !Number.isNaN(pValue);
  const pText = hasNumericP ? pValue.toFixed(3) : "N/A";

  return isSignificant
    ? {
        symbol: "✓",
        className: "text-green-600",
        tooltip: `Statistically Significant (p=${pText} < ${alpha})`,
      }
    : {
        symbol: "✗",
        className: "text-red-600",
        tooltip: `Not Statistically Significant (p=${pText} ≥ ${alpha})`,
      };
}
|
69 |
+
|
70 |
+
/**
 * Determines the icon, colour class and tooltip for an equity-gap status.
 *
 * Precedence (first match wins):
 *   1. `isConcern === true`                       -> red "▲" (equity concern)
 *   2. `isSignificant` is null or undefined       -> gray "?" (undetermined)
 *   3. large effect but `isSignificant === false` -> yellow "●"
 *   4. `isSignificant === true`                   -> green "✓"
 *   5. otherwise                                  -> gray "✓" (not significant)
 *
 * @param {boolean} isConcern Whether the gap is flagged as an equity concern.
 * @param {boolean} isLargeEffect Whether the effect size is large.
 * @param {boolean|null|undefined} isSignificant Significance flag.
 * @param {number} pValue p-value; rendered with 3 decimals or as "p=N/A".
 * @param {string} effectSizeClass Effect size label; "N/A" when falsy.
 * @returns {{icon: string, colorClass: string, tooltip: string}}
 */
export function getEquityIndicatorStyle(
  isConcern,
  isLargeEffect,
  isSignificant,
  pValue,
  effectSizeClass
) {
  const pValueText =
    typeof pValue === "number" && !Number.isNaN(pValue)
      ? `p=${pValue.toFixed(3)}`
      : "p=N/A";
  const effectText = `Effect: ${effectSizeClass || "N/A"}`;

  if (isConcern === true) {
    return {
      icon: "▲",
      colorClass: "text-red-600",
      tooltip: `Equity Concern (${effectText}, Significant, ${pValueText})`,
    };
  }
  // `== null` deliberately matches both null and undefined: a missing flag is
  // "undetermined", not "not significant".
  if (isSignificant == null) {
    return {
      icon: "?",
      colorClass: "text-gray-500",
      tooltip: `Significance Undetermined (${effectText})`,
    };
  }
  if (isLargeEffect === true && isSignificant === false) {
    return {
      icon: "●",
      colorClass: "text-yellow-600",
      tooltip: `Large Effect but Not Statistically Significant (${pValueText})`,
    };
  }
  if (isSignificant === true) {
    return {
      icon: "✓",
      colorClass: "text-green-600",
      tooltip: `Statistically Significant but Not Large Effect (${effectText}, ${pValueText})`,
    };
  }
  return {
    icon: "✓",
    colorClass: "text-gray-400",
    tooltip: `Not Statistically Significant (${effectText}, ${pValueText})`,
  };
}
|
117 |
+
|
118 |
+
/**
 * Picks badge classes (background + text) for a score on a min..max scale.
 *
 * The score's position in the range is turned into a percentage:
 *   >= 80 green, >= 50 blue, >= 20 yellow, otherwise red.
 * When `max < min` the scale is inverted (lower scores rate better).
 * Missing, blank, non-numeric or non-finite scores - and an empty or invalid
 * range - get the neutral gray badge.
 *
 * @param {number|string|null|undefined} score Score to rate.
 * @param {number} [min=0] Scale value that maps to 0%.
 * @param {number} [max=100] Scale value that maps to 100%.
 * @returns {string} Tailwind class string.
 */
export function getScoreBadgeColor(score, min = 0, max = 100) {
  const neutralBadge = "bg-gray-100 text-gray-800";

  // Number("") and Number("  ") are 0, so blank strings must be rejected
  // explicitly or they would be rated as a real score of 0.
  const isBlank =
    score === null ||
    score === undefined ||
    (typeof score === "string" && score.trim() === "");
  const numericScore = isBlank ? Number.NaN : Number(score);
  if (!Number.isFinite(numericScore)) return neutralBadge;

  const range = Math.abs(max - min);
  // `!(range > 0)` also catches a NaN range from non-numeric bounds.
  if (!(range > 0)) return neutralBadge;

  const distance = max > min ? numericScore - min : min - numericScore;
  const percent = (distance / range) * 100;

  if (percent >= 80) return "bg-green-100 text-green-800";
  if (percent >= 50) return "bg-blue-100 text-blue-800";
  if (percent >= 20) return "bg-yellow-100 text-yellow-800";
  return "bg-red-100 text-red-800";
}
|
144 |
+
|
145 |
+
/**
 * Picks a TEXT colour class for a score on a 0-100 scale (higher is better):
 *   >= 80 green (medium weight), >= 60 blue, >= 40 yellow, otherwise red.
 * Missing, blank, non-numeric or non-finite scores get muted gray.
 *
 * @param {number|string|null|undefined} score Score to rate.
 * @returns {string} Tailwind class string.
 */
export function getScoreColor(score) {
  // Number("") is 0, so blank strings are rejected explicitly rather than
  // being coloured as a real score of 0.
  const isBlank =
    score === null ||
    score === undefined ||
    (typeof score === "string" && score.trim() === "");
  const numericScore = isBlank ? Number.NaN : Number(score);
  if (!Number.isFinite(numericScore)) return "text-gray-400";

  if (numericScore >= 80) return "text-green-600 font-medium";
  if (numericScore >= 60) return "text-blue-600";
  if (numericScore >= 40) return "text-yellow-600";
  return "text-red-600";
}
|
163 |
+
|
164 |
+
/**
 * Tooltip text keyed by Title Case label (metric names and table headers).
 * Held in a Map so that lookups can only hit entries listed here; a plain
 * object would also answer for inherited names such as "constructor".
 */
const METRIC_TOOLTIP_TEXT_BY_LABEL = new Map(
  Object.entries({
    // High-level
    Helpfulness:
      "How well the model provides useful assistance that addresses user needs",
    Communication:
      "Quality of clarity, coherence, and appropriateness of writing style",
    Understanding:
      "How well the model comprehends requests and contextual information",
    Adaptiveness:
      "How well the model adjusts to user needs and feedback during conversation",
    Trustworthiness:
      "Transparency, accuracy, and consistency in model responses",
    Personality:
      "Consistency and definition of the model's persona and ethical alignment",
    "Background And Culture":
      "Cultural sensitivity, relevance, and freedom from bias",
    "Repeat Usage":
      "User satisfaction and willingness to use the model again (score 0-100).",

    // Low-level
    Effectiveness: "How effectively the model helps accomplish specific goals",
    Comprehensiveness:
      "How thoroughly the model addresses all aspects of requests",
    Usefulness: "Practicality and relevance of suggestions or solutions",
    "Tone And Language Style":
      "Appropriateness of tone and language for the context",
    "Conversation Flow": "Natural and conversational quality of responses",
    "Detail And Technical Language":
      "Appropriate level of detail and technical language",
    Accuracy: "Accuracy in interpreting user requests",
    "Context Memory": "Ability to maintain conversation context",
    Intuitiveness: "Ability to pick up on implicit aspects of requests",
    Flexibility: "Adapting responses based on user feedback",
    Clarity: "Ability to clarify ambiguities or misunderstandings",
    "Conversation Building": "Building upon previous exchanges in conversation",
    Consistency: "Consistency of responses across similar questions",
    Confidence: "User confidence in accuracy of information",
    Transparency: "Openness about limitations or uncertainties",
    "Personality Consistency":
      "Consistency of personality throughout interactions",
    "Distinct Personality": "How well-defined the model's personality is",
    "Honesty Empathy Fairness": "Alignment with ethical expectations",
    "Ethical Alignment": "Alignment with user culture, viewpoint, or values",
    "Cultural Awareness":
      "Recognition of when cultural perspective is relevant",
    "Bias And Stereotypes": "Freedom from stereotypes and bias in responses",

    // Table headers
    "Overall Score":
      "Average score across high-level categories (0-100). Higher is better.",
    "Overall SD":
      "Standard Deviation (± points) of scores across high-level categories. Lower indicates more consistent performance across capabilities.",
    "Max Equity Gap":
      "Score difference (points) for the demographic gap with the largest statistical effect size for this model. Status icon indicates Equity Concern (▲) and/or Significance (✓/✗/?). Hover for details.",
    "Max Gap Area":
      "The specific Demographic Factor and Category where the 'Max Equity Gap' (largest effect size gap) occurred for this model.",
    "Equity Concerns (%)":
      "Percentage of evaluated demographic gaps flagged as Equity Concerns (Large Effect & Statistically Significant, p<0.05). Lower is better.",
    "User Retention":
      "Model score for the 'Repeat Usage' category (0-100), indicating likelihood of users using the model again.",
  })
);

/**
 * Returns the tooltip text for a metric name or table header.
 *
 * The key is first normalised with `formatDisplayKey` (snake_case /
 * kebab-case -> Title Case; camelCase is NOT split) and looked up; if that
 * misses, the key is tried exactly as given.
 *
 * @param {string} key Metric key or header label.
 * @returns {string} Description, or "No description available" when unknown.
 */
export const getMetricTooltip = (key) =>
  METRIC_TOOLTIP_TEXT_BY_LABEL.get(formatDisplayKey(key)) ??
  METRIC_TOOLTIP_TEXT_BY_LABEL.get(key) ??
  "No description available";
|
236 |
+
|
237 |
+
/**
 * Maps an effect-size label to badge classes (background + text):
 * "Negligible" green, "Small" blue, "Medium" yellow, "Large" red.
 * Any other value, including a missing one or "N/A", gets the gray badge.
 *
 * @param {string} effectSizeClass Effect size label (exact, case-sensitive).
 * @returns {string} Tailwind class string.
 */
export function getEffectSizeBadgeColor(effectSizeClass) {
  if (effectSizeClass === "Negligible") return "bg-green-100 text-green-800";
  if (effectSizeClass === "Small") return "bg-blue-100 text-blue-800";
  if (effectSizeClass === "Medium") return "bg-yellow-100 text-yellow-800";
  if (effectSizeClass === "Large") return "bg-red-100 text-red-800";
  // Covers undefined / null / "" / "N/A" and any unrecognised label.
  return "bg-gray-100 text-gray-800";
}
|
257 |
+
|
258 |
+
/**
 * Summarises task-level performance: the best model and its top metrics per
 * task, and the same per task category (creative / practical / analytical).
 *
 * Reads `rawData.task_level_performance`, used here as
 * `{ [taskName]: { [modelName]: { [metricKey]: score } } }`.
 * Metric values that are null, undefined, blank or non-numeric are ignored
 * (they do not count as 0 in any average).
 *
 * @param {Object} rawData Raw leaderboard data (snake_case keys).
 * @param {Object} taskCategoryMap Maps task name -> category key. Tasks whose
 *   category is not one of the three known keys are ignored.
 * @param {string[]} modelOrder Models to consider; on ties the earlier one wins.
 * @returns {{bestModelPerTask: Object, keyMetricsByTask: Object,
 *   bestModelPerTaskCategory: Object, keyMetricsByTaskCategory: Object}}
 *   Result object with camelCase keys. If the task data is missing, the
 *   empty skeleton is returned (category entries stay `null`).
 */
function processTaskPerformance(rawData, taskCategoryMap, modelOrder) {
  const fallbackColor = "#999999";
  const result = {
    bestModelPerTask: {},
    keyMetricsByTask: {},
    bestModelPerTaskCategory: {
      creative: null,
      practical: null,
      analytical: null,
    },
    keyMetricsByTaskCategory: { creative: [], practical: [], analytical: [] },
  };

  // Access original snake_case key from input
  const taskPerformance = rawData?.task_level_performance;
  if (!taskPerformance || typeof taskPerformance !== "object") {
    console.warn(
      "Task level performance data missing or invalid in processTaskPerformance input."
    );
    return result;
  }

  // Tolerate missing arguments instead of throwing on .forEach / Object.entries.
  const models = Array.isArray(modelOrder) ? modelOrder : [];
  const categoryByTask =
    taskCategoryMap && typeof taskCategoryMap === "object"
      ? taskCategoryMap
      : {};

  // Parses one raw metric value; returns null when there is nothing usable.
  // Number(null) and Number("") are both 0, hence the explicit checks.
  const toScore = (value) => {
    if (value === null || value === undefined) return null;
    if (typeof value === "string" && value.trim() === "") return null;
    const parsed = Number(value);
    return Number.isNaN(parsed) ? null : parsed;
  };

  // Lists a model's metrics as { metricKey, score }, dropping unusable values.
  const numericMetricEntries = (modelMetrics) =>
    Object.entries(modelMetrics)
      .map(([metricKey, rawScore]) => ({ metricKey, score: toScore(rawScore) }))
      .filter((entry) => entry.score !== null);

  // Placeholder entry used when no model has data.
  const notAvailable = (extra = {}) => ({
    model: "N/A",
    score: "N/A",
    color: fallbackColor,
    ...extra,
  });

  const byScoreDescending = (a, b) => b.score - a.score;

  // ---- Per task: best model by average metric score, plus its top 3 metrics
  Object.keys(taskPerformance).forEach((taskName) => {
    const taskData = taskPerformance[taskName];
    if (!taskData) return;

    let best = null;
    models.forEach((modelName) => {
      const modelMetrics = taskData[modelName];
      if (!modelMetrics || typeof modelMetrics !== "object") return;

      const entries = numericMetricEntries(modelMetrics);
      if (entries.length === 0) return;

      const avgScore =
        entries.reduce((sum, entry) => sum + entry.score, 0) / entries.length;
      // Strict ">" keeps the earlier model in modelOrder on ties.
      if (best === null || avgScore > best.avgScore) {
        best = { modelName, avgScore, entries };
      }
    });

    if (best === null) {
      result.bestModelPerTask[taskName] = notAvailable();
      result.keyMetricsByTask[taskName] = [];
      return;
    }

    result.bestModelPerTask[taskName] = {
      model: best.modelName,
      score: best.avgScore,
      color: MODEL_COLORS[best.modelName] || fallbackColor,
    };
    // Keep the original snake_case key and add a display name.
    result.keyMetricsByTask[taskName] = [...best.entries]
      .sort(byScoreDescending)
      .slice(0, 3)
      .map((entry) => ({
        ...entry,
        metricName: formatDisplayKey(entry.metricKey),
      }));
  });

  // ---- Group tasks that have data under the three known categories
  const tasksByCategory = { creative: [], practical: [], analytical: [] };
  Object.entries(categoryByTask).forEach(([task, category]) => {
    // Array.isArray (not truthiness) so inherited names like "constructor"
    // are not mistaken for a category bucket.
    if (Array.isArray(tasksByCategory[category]) && taskPerformance[task]) {
      tasksByCategory[category].push(task);
    }
  });

  // ---- Per category: best model by average over ALL metric scores of ALL
  // tasks in the category, plus that model's top 5 per-metric averages
  Object.entries(tasksByCategory).forEach(([category, tasks]) => {
    const categoryName = `${
      category.charAt(0).toUpperCase() + category.slice(1)
    } Tasks`;

    let best = null;
    models.forEach((modelName) => {
      let totalScore = 0;
      let count = 0;
      const metricTotals = new Map(); // metricKey -> { sum, count }

      tasks.forEach((task) => {
        const modelMetrics = taskPerformance[task]?.[modelName];
        if (!modelMetrics || typeof modelMetrics !== "object") return;

        numericMetricEntries(modelMetrics).forEach(({ metricKey, score }) => {
          totalScore += score;
          count += 1;
          const totals = metricTotals.get(metricKey) ?? { sum: 0, count: 0 };
          totals.sum += score;
          totals.count += 1;
          metricTotals.set(metricKey, totals);
        });
      });

      if (count === 0) return;
      const avg = totalScore / count;
      if (best === null || avg > best.avg) {
        best = { modelName, avg, metricTotals };
      }
    });

    // Also reached when the category has no tasks at all.
    if (best === null) {
      result.bestModelPerTaskCategory[category] = notAvailable({ categoryName });
      result.keyMetricsByTaskCategory[category] = [];
      return;
    }

    result.bestModelPerTaskCategory[category] = {
      model: best.modelName,
      score: Number(best.avg.toFixed(1)),
      color: MODEL_COLORS[best.modelName] || fallbackColor,
      categoryName,
    };
    result.keyMetricsByTaskCategory[category] = [...best.metricTotals.entries()]
      .map(([metricKey, totals]) => ({
        metricKey,
        score: totals.sum / totals.count,
      }))
      .sort(byScoreDescending)
      .slice(0, 5)
      .map((entry) => ({
        metric: formatDisplayKey(entry.metricKey),
        score: entry.score,
        scoreDisplay: entry.score.toFixed(1),
      }));
  });

  return result;
}
|
429 |
+
|
430 |
+
/**
 * Prepares the data from leaderboard_data.json for visualization.
 *
 * Only the top level of the result is reshaped into camelCase. Nested raw
 * structures (task-level performance, MRP demographics, equity analysis,
 * per-category gap info) are passed through untouched, so their inner keys
 * keep whatever casing the input JSON uses.
 *
 * Numeric fields are normalised with a strict parser: `null`, `undefined`,
 * blank strings and non-numeric strings (for example "N/A") all become
 * `null`, so a missing score is never reported as a real `0`.
 *
 * @param {Object} rawDataInput - Parsed leaderboard_data.json. Must carry a
 *   `model_order` array; every other section is optional.
 * @returns {Object} Processed data with the keys `models`, `metricsData`,
 *   `radarData`, `bestPerCategory`, `bestPerMetric`, `overviewCardData`,
 *   `rawData`, `metadata` and `equityAnalysis`. When the input is missing or
 *   has no `model_order` array, an error is logged and the same top-level
 *   keys are returned with empty values.
 */
export function prepareDataForVisualization(rawDataInput) {
  const FALLBACK_COLOR = "#999999";

  // Colour assigned to a model, falling back to neutral grey when unknown.
  const colorFor = (modelName) => MODEL_COLORS[modelName] || FALLBACK_COLOR;

  // Strict numeric parse. Plain Number() maps null and "" to 0, which would
  // turn "no data" into a genuine-looking zero score.
  const toNumberOrNull = (value) => {
    if (value === null || value === undefined) return null;
    if (typeof value === "string" && value.trim() === "") return null;
    const parsed = Number(value);
    return Number.isNaN(parsed) ? null : parsed;
  };

  // Basic validation
  if (!rawDataInput || !Array.isArray(rawDataInput.model_order)) {
    console.error(
      "prepareDataForVisualization received invalid rawData.",
      rawDataInput
    );
    return {
      models: [],
      metricsData: { highLevelCategories: {}, lowLevelMetrics: {} },
      radarData: [],
      bestPerCategory: {},
      bestPerMetric: {},
      overviewCardData: {},
      rawData: {},
      metadata: {},
      equityAnalysis: {},
    };
  }

  // Sections that are handed through with their original structure.
  const modelOrder = rawDataInput.model_order;
  const equityAnalysis = rawDataInput.equity_analysis || {
    all_equity_gaps: [],
    model_max_effect_gaps: {},
    universal_issues: [],
    assessment_method: {},
    demographic_variation_stats: {},
  };
  const metadata = rawDataInput.metadata || {};
  const mrpDemographicsRaw = rawDataInput.mrp_demographics || {};
  const taskLevelPerformanceRaw = rawDataInput.task_level_performance || {};

  // Tally equity gaps per model in a single pass instead of re-filtering the
  // whole gap list for every ranked model.
  const allGaps = Array.isArray(equityAnalysis.all_equity_gaps)
    ? equityAnalysis.all_equity_gaps
    : [];
  const gapTallyByModel = new Map();
  for (const gap of allGaps) {
    if (!gap) continue;
    const tally = gapTallyByModel.get(gap.model) ?? { total: 0, concerns: 0 };
    tally.total += 1;
    if (gap.is_equity_concern === true) tally.concerns += 1;
    gapTallyByModel.set(gap.model, tally);
  }

  // MRP demographics: collect the filter options. The traversal below reads
  // the input as model -> factor -> level -> metric.
  const demographicLevels = {};
  const availableMetrics = new Set();
  if (mrpDemographicsRaw && typeof mrpDemographicsRaw === "object") {
    for (const modelData of Object.values(mrpDemographicsRaw)) {
      for (const [factor, factorData] of Object.entries(modelData || {})) {
        if (!demographicLevels[factor]) demographicLevels[factor] = new Set();
        for (const [level, levelData] of Object.entries(factorData || {})) {
          demographicLevels[factor].add(level);
          for (const metric of Object.keys(levelData || {})) {
            availableMetrics.add(metric);
          }
        }
      }
    }
  }
  const demographicOptions = {};
  for (const [factor, levels] of Object.entries(demographicLevels)) {
    demographicOptions[factor] = Array.from(levels).sort();
  }
  const availableMetricsList = Array.from(availableMetrics).sort();

  // Overall ranking -> camelCase entries, plus the share of each model's
  // equity gaps that are flagged as a concern.
  const overallRanking = Array.isArray(rawDataInput.overall_ranking)
    ? rawDataInput.overall_ranking
    : [];
  const overallRankingProcessed = overallRanking.map((modelData) => {
    const modelName = modelData.model;
    const maxEffectGapDetails = modelData.max_effect_gap_details || {};
    const tally = gapTallyByModel.get(modelName);
    // null (not 0) when the model has no recorded gaps at all.
    const equityConcernPercentage =
      tally && tally.total > 0 ? (tally.concerns / tally.total) * 100 : null;

    return {
      rank: modelData.rank,
      model: modelName,
      overallScore: toNumberOrNull(modelData.overall_score),
      highLevelCatScore: toNumberOrNull(modelData.high_level_cat_score),
      lowLevelCatScore: toNumberOrNull(modelData.low_level_cat_score),
      color: colorFor(modelName),
      // Raw value is kept next to the parsed one so callers can show either.
      stdDevAcrossCats: modelData.std_dev_across_cats,
      stdDevAcrossCatsNumeric: toNumberOrNull(modelData.std_dev_across_cats),
      repeatUsageScore: toNumberOrNull(modelData.repeat_usage_score),
      maxEffectCategory: modelData.max_effect_category,
      maxEffectFactor: maxEffectGapDetails.demographic_factor,
      maxEffectSize: toNumberOrNull(maxEffectGapDetails.effect_size),
      maxEffectGap: toNumberOrNull(maxEffectGapDetails.score_range),
      maxEffectConcernFlag: maxEffectGapDetails.is_equity_concern ?? false,
      maxEffectSignificant: maxEffectGapDetails.is_statistically_significant,
      maxEffectPValue: maxEffectGapDetails.p_value,
      maxEffectSizeClass: maxEffectGapDetails.effect_size_class || "N/A",
      maxEffectRawNHeuristic:
        maxEffectGapDetails.raw_n_confidence_heuristic || "N/A",
      maxEffectGapDetails, // original details object, keys untouched
      equityConcernPercentage,
    };
  });

  // Metrics breakdown: camelCase wrapper around each category/metric, keyed
  // by the display name used in the input.
  const processCategory = (displayKey, categoryData) => {
    if (!categoryData || !categoryData.model_scores) {
      console.warn(`Missing model_scores for category: ${displayKey}`);
      return {
        modelScores: {},
        topPerformer: { model: "N/A", score: null, color: FALLBACK_COLOR },
      };
    }

    // Every model in modelOrder gets an entry, even without scores, so
    // consumers can iterate modelOrder without existence checks.
    const modelScores = {};
    for (const modelName of modelOrder) {
      const scores = categoryData.model_scores[modelName];
      modelScores[modelName] = scores
        ? {
            nationalScore: scores.national_score ?? null,
            color: colorFor(modelName),
            maxEffectGapInfo: scores.max_effect_gap_info || {},
          }
        : {
            nationalScore: null,
            color: colorFor(modelName),
            maxEffectGapInfo: {},
          };
    }

    const topPerf = categoryData.top_performer || {};
    return {
      modelScores,
      topPerformer: {
        model: topPerf.model || "N/A",
        score: toNumberOrNull(topPerf.score), // "N/A" / null / "" -> null
        color: colorFor(topPerf.model),
      },
      internalMetricKey: categoryData._internal_category_name,
    };
  };

  const metricsBreakdownProcessed = {
    highLevelCategories: {},
    lowLevelMetrics: {},
  };
  const metricsBreakdownRaw = rawDataInput.metrics_breakdown;
  if (metricsBreakdownRaw && typeof metricsBreakdownRaw === "object") {
    for (const [displayKey, catData] of Object.entries(
      metricsBreakdownRaw.high_level_categories || {}
    )) {
      metricsBreakdownProcessed.highLevelCategories[displayKey] =
        processCategory(displayKey, catData);
    }
    for (const [displayKey, metricData] of Object.entries(
      metricsBreakdownRaw.low_level_metrics || {}
    )) {
      metricsBreakdownProcessed.lowLevelMetrics[displayKey] = processCategory(
        displayKey,
        metricData
      );
    }
  } else {
    console.warn("rawDataInput.metrics_breakdown is missing or not an object.");
  }

  // Radar chart: one row per high-level category, one numeric column per
  // model. Missing scores are drawn as 0 here because every column of the
  // radar row is filled with a number.
  const radarChartData = Object.entries(
    metricsBreakdownProcessed.highLevelCategories
  ).map(([displayKey, categoryData]) => {
    const radarEntry = { category: displayKey };
    for (const modelName of modelOrder) {
      radarEntry[modelName] =
        Number(categoryData.modelScores[modelName]?.nationalScore) || 0;
    }
    return radarEntry;
  });

  // Top performers, keyed by the same display names as metricsData.
  const bestPerCategory = {};
  for (const [displayKey, catData] of Object.entries(
    metricsBreakdownProcessed.highLevelCategories
  )) {
    bestPerCategory[displayKey] = catData.topPerformer;
  }
  const bestPerMetric = {};
  for (const [displayKey, metricData] of Object.entries(
    metricsBreakdownProcessed.lowLevelMetrics
  )) {
    bestPerMetric[displayKey] = metricData.topPerformer;
  }

  // Task performance
  const taskCategoryMap = {
    "Generating a Creative Idea": "creative",
    "Creating a Travel Itinerary": "creative",
    "Following Up on a Job Application": "practical",
    "Planning Your Weekly Meals": "practical",
    "Making a Decision Between Options": "analytical",
    "Understanding a Complex Topic": "analytical",
  };
  // The helper receives the untouched raw input.
  const taskPerformanceResults = processTaskPerformance(
    rawDataInput,
    taskCategoryMap,
    modelOrder
  );

  const tasks = Object.keys(taskLevelPerformanceRaw || {});
  const presentTasks = new Set(tasks);
  // Every mapped category gets a list; only tasks present in the data are
  // added to it.
  const taskCategories = {};
  for (const [task, category] of Object.entries(taskCategoryMap)) {
    if (!taskCategories[category]) taskCategories[category] = [];
    if (presentTasks.has(task)) taskCategories[category].push(task);
  }

  // Input shape read here: task -> model -> metric.
  const taskMetrics = new Set();
  for (const taskData of Object.values(taskLevelPerformanceRaw || {})) {
    for (const modelData of Object.values(taskData || {})) {
      for (const metric of Object.keys(modelData || {})) {
        taskMetrics.add(metric);
      }
    }
  }
  const taskMetricsSnakeList = Array.from(taskMetrics).sort();
  // Wrap the call so Array#map's index/array arguments are not forwarded.
  const taskMetricsDisplayList = Array.from(taskMetrics)
    .map((metricKey) => formatDisplayKey(metricKey))
    .sort();

  return {
    models: overallRankingProcessed,
    metricsData: metricsBreakdownProcessed,
    radarData: radarChartData,
    bestPerCategory,
    bestPerMetric,
    overviewCardData: taskPerformanceResults,
    rawData: {
      // Original structures, inner keys untouched
      taskLevelPerformance: taskLevelPerformanceRaw,
      mrpDemographics: mrpDemographicsRaw,
      // Derived lists/maps for filtering and display
      demographicOptions,
      availableMetrics: availableMetricsList,
      tasks,
      taskCategories,
      taskMetrics: taskMetricsDisplayList, // formatted names for display
      taskMetricsSnake: taskMetricsSnakeList, // original keys for lookup
      taskCategoryMap,
    },
    metadata,
    equityAnalysis,
  };
}
|
leaderboard-app/next.config.mjs
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
/** @type {import('next').NextConfig} */
|
2 |
+
const nextConfig = {};
|
3 |
+
|
4 |
+
export default nextConfig;
|
leaderboard-app/package-lock.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
leaderboard-app/package.json
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"name": "leaderboard-app",
|
3 |
+
"version": "0.1.0",
|
4 |
+
"private": true,
|
5 |
+
"scripts": {
|
6 |
+
"dev": "next dev",
|
7 |
+
"build": "next build",
|
8 |
+
"start": "next start",
|
9 |
+
"lint": "next lint"
|
10 |
+
},
|
11 |
+
"dependencies": {
|
12 |
+
"lucide-react": "^0.487.0",
|
13 |
+
"next": "15.2.3",
|
14 |
+
"react": "^19.0.0",
|
15 |
+
"react-dom": "^19.0.0",
|
16 |
+
"recharts": "^2.15.1"
|
17 |
+
},
|
18 |
+
"devDependencies": {
|
19 |
+
"@eslint/eslintrc": "^3",
|
20 |
+
"@tailwindcss/postcss": "^4",
|
21 |
+
"eslint": "^9",
|
22 |
+
"eslint-config-next": "15.2.3",
|
23 |
+
"tailwindcss": "^4"
|
24 |
+
}
|
25 |
+
}
|
leaderboard-app/postcss.config.mjs
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
const config = {
|
2 |
+
plugins: ["@tailwindcss/postcss"],
|
3 |
+
};
|
4 |
+
|
5 |
+
export default config;
|
leaderboard-app/public/file.svg
ADDED
|
leaderboard-app/public/globe.svg
ADDED
|
leaderboard-app/public/leaderboard_data.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
leaderboard-app/public/next.svg
ADDED
|
leaderboard-app/public/vercel.svg
ADDED
|
leaderboard-app/public/window.svg
ADDED
|