// Provenance (file-viewer metadata, preserved as a comment so the file parses):
// Author: Nora Petrova — commit 20e666e "Add project to new space" (30.7 kB)
"use client";
import React, { useState } from "react";
import {
ChevronDown,
ChevronUp,
Info,
Book,
Calculator,
BarChart,
UserCheck,
CheckCircle,
MessageCircle,
Brain,
SlidersHorizontal,
Shield,
Smile,
Globe,
} from "lucide-react";
const AboutTab = () => {
// Task list for easier management
const tasksUsed = [
"Following Up on Job Application: Drafting a professional follow-up email",
"Planning Weekly Meals: Creating a meal plan accommodating dietary restrictions",
"Creating Travel Itinerary: Planning a European city break",
"Understanding Complex Topic: Learning about day trading concepts",
"Generating Creative Ideas: Brainstorming unique birthday gift ideas",
"Making Decisions Between Options: Comparing tech products for purchase",
];
// State for collapsible sections
const [openSections, setOpenSections] = useState({
introduction: true,
methodology: true,
metricsCalculation: true,
metricsExplained: true,
});
// State for active metric tab
const [activeMetricTab, setActiveMetricTab] = useState("helpfulness");
// Toggle section visibility
const toggleSection = (section) => {
setOpenSections({
...openSections,
[section]: !openSections[section],
});
};
// Metrics data
const metricsData = [
{
id: "helpfulness",
title: "Helpfulness",
icon: <CheckCircle size={18} />,
color: "bg-green-500",
description:
"Evaluates how well the model provides useful, practical assistance that addresses the user's needs and helps them accomplish their goals.",
metrics: [
{
name: "Effectiveness",
description:
"How effectively did the model help you accomplish your specific goal?",
},
{
name: "Comprehensiveness",
description:
"How comprehensive was the model's response in addressing all aspects of your request?",
},
{
name: "Usefulness",
description:
"How useful were the model's suggestions or solutions for your needs?",
},
],
},
{
id: "communication",
title: "Communication",
icon: <MessageCircle size={18} />,
color: "bg-blue-500",
description:
"Assesses the clarity, coherence, and appropriateness of the model's writing style, including tone and language choices.",
metrics: [
{
name: "Tone and Language Style",
description:
"How well did the model match its tone and language style to the context of your interaction?",
},
{
name: "Conversation Flow",
description:
"How natural and conversational were the model's responses?",
},
{
name: "Detail and Technical Language",
description:
"How appropriate was the level of detail and technical language for your needs?",
},
],
},
{
id: "understanding",
title: "Understanding",
icon: <Brain size={18} />,
color: "bg-purple-500",
description:
"Measures how well the model comprehends the user's requests, including implicit needs and contextual information.",
metrics: [
{
name: "Accuracy",
description:
"How accurately did the model interpret your initial request?",
},
{
name: "Context Memory",
description:
"How well did the model maintain context throughout the conversation?",
},
{
name: "Intuitiveness",
description:
"How well did the model pick up on implicit aspects of your request without requiring explicit explanation?",
},
],
},
{
id: "adaptiveness",
title: "Adaptiveness",
icon: <SlidersHorizontal size={18} />,
color: "bg-amber-500",
description:
"Measures how well the model adjusts to different user needs, contexts, and feedback throughout a conversation.",
metrics: [
{
name: "Flexibility",
description:
"How effectively did the model adjust its responses based on your feedback?",
},
{
name: "Clarity",
description:
"How well did the model clarify ambiguities or misunderstandings?",
},
{
name: "Conversation Building",
description:
"How well did the model build upon previous exchanges in the conversation?",
},
],
},
{
id: "trustworthiness",
title: "Trustworthiness",
icon: <Shield size={18} />,
color: "bg-red-500",
description:
"Evaluates transparency, citations, acknowledgment of limitations, and overall user confidence in the model's responses.",
metrics: [
{
name: "Consistency",
description:
"How consistent were the model's responses across similar questions?",
},
{
name: "Confidence",
description:
"How confident were you in the accuracy of the model's information?",
},
{
name: "Transparency",
description:
"How transparent was the model about its limitations or uncertainties?",
},
],
},
{
id: "personality",
title: "Personality",
icon: <Smile size={18} />,
color: "bg-pink-500",
description:
"Assesses consistency and definition of the model's persona, and alignment with expectations of honesty, empathy, and fairness.",
metrics: [
{
name: "Personality Consistency",
description: "How consistent was the LLM's personality?",
},
{
name: "Distinct Personality",
description: "How well-defined was the LLM's personality?",
},
{
name: "Honesty Empathy Fairness",
description:
"How much did the LLM respond in a way that aligned with your expectations of honesty, empathy, or fairness?",
},
],
},
{
id: "background",
title: "Background and Culture",
icon: <Globe size={18} />,
color: "bg-teal-500",
description:
"Evaluates cultural sensitivity, alignment, relevance, and freedom from bias.",
metrics: [
{
name: "Ethical Alignment",
description:
"How aligned with your culture, viewpoint, or values was the LLM?",
},
{
name: "Cultural Awareness",
description:
"How well did the LLM recognize when your cultural perspective was relevant?",
},
{
name: "Bias and Stereotypes",
description:
"How free from stereotypes or bias was the LLM's response?",
},
],
},
];
// Section header component
const SectionHeader = ({ title, icon, section }) => (
<div
className="px-4 py-3 bg-gray-50 border-b flex justify-between items-center cursor-pointer"
onClick={() => toggleSection(section)}
>
<div className="flex items-center gap-2">
{icon}
<h3 className="font-semibold text-gray-800">{title}</h3>
</div>
{openSections[section] ? (
<ChevronUp size={16} />
) : (
<ChevronDown size={16} />
)}
</div>
);
return (
<div className="space-y-6">
{/* Introduction */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="About HUMAINE"
icon={<Info size={18} />}
section="introduction"
/>
{openSections.introduction && (
<div className="p-4 bg-gradient-to-r from-white to-blue-50">
<div className="flex flex-col md:flex-row gap-6">
<div className="md:w-2/3">
<p className="mb-4">
<strong>HUMAINE</strong> (Human Understanding and Measurement
of AI Natural Engagement) is an evaluation benchmark that
measures language model performance through actual user
experience. While many benchmarks focus on technical
capabilities, this evaluation captures how users perceive and
rate different LLMs across common, everyday use cases.
</p>
<p className="mb-4">
This study collected ratings from 514 participants
demographically representative of the US population. Each
participant completed real-world tasks with different LLMs and
provided structured feedback on various aspects of their
experience.
</p>
<p>
The evaluation framework includes 7 high-level categories and
21 specific low-level metrics that measure aspects like
helpfulness, communication quality, understanding,
adaptiveness, trustworthiness, personality, and cultural
awareness, alongside demographic equity analysis.
</p>
</div>
<div className="md:w-1/3 bg-white p-4 rounded-lg border shadow-sm">
<h4 className="font-medium text-gray-700 mb-2 border-b pb-1">
Tasks Evaluated
</h4>
<ul className="list-disc pl-5 space-y-2 text-sm">
{tasksUsed.map((task, index) => (
<li key={index} className="text-gray-700">
{task}
</li>
))}
</ul>
</div>
</div>
</div>
)}
</div>
{/* Methodology */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="Methodology"
icon={<Book size={18} />}
section="methodology"
/>
{openSections.methodology && (
<div className="p-4">
<div className="grid md:grid-cols-1 gap-4">
{/* Study Design */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
1
</span>
Study Design
</h4>
<ul className="list-disc pl-5 space-y-1 text-sm">
<li>
<strong>Participants:</strong> 514 individuals representing
US demographics (stratified by age, sex, ethnicity,
political affiliation).
</li>
<li>
<strong>Task Design:</strong> Six everyday tasks spanning
creative, practical, and analytical use cases.
</li>
<li>
<strong>Process:</strong> Each participant completed all six
tasks, each with a different LLM. The assignment of tasks to
models and the order of tasks were fully randomized.
</li>
<li>
<strong>Models Evaluated:</strong> Latest o1, GPT-4o, Claude
3.7 (extended thinking), Gemini 2 Flash, LLama 3.1 405B,
Deepseek R1.
</li>
<li>
<strong>Model Access:</strong> All models were accessed via
openrouter.ai with temperature=1, min_tokens=50,
max_tokens=5,000.
</li>
<li>
<strong>Conversations:</strong> Participants were required
to exchange at least 4 messages with the models and they
could exchange more if they wished (not capped).
</li>
</ul>
</div>
{/* Evaluation Framework */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
2
</span>
Evaluation Framework
</h4>
<p className="mb-2 text-sm">
Our approach captures multiple aspects of user experience:
</p>
<ul className="list-disc pl-5 space-y-1 text-sm">
<li>
<strong>Multi-Dimensional Metrics:</strong> Performance is
evaluated across 7 high-level categories (rated 1-7) and 21
specific low-level metrics (rated 1-5).
</li>
<li>
<strong>Demographic Analysis:</strong> We assess performance
consistency across different demographic groups through
equity assessment.
</li>
<li>
<strong>Scale Normalization:</strong> All ratings are
converted to a 0-100 scale for easier comparison.
</li>
</ul>
</div>
{/* Data Analysis & Weighting */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
3
</span>
Data Analysis & Weighting
</h4>
<ul className="list-disc pl-5 space-y-1 text-sm">
<li>
<strong>MRP Methodology:</strong> Data is processed through
multiple regression with poststratification to create
results weighted to be highly representative of the US
population.
</li>
<li>
<strong>Robust Estimation:</strong> All model estimations
were parametrically bootstrapped (N = 1000) to ensure that
any uncertainty in the estimates was accounted for.
</li>
<li>
<strong>National Level Comparisons:</strong> For the Overall
Rankings and Metrics Breakdown tabs, we use the
national-level estimates derived from MRP.
</li>
<li>
<strong>Task-Level Comparisons:</strong> For task-specific
comparisons (Task Performance tab), we use the raw
(unweighted) data due to sample size constraints.
</li>
</ul>
</div>
{/* Demographic Equity Assessment */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
4
</span>
Demographic Equity Assessment
</h4>
<p className="mb-2 text-sm">
The equity assessment evaluates performance consistency across
demographic groups using a standardized approach:
</p>
<div className="bg-white rounded p-3 border mb-2">
<p className="text-xs mb-2">
The <strong>Equity Gap</strong> is the score difference
between the highest and lowest scoring demographic groups
for a specific metric. For example, if a model scores 85
with users age 18-29 but 65 with users age 60+ on
helpfulness, the equity gap would be 20 points.
</p>
<p className="text-xs mb-2">
We evaluate equity gaps using both{" "}
<strong>Effect Size</strong> and{" "}
<strong>Statistical Significance</strong> to identify
meaningful performance differences:
</p>
<div className="text-xs mt-2 space-y-2">
<div>
<p className="font-medium text-gray-700">
Effect Size Calculation:
</p>
<p className="text-gray-600 ml-2">
We normalize each gap by dividing it by the category's
standard deviation:
<br />
<span className="font-mono bg-gray-100 px-1">
Effect Size = (Max Score - Min Score) / Category
Standard Deviation
</span>
</p>
<p className="text-gray-600 ml-2 mt-1">
Category Standard Deviation is calculated from all
demographic MRP scores within that specific category.
</p>
</div>
<div>
<p className="font-medium text-gray-700">
Effect Size Classification:
</p>
<div className="grid grid-cols-2 gap-x-3 gap-y-2 mt-1">
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-red-100"></div>
<div>
<span className="font-medium text-gray-700">
Large
</span>
<p className="text-gray-500">Effect Size ≥ 0.8</p>
</div>
</div>
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-yellow-100"></div>
<div>
<span className="font-medium text-gray-700">
Medium
</span>
<p className="text-gray-500">Effect Size 0.5-0.8</p>
</div>
</div>
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-blue-100"></div>
<div>
<span className="font-medium text-gray-700">
Small
</span>
<p className="text-gray-500">Effect Size 0.2-0.5</p>
</div>
</div>
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-green-100"></div>
<div>
<span className="font-medium text-gray-700">
Negligible
</span>
<p className="text-gray-500">
Effect Size &lt; 0.2
</p>
</div>
</div>
</div>
</div>
<div>
<p className="font-medium text-gray-700">
Statistical Significance:
</p>
<p className="text-gray-600 ml-2">
We use p-values to determine if gaps are statistically
significant (p &lt; 0.05). To account for the large
number of tests performed, p-values were adjusted using
the Benjamini-Hochberg (FDR) method. Significance
reported reflects this correction (q &lt; 0.05).
</p>
</div>
<div>
<p className="font-medium text-gray-700">
Equity Concerns:
</p>
<p className="text-gray-600 ml-2">
A gap is flagged as an equity concern when it has both:
<br />
1. Large Effect Size (≥ 0.8)
<br />
2. Statistical Significance (p &lt; 0.05)
</p>
</div>
</div>
<p className="text-xs text-gray-600 mt-2">
<strong>Note:</strong> This methodology allows us to
identify meaningful performance differences across
demographic groups while accounting for both the magnitude
of the gap (effect size) and its statistical reliability
(significance).
</p>
</div>
</div>
</div>
</div>
)}
</div>
{/* Metrics Calculation */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="Metrics Calculation"
icon={<Calculator size={18} />}
section="metricsCalculation"
/>
{openSections.metricsCalculation && (
<div className="p-4">
<p className="text-sm mb-4">
This section explains how the metrics in the Overview page's
ranking table are calculated.
</p>
<div className="grid md:grid-cols-2 lg:grid-cols-3 gap-3">
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Overall Score
</h4>
<p className="text-xs text-gray-600">
Average score across high-level categories at the national
level (0-100). This represents overall model performance
across all evaluation dimensions.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Overall SD
</h4>
<p className="text-xs text-gray-600">
Standard Deviation across high-level categories (lower = more
consistent). Measures how consistent a model performs across
different capability areas.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Max Equity Gap
</h4>
<p className="text-xs text-gray-600">
Largest demographic score difference (hover for details).
Shows the maximum difference in scores between any two
demographic groups, with indicators for effect size and
statistical significance.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Max Gap Area
</h4>
<p className="text-xs text-gray-600">
Factor and Category where the Max Equity Gap occurs.
Identifies which demographic factor (e.g., Age, Gender) and
which category (e.g., Helpfulness, Understanding) shows the
largest performance difference.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Equity Concerns
</h4>
<p className="text-xs text-gray-600">
Percentage of demographic gaps flagged as equity concerns
(lower is better). An equity concern is defined as a gap with
both large effect size (≥0.8) and statistical significance.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
User Retention
</h4>
<p className="text-xs text-gray-600">
Percentage of participants who said they would use the model
again. This is based on the "Repeat Usage" question and
indicates user satisfaction and likelihood to continue using
the model.
</p>
</div>
</div>
<div className="mt-4 bg-blue-50 border-l-4 border-blue-400 p-3 rounded">
<p className="text-xs text-blue-800">
<strong>Note:</strong> All scores shown in the dashboard are
based on MRP-adjusted (Multilevel Regression with
Poststratification) estimates to ensure they are representative
of the US population. The only exception is the Task Performance
tab, which uses raw scores due to sample size constraints at the
task level.
</p>
</div>
</div>
)}
</div>
{/* Metrics Explained */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="Metrics Explained"
icon={<BarChart size={18} />}
section="metricsExplained"
/>
{openSections.metricsExplained && (
<div className="p-4">
<p className="mb-4 text-sm">
Our evaluation uses 7 high-level categories (rated on a 1-7 Likert
scale) and 21 low-level metrics (rated on a 1-5 scale) to
comprehensively assess LLM performance from a user experience
perspective.
</p>
{/* Metric selector tabs */}
<div className="flex flex-wrap gap-1 mb-4 border-b">
{metricsData.map((metric) => (
<button
key={metric.id}
className={`px-3 py-2 text-sm rounded-t-lg flex items-center gap-1 ${
activeMetricTab === metric.id
? "bg-gray-100 font-medium border-t border-l border-r"
: "bg-white hover:bg-gray-50"
}`}
onClick={() => setActiveMetricTab(metric.id)}
>
<span
className={`w-2 h-2 rounded-full ${metric.color}`}
></span>
{metric.title}
</button>
))}
</div>
{/* Active metric content */}
{metricsData.map(
(metric) =>
activeMetricTab === metric.id && (
<div
key={metric.id}
className="border rounded-lg overflow-hidden"
>
<div className="px-4 py-3 bg-gray-50 border-b flex items-center gap-2">
<div className={`rounded-full`}>
{React.cloneElement(metric.icon, {
className: `text-gray-700 w-5 h-5`,
})}
</div>
<h4 className="font-medium text-gray-800">
{metric.title}{" "}
<span className="text-sm font-normal text-gray-600">
(1-7 scale)
</span>
</h4>
</div>
<div className="p-4">
<p className="text-sm mb-4">{metric.description}</p>
{metric.metrics.length > 0 && (
<>
<h5 className="text-sm font-medium mb-3 text-gray-700">
Specific Metrics (1-5 scale)
</h5>
<div className="grid md:grid-cols-3 gap-3">
{metric.metrics.map((subMetric, idx) => (
<div
key={idx}
className="border rounded p-3 hover:shadow-sm transition-shadow"
>
<p className="text-sm font-medium">
{subMetric.name}
</p>
<p className="text-xs text-gray-600 mt-1">
{subMetric.description}
</p>
</div>
))}
</div>
</>
)}
</div>
</div>
)
)}
</div>
)}
</div>
</div>
);
};
export default AboutTab;