// Provenance (file-viewer metadata, preserved as a comment so the file parses):
// Author: Nora Petrova — commit 20e666e "Add project to new space" (30.7 kB)
"use client";
import React, { useState } from "react";
import {
ChevronDown,
ChevronUp,
Info,
Book,
Calculator,
BarChart,
UserCheck,
CheckCircle,
MessageCircle,
Brain,
SlidersHorizontal,
Shield,
Smile,
Globe,
} from "lucide-react";
const AboutTab = () => {
// Task list for easier management
const tasksUsed = [
"Following Up on Job Application: Drafting a professional follow-up email",
"Planning Weekly Meals: Creating a meal plan accommodating dietary restrictions",
"Creating Travel Itinerary: Planning a European city break",
"Understanding Complex Topic: Learning about day trading concepts",
"Generating Creative Ideas: Brainstorming unique birthday gift ideas",
"Making Decisions Between Options: Comparing tech products for purchase",
];
// State for collapsible sections
const [openSections, setOpenSections] = useState({
introduction: true,
methodology: true,
metricsCalculation: true,
metricsExplained: true,
});
// State for active metric tab
const [activeMetricTab, setActiveMetricTab] = useState("helpfulness");
// Toggle section visibility
const toggleSection = (section) => {
setOpenSections({
...openSections,
[section]: !openSections[section],
});
};
// Metrics data
const metricsData = [
{
id: "helpfulness",
title: "Helpfulness",
icon: <CheckCircle size={18} />,
color: "bg-green-500",
description:
"Evaluates how well the model provides useful, practical assistance that addresses the user's needs and helps them accomplish their goals.",
metrics: [
{
name: "Effectiveness",
description:
"How effectively did the model help you accomplish your specific goal?",
},
{
name: "Comprehensiveness",
description:
"How comprehensive was the model's response in addressing all aspects of your request?",
},
{
name: "Usefulness",
description:
"How useful were the model's suggestions or solutions for your needs?",
},
],
},
{
id: "communication",
title: "Communication",
icon: <MessageCircle size={18} />,
color: "bg-blue-500",
description:
"Assesses the clarity, coherence, and appropriateness of the model's writing style, including tone and language choices.",
metrics: [
{
name: "Tone and Language Style",
description:
"How well did the model match its tone and language style to the context of your interaction?",
},
{
name: "Conversation Flow",
description:
"How natural and conversational were the model's responses?",
},
{
name: "Detail and Technical Language",
description:
"How appropriate was the level of detail and technical language for your needs?",
},
],
},
{
id: "understanding",
title: "Understanding",
icon: <Brain size={18} />,
color: "bg-purple-500",
description:
"Measures how well the model comprehends the user's requests, including implicit needs and contextual information.",
metrics: [
{
name: "Accuracy",
description:
"How accurately did the model interpret your initial request?",
},
{
name: "Context Memory",
description:
"How well did the model maintain context throughout the conversation?",
},
{
name: "Intuitiveness",
description:
"How well did the model pick up on implicit aspects of your request without requiring explicit explanation?",
},
],
},
{
id: "adaptiveness",
title: "Adaptiveness",
icon: <SlidersHorizontal size={18} />,
color: "bg-amber-500",
description:
"Measures how well the model adjusts to different user needs, contexts, and feedback throughout a conversation.",
metrics: [
{
name: "Flexibility",
description:
"How effectively did the model adjust its responses based on your feedback?",
},
{
name: "Clarity",
description:
"How well did the model clarify ambiguities or misunderstandings?",
},
{
name: "Conversation Building",
description:
"How well did the model build upon previous exchanges in the conversation?",
},
],
},
{
id: "trustworthiness",
title: "Trustworthiness",
icon: <Shield size={18} />,
color: "bg-red-500",
description:
"Evaluates transparency, citations, acknowledgment of limitations, and overall user confidence in the model's responses.",
metrics: [
{
name: "Consistency",
description:
"How consistent were the model's responses across similar questions?",
},
{
name: "Confidence",
description:
"How confident were you in the accuracy of the model's information?",
},
{
name: "Transparency",
description:
"How transparent was the model about its limitations or uncertainties?",
},
],
},
{
id: "personality",
title: "Personality",
icon: <Smile size={18} />,
color: "bg-pink-500",
description:
"Assesses consistency and definition of the model's persona, and alignment with expectations of honesty, empathy, and fairness.",
metrics: [
{
name: "Personality Consistency",
description: "How consistent was the LLM's personality?",
},
{
name: "Distinct Personality",
description: "How well-defined was the LLM's personality?",
},
{
name: "Honesty Empathy Fairness",
description:
"How much did the LLM respond in a way that aligned with your expectations of honesty, empathy, or fairness?",
},
],
},
{
id: "background",
title: "Background and Culture",
icon: <Globe size={18} />,
color: "bg-teal-500",
description:
"Evaluates cultural sensitivity, alignment, relevance, and freedom from bias.",
metrics: [
{
name: "Ethical Alignment",
description:
"How aligned with your culture, viewpoint, or values was the LLM?",
},
{
name: "Cultural Awareness",
description:
"How well did the LLM recognize when your cultural perspective was relevant?",
},
{
name: "Bias and Stereotypes",
description:
"How free from stereotypes or bias was the LLM's response?",
},
],
},
];
// Section header component
const SectionHeader = ({ title, icon, section }) => (
<div
className="px-4 py-3 bg-gray-50 border-b flex justify-between items-center cursor-pointer"
onClick={() => toggleSection(section)}
>
<div className="flex items-center gap-2">
{icon}
<h3 className="font-semibold text-gray-800">{title}</h3>
</div>
{openSections[section] ? (
<ChevronUp size={16} />
) : (
<ChevronDown size={16} />
)}
</div>
);
return (
<div className="space-y-6">
{/* Introduction */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="About HUMAINE"
icon={<Info size={18} />}
section="introduction"
/>
{openSections.introduction && (
<div className="p-4 bg-gradient-to-r from-white to-blue-50">
<div className="flex flex-col md:flex-row gap-6">
<div className="md:w-2/3">
<p className="mb-4">
<strong>HUMAINE</strong> (Human Understanding and Measurement
of AI Natural Engagement) is an evaluation benchmark that
measures language model performance through actual user
experience. While many benchmarks focus on technical
capabilities, this evaluation captures how users perceive and
rate different LLMs across common, everyday use cases.
</p>
<p className="mb-4">
This study collected ratings from 514 participants
demographically representative of the US population. Each
participant completed real-world tasks with different LLMs and
provided structured feedback on various aspects of their
experience.
</p>
<p>
The evaluation framework includes 7 high-level categories and
21 specific low-level metrics that measure aspects like
helpfulness, communication quality, understanding,
adaptiveness, trustworthiness, personality, and cultural
awareness, alongside demographic equity analysis.
</p>
</div>
<div className="md:w-1/3 bg-white p-4 rounded-lg border shadow-sm">
<h4 className="font-medium text-gray-700 mb-2 border-b pb-1">
Tasks Evaluated
</h4>
<ul className="list-disc pl-5 space-y-2 text-sm">
{tasksUsed.map((task, index) => (
<li key={index} className="text-gray-700">
{task}
</li>
))}
</ul>
</div>
</div>
</div>
)}
</div>
{/* Methodology */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="Methodology"
icon={<Book size={18} />}
section="methodology"
/>
{openSections.methodology && (
<div className="p-4">
<div className="grid md:grid-cols-1 gap-4">
{/* Study Design */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
1
</span>
Study Design
</h4>
<ul className="list-disc pl-5 space-y-1 text-sm">
<li>
<strong>Participants:</strong> 514 individuals representing
US demographics (stratified by age, sex, ethnicity,
political affiliation).
</li>
<li>
<strong>Task Design:</strong> Six everyday tasks spanning
creative, practical, and analytical use cases.
</li>
<li>
<strong>Process:</strong> Each participant completed all six
tasks, each with a different LLM. The assignment of tasks to
models and the order of tasks were fully randomized.
</li>
<li>
<strong>Models Evaluated:</strong> Latest o1, GPT-4o, Claude
3.7 (extended thinking), Gemini 2 Flash, LLama 3.1 405B,
Deepseek R1.
</li>
<li>
<strong>Model Access:</strong> All models were accessed via
openrouter.ai with temperature=1, min_tokens=50,
max_tokens=5,000.
</li>
<li>
<strong>Conversations:</strong> Participants were required
to exchange at least 4 messages with the models and they
could exchange more if they wished (not capped).
</li>
</ul>
</div>
{/* Evaluation Framework */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
2
</span>
Evaluation Framework
</h4>
<p className="mb-2 text-sm">
Our approach captures multiple aspects of user experience:
</p>
<ul className="list-disc pl-5 space-y-1 text-sm">
<li>
<strong>Multi-Dimensional Metrics:</strong> Performance is
evaluated across 7 high-level categories (rated 1-7) and 21
specific low-level metrics (rated 1-5).
</li>
<li>
<strong>Demographic Analysis:</strong> We assess performance
consistency across different demographic groups through
equity assessment.
</li>
<li>
<strong>Scale Normalization:</strong> All ratings are
converted to a 0-100 scale for easier comparison.
</li>
</ul>
</div>
{/* Data Analysis & Weighting */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
3
</span>
Data Analysis & Weighting
</h4>
<ul className="list-disc pl-5 space-y-1 text-sm">
<li>
<strong>MRP Methodology:</strong> Data is processed through
multiple regression with poststratification to create
results weighted to be highly representative of the US
population.
</li>
<li>
<strong>Robust Estimation:</strong> All model estimations
were parametrically bootstrapped (N = 1000) to ensure that
any uncertainty in the estimates was accounted for.
</li>
<li>
<strong>National Level Comparisons:</strong> For the Overall
Rankings and Metrics Breakdown tabs, we use the
national-level estimates derived from MRP.
</li>
<li>
<strong>Task-Level Comparisons:</strong> For task-specific
comparisons (Task Performance tab), we use the raw
(unweighted) data due to sample size constraints.
</li>
</ul>
</div>
{/* Demographic Equity Assessment */}
<div className="border rounded-lg p-4 bg-gray-50 hover:shadow-md transition-shadow">
<h4 className="text-lg font-medium mb-2 flex items-center gap-2 text-gray-800">
<span className="w-8 h-8 rounded-full bg-blue-500 flex items-center justify-center text-white">
4
</span>
Demographic Equity Assessment
</h4>
<p className="mb-2 text-sm">
The equity assessment evaluates performance consistency across
demographic groups using a standardized approach:
</p>
<div className="bg-white rounded p-3 border mb-2">
<p className="text-xs mb-2">
The <strong>Equity Gap</strong> is the score difference
between the highest and lowest scoring demographic groups
for a specific metric. For example, if a model scores 85
with users age 18-29 but 65 with users age 60+ on
helpfulness, the equity gap would be 20 points.
</p>
<p className="text-xs mb-2">
We evaluate equity gaps using both{" "}
<strong>Effect Size</strong> and{" "}
<strong>Statistical Significance</strong> to identify
meaningful performance differences:
</p>
<div className="text-xs mt-2 space-y-2">
<div>
<p className="font-medium text-gray-700">
Effect Size Calculation:
</p>
<p className="text-gray-600 ml-2">
We normalize each gap by dividing it by the category's
standard deviation:
<br />
<span className="font-mono bg-gray-100 px-1">
Effect Size = (Max Score - Min Score) / Category
Standard Deviation
</span>
</p>
<p className="text-gray-600 ml-2 mt-1">
Category Standard Deviation is calculated from all
demographic MRP scores within that specific category.
</p>
</div>
<div>
<p className="font-medium text-gray-700">
Effect Size Classification:
</p>
<div className="grid grid-cols-2 gap-x-3 gap-y-2 mt-1">
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-red-100"></div>
<div>
<span className="font-medium text-gray-700">
Large
</span>
<p className="text-gray-500">Effect Size ≥ 0.8</p>
</div>
</div>
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-yellow-100"></div>
<div>
<span className="font-medium text-gray-700">
Medium
</span>
<p className="text-gray-500">Effect Size 0.5-0.8</p>
</div>
</div>
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-blue-100"></div>
<div>
<span className="font-medium text-gray-700">
Small
</span>
<p className="text-gray-500">Effect Size 0.2-0.5</p>
</div>
</div>
<div className="flex items-center gap-1">
<div className="w-3 h-3 rounded-full bg-green-100"></div>
<div>
<span className="font-medium text-gray-700">
Negligible
</span>
<p className="text-gray-500">
Effect Size &lt; 0.2
</p>
</div>
</div>
</div>
</div>
<div>
<p className="font-medium text-gray-700">
Statistical Significance:
</p>
<p className="text-gray-600 ml-2">
We use p-values to determine if gaps are statistically
significant (p &lt; 0.05). To account for the large
number of tests performed, p-values were adjusted using
the Benjamini-Hochberg (FDR) method. Significance
reported reflects this correction (q &lt; 0.05).
</p>
</div>
<div>
<p className="font-medium text-gray-700">
Equity Concerns:
</p>
<p className="text-gray-600 ml-2">
A gap is flagged as an equity concern when it has both:
<br />
1. Large Effect Size (≥ 0.8)
<br />
2. Statistical Significance (p &lt; 0.05)
</p>
</div>
</div>
<p className="text-xs text-gray-600 mt-2">
<strong>Note:</strong> This methodology allows us to
identify meaningful performance differences across
demographic groups while accounting for both the magnitude
of the gap (effect size) and its statistical reliability
(significance).
</p>
</div>
</div>
</div>
</div>
)}
</div>
{/* Metrics Calculation */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="Metrics Calculation"
icon={<Calculator size={18} />}
section="metricsCalculation"
/>
{openSections.metricsCalculation && (
<div className="p-4">
<p className="text-sm mb-4">
This section explains how the metrics in the Overview page's
ranking table are calculated.
</p>
<div className="grid md:grid-cols-2 lg:grid-cols-3 gap-3">
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Overall Score
</h4>
<p className="text-xs text-gray-600">
Average score across high-level categories at the national
level (0-100). This represents overall model performance
across all evaluation dimensions.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Overall SD
</h4>
<p className="text-xs text-gray-600">
Standard Deviation across high-level categories (lower = more
consistent). Measures how consistent a model performs across
different capability areas.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Max Equity Gap
</h4>
<p className="text-xs text-gray-600">
Largest demographic score difference (hover for details).
Shows the maximum difference in scores between any two
demographic groups, with indicators for effect size and
statistical significance.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Max Gap Area
</h4>
<p className="text-xs text-gray-600">
Factor and Category where the Max Equity Gap occurs.
Identifies which demographic factor (e.g., Age, Gender) and
which category (e.g., Helpfulness, Understanding) shows the
largest performance difference.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
Equity Concerns
</h4>
<p className="text-xs text-gray-600">
Percentage of demographic gaps flagged as equity concerns
(lower is better). An equity concern is defined as a gap with
both large effect size (≥0.8) and statistical significance.
</p>
</div>
<div className="border rounded p-3 hover:shadow-md transition-shadow">
<h4 className="text-sm font-medium text-gray-800 mb-1 flex items-center gap-1">
<div className="w-4 h-4 rounded-full bg-blue-500"></div>
User Retention
</h4>
<p className="text-xs text-gray-600">
Percentage of participants who said they would use the model
again. This is based on the "Repeat Usage" question and
indicates user satisfaction and likelihood to continue using
the model.
</p>
</div>
</div>
<div className="mt-4 bg-blue-50 border-l-4 border-blue-400 p-3 rounded">
<p className="text-xs text-blue-800">
<strong>Note:</strong> All scores shown in the dashboard are
based on MRP-adjusted (Multilevel Regression with
Poststratification) estimates to ensure they are representative
of the US population. The only exception is the Task Performance
tab, which uses raw scores due to sample size constraints at the
task level.
</p>
</div>
</div>
)}
</div>
{/* Metrics Explained */}
<div className="border rounded-lg overflow-hidden shadow-sm">
<SectionHeader
title="Metrics Explained"
icon={<BarChart size={18} />}
section="metricsExplained"
/>
{openSections.metricsExplained && (
<div className="p-4">
<p className="mb-4 text-sm">
Our evaluation uses 7 high-level categories (rated on a 1-7 Likert
scale) and 21 low-level metrics (rated on a 1-5 scale) to
comprehensively assess LLM performance from a user experience
perspective.
</p>
{/* Metric selector tabs */}
<div className="flex flex-wrap gap-1 mb-4 border-b">
{metricsData.map((metric) => (
<button
key={metric.id}
className={`px-3 py-2 text-sm rounded-t-lg flex items-center gap-1 ${
activeMetricTab === metric.id
? "bg-gray-100 font-medium border-t border-l border-r"
: "bg-white hover:bg-gray-50"
}`}
onClick={() => setActiveMetricTab(metric.id)}
>
<span
className={`w-2 h-2 rounded-full ${metric.color}`}
></span>
{metric.title}
</button>
))}
</div>
{/* Active metric content */}
{metricsData.map(
(metric) =>
activeMetricTab === metric.id && (
<div
key={metric.id}
className="border rounded-lg overflow-hidden"
>
<div className="px-4 py-3 bg-gray-50 border-b flex items-center gap-2">
<div className={`rounded-full`}>
{React.cloneElement(metric.icon, {
className: `text-gray-700 w-5 h-5`,
})}
</div>
<h4 className="font-medium text-gray-800">
{metric.title}{" "}
<span className="text-sm font-normal text-gray-600">
(1-7 scale)
</span>
</h4>
</div>
<div className="p-4">
<p className="text-sm mb-4">{metric.description}</p>
{metric.metrics.length > 0 && (
<>
<h5 className="text-sm font-medium mb-3 text-gray-700">
Specific Metrics (1-5 scale)
</h5>
<div className="grid md:grid-cols-3 gap-3">
{metric.metrics.map((subMetric, idx) => (
<div
key={idx}
className="border rounded p-3 hover:shadow-sm transition-shadow"
>
<p className="text-sm font-medium">
{subMetric.name}
</p>
<p className="text-xs text-gray-600 mt-1">
{subMetric.description}
</p>
</div>
))}
</div>
</>
)}
</div>
</div>
)
)}
</div>
)}
</div>
</div>
);
};
export default AboutTab;