Spaces:

sapienzanlp
/

ea-mt-leaderboard

Running

App Files Files Community

ea-mt-leaderboard / app.py

s-conia

Update description

2a0e0d2 3 months ago

raw

history blame contribute delete

16.4 kB

	import json
	from typing import Dict, List

	import pandas as pd
	import gradio as gr


	class LeaderboardData:
	def __init__(self, data_path: str):
	"""Initialize leaderboard data from a jsonl file.

	Args:
	data_path: Path to the jsonl file containing submission data.
	"""
	self.df = self._load_data(data_path)
	# Compute absolute ranking based on overall score.
	overall_score_cols = [
	col for col in self.df.columns if col.endswith("overall_overall_score")
	]
	if overall_score_cols:
	# Use the first overall score column to compute ranking.
	main_overall = overall_score_cols[0]
	# Higher overall scores get lower rank numbers (i.e. rank 1 is best).
	self.df["rank"] = (
	self.df[main_overall]
	.fillna(0)
	.rank(method="min", ascending=False)
	.astype(int)
	)
	else:
	self.df["rank"] = None

	self.column_categories = self._categorize_columns()

	def _load_data(self, data_path: str) -> pd.DataFrame:
	"""Load and process submission data from a jsonl file.

	This method flattens each score object into three separate columns:
	`<lang>_meta_score`, `<lang>_comet_score`, and `<lang>_overall_score`,
	rounding each value to 2 decimal digits.

	Args:
	data_path: Path to the data file.

	Returns:
	Processed DataFrame with submission data.
	"""
	records = []
	with open(data_path) as f:
	for line in f:
	submission = json.loads(line)
	for submission_id, data in submission.items():
	record = {
	"team_name": data["metadata"]["team_name"],
	"system_name": data["metadata"]["submission_name"],
	"uses_gold": "🟠" if data["metadata"]["uses_gold"] else "",
	"uses_rag": "🔍" if data["metadata"]["uses_rag"] else "",
	"uses_llm": (
	"🤖" if data["metadata"]["llm_name"] != "N/A" else ""
	),
	"llm_name": data["metadata"]["llm_name"],
	"is_finetuned": (
	"📚" if data["metadata"]["is_finetuned"] else ""
	),
	}

	# Flatten each score object into three separate columns and round to 2 decimals.
	# Sort languages by name to ensure consistent column order.
	languages = sorted(
	l for l in data["scores"].keys() if l != "overall"
	)
	languages.append("overall")

	for lang in languages:
	scores = data["scores"].get(lang, {})

	record[f"{lang}_meta_score"] = (
	round(scores.get("meta_score", 0), 2)
	if scores.get("meta_score") is not None
	else None
	)
	record[f"{lang}_comet_score"] = (
	round(scores.get("comet_score", 0), 2)
	if scores.get("comet_score") is not None
	else None
	)
	record[f"{lang}_overall_score"] = (
	round(scores.get("overall_score", 0), 2)
	if scores.get("overall_score") is not None
	else None
	)

	records.append(record)

	return pd.DataFrame.from_records(records)

	def _categorize_columns(self) -> Dict[str, List[str]]:
	"""Categorize DataFrame columns into team info, system details, and scores.

	Returns:
	Dictionary mapping column categories to column names.
	"""
	if self.df.empty:
	return {"team_info": [], "system_details": [], "scores": []}

	team_info_cols = ["team_name", "system_name"]
	system_details_cols = [
	"uses_gold",
	"uses_rag",
	"uses_llm",
	"llm_name",
	"is_finetuned",
	]
	score_cols = [
	col
	for col in self.df.columns
	if col not in team_info_cols + system_details_cols + ["rank"]
	]
	return {
	"team_info": team_info_cols,
	"system_details": system_details_cols,
	"scores": score_cols,
	}

	def get_team_names(self) -> List[str]:
	"""Get list of unique team names."""
	return self.df["team_name"].unique().tolist()

	def get_system_names(self) -> List[str]:
	"""Get list of unique system names."""
	return self.df["system_name"].unique().tolist()

	def get_llm_names(self) -> List[str]:
	"""Get list of unique LLM names."""
	return self.df["llm_name"].unique().tolist()


	def leaderboard_view(
	selected_team,
	selected_system,
	selected_llm,
	score_type,
	leaderboard_data,
	uses_gold,
	uses_rag,
	uses_llm,
	is_finetuned,
	):
	"""
	Filters the leaderboard based on dropdown selections and checkbox values,
	then selects only the score columns that match the score_type (meta, comet, or overall).
	The DataFrame is sorted (descending) by the first matching score column.
	Finally, the score columns are renamed to only show the language code.

	An absolute ranking column (based on overall score) is added as the first column.
	Column names are made more user-friendly.
	"""
	df = leaderboard_data.df.copy()

	# Apply dropdown filtering (if not "All").
	if selected_team != "All":
	df = df[df["team_name"] == selected_team]
	if selected_system != "All":
	df = df[df["system_name"] == selected_system]
	if selected_llm != "All":
	df = df[df["llm_name"] == selected_llm]

	# Apply checkbox filtering.
	if uses_gold:
	df = df[df["uses_gold"] == ""] # Show only systems that do not use gold.
	if uses_rag:
	df = df[df["uses_rag"] == ""] # Show only systems that do not use RAG.
	if uses_llm:
	df = df[df["uses_llm"] == ""] # Show only systems that do not use LLM.
	if is_finetuned:
	df = df[df["is_finetuned"] == ""] # Show only systems that are not finetuned.

	# Build columns: add the ranking column first, then team info and system details.
	cols = []
	if "rank" in df.columns:
	cols.append("rank")
	cols.extend(leaderboard_data.column_categories["team_info"])
	cols.extend(leaderboard_data.column_categories["system_details"])

	# Include only score columns ending with the desired score_type.
	score_cols = [
	col
	for col in leaderboard_data.column_categories["scores"]
	if col.endswith(f"_{score_type}_score")
	]
	cols.extend(score_cols)

	df = df[cols]

	# Sort by the score column matching score_type (if available).
	sort_col = next(
	(col for col in df.columns if col.endswith(f"overall_overall_score")), None
	)
	if sort_col is not None:
	df = df.sort_values(by=sort_col, ascending=False)

	# Rename columns: for score columns, strip the suffix; for others, use friendly names.
	friendly_names = {
	"rank": "Rank",
	"team_name": "Team",
	"system_name": "System",
	"uses_gold": "Uses Gold",
	"uses_rag": "Uses RAG",
	"uses_llm": "Uses LLM",
	"llm_name": "LLM Name",
	"is_finetuned": "Finetuned",
	}
	new_columns = {}
	for col in df.columns:
	if col.endswith(f"_{score_type}_score"):
	new_columns[col] = col.replace(f"_{score_type}_score", "")
	elif col in friendly_names:
	new_columns[col] = friendly_names[col]
	df = df.rename(columns=new_columns)

	# Reorder columns to ensure "Rank" is the first column.
	if "Rank" in df.columns:
	new_order = ["Rank"] + [c for c in df.columns if c != "Rank"]
	df = df[new_order]

	return df


	def update_all(
	selected_team,
	selected_system,
	selected_llm,
	leaderboard_data,
	uses_gold,
	uses_rag,
	uses_llm,
	is_finetuned,
	):
	"""Update all three tabs and return DataFrames for meta, comet, and overall scores."""
	meta_df = leaderboard_view(
	selected_team,
	selected_system,
	selected_llm,
	"meta",
	leaderboard_data,
	uses_gold,
	uses_rag,
	uses_llm,
	is_finetuned,
	)
	comet_df = leaderboard_view(
	selected_team,
	selected_system,
	selected_llm,
	"comet",
	leaderboard_data,
	uses_gold,
	uses_rag,
	uses_llm,
	is_finetuned,
	)
	overall_df = leaderboard_view(
	selected_team,
	selected_system,
	selected_llm,
	"overall",
	leaderboard_data,
	uses_gold,
	uses_rag,
	uses_llm,
	is_finetuned,
	)
	return meta_df, comet_df, overall_df


	def leaderboard_app(data_path: str):
	"""Create a leaderboard app with three tabs for meta, comet, and overall scores."""
	leaderboard_data = LeaderboardData(data_path)

	# Calculate initial values first.
	initial_meta, initial_comet, initial_overall = update_all(
	"All",
	"All",
	"All",
	leaderboard_data,
	uses_gold=False,
	uses_rag=False,
	uses_llm=False,
	is_finetuned=False,
	)

	# Prepend "All" to dropdown choices.
	team_choices = ["All"] + leaderboard_data.get_team_names()
	system_choices = ["All"] + leaderboard_data.get_system_names()
	llm_choices = ["All"] + leaderboard_data.get_llm_names()

	theme = gr.themes.Ocean(
	secondary_hue="indigo",
	neutral_hue="slate",
	spacing_size="lg",
	radius_size="md",
	)

	with gr.Blocks(theme=theme) as demo:

	gr.Markdown(
	"""
	# Entity-Aware Machine Translation Leaderboard

	## Overview
	This leaderboard showcases the performance of various systems on the EA-MT shared task, which has been organized as part of the [SemEval 2025](https://semeval.github.io/2025/) workshop.
	* The results are still provisional and subject to change.

	## Task Description
	The task is to translate a given input sentence from the source language (English) to the target language, where the input sentence contains named entities that may be challenging for machine translation systems to handle. The named entities may be entities that are rare, ambiguous, or unknown to the machine translation system. The task is to develop machine translation systems that can accurately translate such named entities in the input sentence to the target language.

	* Learn more about the task on the [EA-MT shared task page](https://sapienzanlp.github.io/ea-mt/).

	### Scoring
	The leaderboard is based on three main scores:
	- M-ETA Score: A score that evaluates the translation quality of named entities in the input sentence.
	- COMET Score: A score that evaluates the translation quality at the sentence level.
	- Overall Score: The harmonic mean of the M-ETA and COMET scores.

	### Legend
	- 🟠: Uses gold data, i.e., the gold Wikidata ID or information derived from it, at test time.
	- 🔍: Uses RAG (Retrieval-Augmented Generation) for named entity translation.
	- 🤖: Uses an LLM (Large Language Model) for named entity translation.
	- 📚: The system (LLM and/or MT model) is finetuned on additional data.
	"""
	)

	# Divider.
	gr.Markdown("---")

	# Filters and controls.
	gr.Markdown(
	"""## Filters and Controls
	Use the dropdowns and checkboxes to filter the leaderboard scores."""
	)

	# Controls at the top.

	uses_gold_checkbox = gr.Checkbox(
	label="🟠 : Display only those systems that do not use gold information at test time.",
	value=False,
	)
	uses_rag_checkbox = gr.Checkbox(
	label="🔍 : Display only those systems that do not use RAG.",
	value=False,
	)
	uses_llm_checkbox = gr.Checkbox(
	label="🤖 : Display only those systems that do not use LLM.",
	value=False,
	)
	is_finetuned_checkbox = gr.Checkbox(
	label="📚 : Display only those systems that are not finetuned.",
	value=False,
	)

	with gr.Row():
	with gr.Column(scale=1):
	team_dropdown = gr.Dropdown(
	choices=team_choices, label="Team Name", value="All"
	)
	with gr.Column(scale=1):
	system_dropdown = gr.Dropdown(
	choices=system_choices, label="System Name", value="All"
	)
	with gr.Column(scale=1):
	llm_dropdown = gr.Dropdown(
	choices=llm_choices, label="LLM Name", value="All"
	)

	# Divider.
	gr.Markdown("---")

	# Tabs for meta, comet, and overall scores.
	gr.Markdown(
	"""
	## Leaderboard Scores
	You can view the leaderboard scores for each system based on the following metrics:
	- M-ETA Score: A score that evaluates the translation quality of named entities in the input sentence.
	- COMET Score: A score that evaluates the translation quality at the sentence level.
	- Overall Score: The harmonic mean of the M-ETA and COMET scores.
	Switch between the tabs to view the scores for each metric.

	> Note: You can sort the leaderboard by clicking on the column headers. For example, click on the "it_IT" column to sort by the Italian language scores.
	"""
	)

	with gr.Tabs() as tabs:
	with gr.TabItem("Overall Score"):
	overall_table = gr.Dataframe(
	value=initial_overall, # Set initial value here.
	label="Overall Score Leaderboard",
	interactive=True,
	)
	with gr.TabItem("M-ETA Score"):
	meta_table = gr.Dataframe(
	value=initial_meta, # Set initial value here.
	label="M-ETA Score Leaderboard",
	interactive=True,
	)
	with gr.TabItem("COMET Score"):
	comet_table = gr.Dataframe(
	value=initial_comet, # Set initial value here.
	label="Comet Score Leaderboard",
	interactive=True,
	)

	def update_callback(
	selected_team,
	selected_system,
	selected_llm,
	uses_gold,
	uses_rag,
	uses_llm,
	is_finetuned,
	):
	return update_all(
	selected_team,
	selected_system,
	selected_llm,
	leaderboard_data,
	uses_gold,
	uses_rag,
	uses_llm,
	is_finetuned,
	)

	for control in [
	team_dropdown,
	system_dropdown,
	llm_dropdown,
	uses_gold_checkbox,
	uses_rag_checkbox,
	uses_llm_checkbox,
	is_finetuned_checkbox,
	]:
	control.change(
	update_callback,
	inputs=[
	team_dropdown,
	system_dropdown,
	llm_dropdown,
	uses_gold_checkbox,
	uses_rag_checkbox,
	uses_llm_checkbox,
	is_finetuned_checkbox,
	],
	outputs=[meta_table, comet_table, overall_table],
	)

	return demo


	if __name__ == "__main__":
	data_path = "data/scores.jsonl"
	leaderboard_app(data_path).launch()