# Your leaderboard name
TITLE = """

U-MATH / μ-MATH leaderboard

""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ These datasets are designed to test the mathematical reasoning and meta-evaluation capabilities of Large Language Models (LLMs) on university-level problems. U-MATH provides a set of 1,100 university-level mathematical problems, while µ-MATH complements it with a meta-evaluation framework focusing on solution judgment with 1084 LLM solutions. """ # Which evaluations are you running? how can people reproduce what you have? LLM_BENCHMARKS_TEXT = """ This repository contains the official leaderboard code for the U-MATH and $\mu$-MATH benchmarks. These datasets are designed to test the mathematical reasoning and meta-evaluation capabilities of Large Language Models (LLMs) on university-level problems. ### Overview U-MATH provides a set of 1,100 university-level mathematical problems, while µ-MATH complements it with a meta-evaluation framework focusing on solution judgment with 1084 LLM solutions. * 📊 [U-MATH benchmark at Huggingface](https://huggingface.co/datasets/toloka/umath) * 🔎 [μ-MATH benchmark at Huggingface](https://huggingface.co/datasets/toloka/mumath) * 🗞️ [Paper](https://arxiv.org/abs/2412.03205) * 👾 [Evaluation Code at GitHub](https://github.com/Toloka/u-math/) ### Licensing Information * The contents of the μ-MATH's machine-generated `model_output` column are subject to the underlying LLMs' licensing terms. * Contents of all the other dataset U-MATH and μ-MATH fields, as well as the code, are available under the MIT license. """ CITATION_TEXT = r"""@misc{chernyshev2024umath, title={U-MATH: A University-Level Benchmark for Evaluating Mathematical Skills in LLMs}, author={Konstantin Chernyshev and Vitaliy Polshkov and Ekaterina Artemova and Alex Myasnikov and Vlad Stepanov and Alexei Miasnikov and Sergei Tilga}, year={2024}, eprint={2412.03205}, archivePrefix={arXiv}, primaryClass={cs.CL}, url={https://arxiv.org/abs/2412.03205}, }"""