% Citation for the SMOL dataset paper (arXiv preprint 2502.12301).
% Only the acronym is brace-protected in the title, so bibliography styles can still apply their own casing.
% NOTE(review): the keys caswell2025smol and jones-etal-2023-gatitos are defined again under the
% "Citations" heading later in this file; BibTeX reports a repeated entry, so keep only one copy.
@misc{caswell2025smol,
  title         = {{SMOL}: Professionally translated parallel data for 115 under-represented languages},
  author        = {Caswell, Isaac and
                   Nielsen, Elizabeth and
                   Luo, Jiaming and
                   Cherry, Colin and
                   Kovacs, Geza and
                   Shemtov, Hadar and
                   Talukdar, Partha and
                   Tewari, Dinesh and
                   Diane, Baba Mamadi and
                   Doumbouya, Koulako Moussa and
                   Diane, Djibrila and
                   Ciss{\'e}, Solo Farabado and
                   Ferrante, Edoardo and
                   Guasoni, Alessandro and
                   Keita, Mamadou K. and
                   DebBarma, Sudhamoy and
                   Kuzhuget, Ali and
                   Anugraha, David and
                   Habibi, Muhammad Ravi Shulthan and
                   Ahmadi, Sina and
                   Lau, Mingfei and
                   Eng, Jonathan},
  year          = {2025},
  eprint        = {2502.12301},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2502.12301},
}

% Citation for the GATITOS lexicon paper (EMNLP 2023 main conference).
% The title carries no literal quote characters; only the acronym is brace-protected.
@inproceedings{jones-etal-2023-gatitos,
  title     = {{GATITOS}: Using a New Multilingual Lexicon for Low-resource Machine Translation},
  author    = {Jones, Alexander and
               Caswell, Isaac and
               Firat, Orhan and
               Saxena, Ishank},
  editor    = {Bouamor, Houda and
               Pino, Juan and
               Bali, Kalika},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.emnlp-main.26/},
  doi       = {10.18653/v1/2023.emnlp-main.26},
  pages     = {371--405},
  abstract  = {Modern machine translation models and language models are able to translate without having been trained on parallel data, greatly expanding the set of languages that they can serve. However, these models still struggle in a variety of predictable ways, a problem that cannot be overcome without at least some trusted bilingual data. This work expands on a cheap and abundant resource to combat this problem: bilingual lexica. We test the efficacy of bilingual lexica in a real-world set-up, on 200-language translation models trained on web-crawled text. We present several findings: (1) using lexical data augmentation, we demonstrate sizable performance gains for unsupervised translation; (2) we compare several families of data augmentation, demonstrating that they yield similar improvements, and can be combined for even greater improvements; (3) we demonstrate the importance of carefully curated lexica over larger, noisier ones, especially with larger models; and (4) we compare the efficacy of multilingual lexicon data versus human-translated parallel data. Based on results from (3), we develop and open-source GATITOS, a high-quality, curated dataset in 168 tail languages, one of the first human-translated resources to cover many of these languages.},
}

日期

2 个月前

数据集组织

论文 URL

2502.12301

许可证

CC BY 4.0

标签

机器学习

翻译

SMOL（Set for Maximal Overall Leverage）是由 Google 于 2025 年发布的专业翻译数据集，旨在为低资源语言的翻译模型训练提供高质量的平行数据，相关论文为《SMOL: Professionally translated parallel data for 115 under-represented languages》。该数据集收录了阿姆哈拉语、斯瓦希里语、阿法尔语等 221 种语料与标注数据稀缺的小语种或地方语言的专业翻译文本，覆盖广泛的语言对，既包含专业译者的翻译，也包含志愿者贡献的文本，并针对部分语言加入了医疗领域垂直数据与事实性标注。

数据集构成:

SmolDoc：文档级翻译，涵盖 130 个语言对（129 种独立语言）；
SmolSent：句子级翻译，涵盖 114 个语言对（116 种独立语言）；
GATITOS：词元级翻译，涵盖 181 个语言对（183 种独立语言），主要作为多语言词典使用；
SmolDoc-factuality-annotations：针对 SmolDoc 中 661 个文档的事实性标注与推理依据。

Citations

% Citation for the SMOL dataset paper (arXiv preprint 2502.12301).
% Only the acronym is brace-protected in the title, so bibliography styles can still apply their own casing.
% Author names use the unambiguous "Last, First" form; the accented letter is written as a LaTeX escape
% so that classic 8-bit BibTeX sorts and labels it correctly.
@misc{caswell2025smol,
  title         = {{SMOL}: Professionally translated parallel data for 115 under-represented languages},
  author        = {Caswell, Isaac and
                   Nielsen, Elizabeth and
                   Luo, Jiaming and
                   Cherry, Colin and
                   Kovacs, Geza and
                   Shemtov, Hadar and
                   Talukdar, Partha and
                   Tewari, Dinesh and
                   Diane, Baba Mamadi and
                   Doumbouya, Koulako Moussa and
                   Diane, Djibrila and
                   Ciss{\'e}, Solo Farabado and
                   Ferrante, Edoardo and
                   Guasoni, Alessandro and
                   Keita, Mamadou K. and
                   DebBarma, Sudhamoy and
                   Kuzhuget, Ali and
                   Anugraha, David and
                   Habibi, Muhammad Ravi Shulthan and
                   Ahmadi, Sina and
                   Lau, Mingfei and
                   Eng, Jonathan},
  year          = {2025},
  eprint        = {2502.12301},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2502.12301},
}
% Citation for the GATITOS lexicon paper (EMNLP 2023 main conference).
% The title carries no literal quote characters (they would be printed verbatim in the bibliography);
% only the acronym is brace-protected so styles can still apply their own casing.
@inproceedings{jones-etal-2023-gatitos,
  title     = {{GATITOS}: Using a New Multilingual Lexicon for Low-resource Machine Translation},
  author    = {Jones, Alexander and
               Caswell, Isaac and
               Firat, Orhan and
               Saxena, Ishank},
  editor    = {Bouamor, Houda and
               Pino, Juan and
               Bali, Kalika},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.emnlp-main.26/},
  doi       = {10.18653/v1/2023.emnlp-main.26},
  pages     = {371--405},
  abstract  = {Modern machine translation models and language models are able to translate without having been trained on parallel data, greatly expanding the set of languages that they can serve. However, these models still struggle in a variety of predictable ways, a problem that cannot be overcome without at least some trusted bilingual data. This work expands on a cheap and abundant resource to combat this problem: bilingual lexica. We test the efficacy of bilingual lexica in a real-world set-up, on 200-language translation models trained on web-crawled text. We present several findings: (1) using lexical data augmentation, we demonstrate sizable performance gains for unsupervised translation; (2) we compare several families of data augmentation, demonstrating that they yield similar improvements, and can be combined for even greater improvements; (3) we demonstrate the importance of carefully curated lexica over larger, noisier ones, especially with larger models; and (4) we compare the efficacy of multilingual lexicon data versus human-translated parallel data. Based on results from (3), we develop and open-source GATITOS, a high-quality, curated dataset in 168 tail languages, one of the first human-translated resources to cover many of these languages.},
}

此数据集由社区用户贡献,仅用于教育和信息目的。如有任何内容涉及版权侵权,请通过 [email protected] 联系我们,我们将及时审核并删除。

用 AI 构建 AI

从创意到上线——通过免费 AI 协同编码、开箱即用的环境和最优惠的 GPU 价格,加速您的 AI 开发。

AI 协同编码

开箱即用的 GPU

最优定价

开始使用 | 查看定价

HyperAI Newsletters

订阅我们的最新资讯

我们会在北京时间 每周一的上午九点 向您的邮箱投递本周内的最新更新

邮件发送服务由 MailChimp 提供

数据集构成:

Citations

用 AI 构建 AI

HyperAI Newsletters

数据集构成:

Citations

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集

用 AI 构建 AI

HyperAI Newsletters

数据集构成:

Citations

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集

用 AI 构建 AI

HyperAI Newsletters

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集

Command Palette

SMOL 多语言翻译平行数据集

数据集构成:

Citations

用 AI 构建 AI

HyperAI Newsletters

Command Palette

SMOL 多语言翻译平行数据集

数据集构成:

Citations

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集

用 AI 构建 AI

HyperAI Newsletters

Command Palette

SMOL 多语言翻译平行数据集

数据集构成:

Citations

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集

用 AI 构建 AI

HyperAI Newsletters

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集

相关数据集

TACK 靶向嵌合体知识库数据集

World Air Pollution and AQI 全球空气质量与 AQI 数据集

MemLens 多模态长上下文基准数据集

Breast Cancer：Multi-Modal Fusion 乳腺癌多模态融合数据集

QCalEval 量子校准图表理解数据集