@comment{Citation records for the SMOL preprint and the GATITOS paper. NOTE(review): the same two citation keys are defined again further down in this file; keep only one copy of each before running BibTeX or Biber, otherwise a repeated-entry error is reported.}

@misc{caswell2025smol,
  author        = {Caswell, Isaac and Nielsen, Elizabeth and Luo, Jiaming and
                   Cherry, Colin and Kovacs, Geza and Shemtov, Hadar and
                   Talukdar, Partha and Tewari, Dinesh and Diane, Baba Mamadi and
                   Doumbouya, Koulako Moussa and Diane, Djibrila and
                   Ciss{\'e}, Solo Farabado and Ferrante, Edoardo and
                   Guasoni, Alessandro and Keita, Mamadou K. and
                   DebBarma, Sudhamoy and Kuzhuget, Ali and Anugraha, David and
                   Habibi, Muhammad Ravi Shulthan and Ahmadi, Sina and
                   Lau, Mingfei and Eng, Jonathan},
  title         = {{SMOL}: Professionally translated parallel data for 115 under-represented languages},
  year          = {2025},
  eprint        = {2502.12301},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2502.12301},
}

@inproceedings{jones-etal-2023-gatitos,
  author    = {Jones, Alexander and Caswell, Isaac and Firat, Orhan and Saxena, Ishank},
  title     = {{GATITOS}: Using a New Multilingual Lexicon for Low-resource Machine Translation},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.emnlp-main.26/},
  doi       = {10.18653/v1/2023.emnlp-main.26},
  pages     = {371--405},
  abstract  = {Modern machine translation models and language models are able to translate without having been trained on parallel data, greatly expanding the set of languages that they can serve. However, these models still struggle in a variety of predictable ways, a problem that cannot be overcome without at least some trusted bilingual data. This work expands on a cheap and abundant resource to combat this problem: bilingual lexica. We test the efficacy of bilingual lexica in a real-world set-up, on 200-language translation models trained on web-crawled text. We present several findings: (1) using lexical data augmentation, we demonstrate sizable performance gains for unsupervised translation; (2) we compare several families of data augmentation, demonstrating that they yield similar improvements, and can be combined for even greater improvements; (3) we demonstrate the importance of carefully curated lexica over larger, noisier ones, especially with larger models; and (4) we compare the efficacy of multilingual lexicon data versus human-translated parallel data. Based on results from (3), we develop and open-source GATITOS, a high-quality, curated dataset in 168 tail languages, one of the first human-translated resources to cover many of these languages.},
}

Diesen Datensatz verwenden Auf Discord diskutieren

Datum

vor 2 Monaten

Organisation

Paper-URL

2502.12301

Lizenz

CC BY 4.0

Zusammensetzung des Datensatzes:

SmolDoc: Übersetzung auf Dokumentenebene, die 130 Sprachpaare (129 unabhängige Sprachen) abdeckt;
SmolSent: Übersetzung auf Satzebene, die 114 Sprachpaare (116 unabhängige Sprachen) abdeckt;
GATITOS: Ein Lexikon-Datensatz mit Übersetzungen auf Wortebene, der 181 Sprachpaare (183 unabhängige Sprachen) abdeckt und hauptsächlich als mehrsprachiges Wörterbuch verwendet wird;
SmolDoc-Faktenvermerke: Faktenvermerke und Begründungen für 661 Dokumente in SmolDoc.

Zitate

@comment{arXiv preprint (2502.12301, cs.CL) for the SMOL parallel-data release. Only the acronym in the title is brace-protected so that bibliography styles can still apply their own casing to the rest.}
@misc{caswell2025smol,
  author        = {Caswell, Isaac and Nielsen, Elizabeth and Luo, Jiaming and
                   Cherry, Colin and Kovacs, Geza and Shemtov, Hadar and
                   Talukdar, Partha and Tewari, Dinesh and Diane, Baba Mamadi and
                   Doumbouya, Koulako Moussa and Diane, Djibrila and
                   Ciss{\'e}, Solo Farabado and Ferrante, Edoardo and
                   Guasoni, Alessandro and Keita, Mamadou K. and
                   DebBarma, Sudhamoy and Kuzhuget, Ali and Anugraha, David and
                   Habibi, Muhammad Ravi Shulthan and Ahmadi, Sina and
                   Lau, Mingfei and Eng, Jonathan},
  title         = {{SMOL}: Professionally translated parallel data for 115 under-represented languages},
  year          = {2025},
  eprint        = {2502.12301},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2502.12301},
}
@comment{EMNLP 2023 main-conference paper introducing the GATITOS lexicon. The title no longer carries literal quotation marks, and only the acronym is brace-protected.}
@inproceedings{jones-etal-2023-gatitos,
  author    = {Jones, Alexander and Caswell, Isaac and Firat, Orhan and Saxena, Ishank},
  title     = {{GATITOS}: Using a New Multilingual Lexicon for Low-resource Machine Translation},
  editor    = {Bouamor, Houda and Pino, Juan and Bali, Kalika},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing},
  month     = dec,
  year      = {2023},
  address   = {Singapore},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2023.emnlp-main.26/},
  doi       = {10.18653/v1/2023.emnlp-main.26},
  pages     = {371--405},
  abstract  = {Modern machine translation models and language models are able to translate without having been trained on parallel data, greatly expanding the set of languages that they can serve. However, these models still struggle in a variety of predictable ways, a problem that cannot be overcome without at least some trusted bilingual data. This work expands on a cheap and abundant resource to combat this problem: bilingual lexica. We test the efficacy of bilingual lexica in a real-world set-up, on 200-language translation models trained on web-crawled text. We present several findings: (1) using lexical data augmentation, we demonstrate sizable performance gains for unsupervised translation; (2) we compare several families of data augmentation, demonstrating that they yield similar improvements, and can be combined for even greater improvements; (3) we demonstrate the importance of carefully curated lexica over larger, noisier ones, especially with larger models; and (4) we compare the efficacy of multilingual lexicon data versus human-translated parallel data. Based on results from (3), we develop and open-source GATITOS, a high-quality, curated dataset in 168 tail languages, one of the first human-translated resources to cover many of these languages.},
}

Dieser Datensatz wurde von Community-Nutzern beigesteuert und dient ausschließlich Bildungs- und Informationszwecken. Falls Inhalte eine Urheberrechtsverletzung darstellen, kontaktieren Sie uns bitte unter [email protected] zur umgehenden Prüfung und Entfernung.

Zugehörige Datensätze

KI mit KI entwickeln

Von der Idee bis zum Launch – beschleunigen Sie Ihre KI-Entwicklung mit kostenlosem KI-Co-Coding, sofort einsatzbereiter Umgebung und bestem GPU-Preis.

KI-gestütztes kollaboratives Programmieren

Sofort einsatzbereite GPUs

Die besten Preise

Erste Schritte Preise anzeigen

HyperAI Newsletters

Abonnieren Sie unsere neuesten Updates

Wir liefern Ihnen die neuesten Updates der Woche jeden Montagmorgen um neun Uhr in Ihren Posteingang.

Unterstützt von MailChimp

Command Palette

SMOL Multilingual Translation Parallel Dataset

Zusammensetzung des Datensatzes:

Zitate

KI mit KI entwickeln

HyperAI Newsletters

Command Palette

SMOL Multilingual Translation Parallel Dataset

Zusammensetzung des Datensatzes:

Zitate

Zugehörige Datensätze

TACK Targeted Chimera Knowledge Base Dataset

Weltweiter Datensatz Zu Luftverschmutzung Und Luftqualitätsindex

MemLens Multimodal Long Context Benchmark-Datensatz

Brustkrebs: Multimodaler Fusionsdatensatz

QCalEval Quantenkalibrierungsdiagramm – Verständnisdatensatz

KI mit KI entwickeln

HyperAI Newsletters

Command Palette

SMOL Multilingual Translation Parallel Dataset

Zusammensetzung des Datensatzes:

Zitate

Zugehörige Datensätze

TACK Targeted Chimera Knowledge Base Dataset

Weltweiter Datensatz Zu Luftverschmutzung Und Luftqualitätsindex

MemLens Multimodal Long Context Benchmark-Datensatz

Brustkrebs: Multimodaler Fusionsdatensatz

QCalEval Quantenkalibrierungsdiagramm – Verständnisdatensatz

KI mit KI entwickeln

HyperAI Newsletters

Zugehörige Datensätze

TACK Targeted Chimera Knowledge Base Dataset

Weltweiter Datensatz Zu Luftverschmutzung Und Luftqualitätsindex

MemLens Multimodal Long Context Benchmark-Datensatz

Brustkrebs: Multimodaler Fusionsdatensatz

QCalEval Quantenkalibrierungsdiagramm – Verständnisdatensatz

Zugehörige Datensätze

TACK Targeted Chimera Knowledge Base Dataset

Weltweiter Datensatz Zu Luftverschmutzung Und Luftqualitätsindex

MemLens Multimodal Long Context Benchmark-Datensatz

Brustkrebs: Multimodaler Fusionsdatensatz

QCalEval Quantenkalibrierungsdiagramm – Verständnisdatensatz