Title |
Automatic Acquisition of Parallel Corpora from Websites with Dynamic Content |
Authors |
Yulia Tsvetkov and Shuly Wintner |
Abstract |
Parallel corpora are indispensable resources for a variety of multilingual natural language processing tasks. This paper presents a technique for fully automatic construction of constantly growing parallel corpora. We propose a simple and effective dictionary-based algorithm to extract parallel document pairs from a large collection of articles retrieved from the Internet, potentially containing manually translated texts. This algorithm was implemented and tested on Hebrew-English parallel texts. With properly selected thresholds, precision of 100% can be obtained. |
Language |
Machine Translation, SpeechToSpeech Translation |
Topics |
Corpus (creation, annotation, etc.), Multilinguality, Machine Translation, SpeechToSpeech Translation |
Full paper  |
Automatic Acquisition of Parallel Corpora from Websites with Dynamic Content |
Bibtex |
@inproceedings{TSVETKOV10.40,
  author      = {Tsvetkov, Yulia and Wintner, Shuly},
  title       = {Automatic Acquisition of Parallel Corpora from Websites with Dynamic Content},
  booktitle   = {Proceedings of the Seventh conference on International Language Resources and Evaluation ({LREC'10})},
  editor      = {Calzolari, Nicoletta and Choukri, Khalid and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios and Rosner, Mike and Tapias, Daniel},
  editor-note = {Nicoletta Calzolari is listed as Conference Chair},
  publisher   = {European Language Resources Association (ELRA)},
  address     = {Valletta, Malta},
  year        = {2010},
  month       = may,
  eventdate   = {2010-05-19/2010-05-21},
  isbn        = {2-9517408-6-7},
  language    = {english},
} |