% Conference paper (Iberspeech 2012): text normalization for text-to-speech
% built on statistical machine translation.
% Authors are given in "Last, First" form so hyphenated surnames and
% multi-initial given names are split unambiguously.
% Both `year` and the ISO `date` are kept: `year` for styles that read only
% that field, `date` for the full publication day.
@inproceedings{MachineTranslationForNSW2012,
  author    = {López-Ludeña, V. and San-Segundo, R. and Montero, J. M. and Barra-Chicote, R. and Lorenzo, J.},
  title     = {Architecture for Text Normalization using Statistical Machine Translation techniques},
  booktitle = {Proc. Iberspeech 2012},
  year      = {2012},
  date      = {2012-11-07},
  url       = {http://consortium.simple4all.org/files/2012/04/TextNormalizationIS_v3.pdf},
  keywords  = {Abbreviations, Acronyms, language translation, numbers, text normalization, text to speech conversion},
  abstract  = {This paper proposes an architecture, based on statistical machine translation, for developing the text normalization module of a text to speech conversion system. The main target is to generate a language independent text normalization module, based on data and flexible enough to deal with all situations presented in this task. The proposed architecture is composed by three main modules: a tokenizer for splitting the text input into a token graph (tokenization), a phrase-based translation module (token translation) and a post-processing module for removing some tokens. This paper presents initial experiments for numbers and abbreviations. The very good results obtained validates the proposed architecture.},
}

This paper proposes an architecture, based on statistical machine translation, for developing the text normalization module of a text-to-speech conversion system. The main target is to generate a language-independent text normalization module, based on data and flexible enough to deal with all situations presented in this task. The proposed architecture is composed of three main modules: a tokenizer for splitting the text input into a token graph (tokenization), a phrase-based translation module (token translation) and a post-processing module for removing some tokens. This paper presents initial experiments for numbers and abbreviations. The very good results obtained validate the proposed architecture.