We propose best practices for gloss annotation of sign languages taking into account the needs of data-driven approaches to recognition and translation of natural languages. Furthermore, we provide reference numbers for several technical aspects for the creation of new sign language data collections. Most available sign language data collections are of limited use to data-driven approaches, because they focus on rare sign language phenomena, or lack machine-readable annotation schemes. Using a natural language processing point of view, we briefly discuss several sign language data collections, propose best practices for gloss annotation stemming from experience gained using two large-scale sign language data collections, and derive reference numbers for several technical aspects from standard benchmark data collections for speech recognition and translation.
% Workshop paper: `booktitle` names the workshop proceedings, `maintitle` the host conference.
@inproceedings{forster:10038:sign-lang:lrec,
  author    = {Forster, Jens and Stein, Daniel and Ormel, Ellen and Crasborn, Onno and Ney, Hermann},
  title     = {Best Practice for Sign Language Data Collections Regarding the Needs of Data-Driven Recognition and Translation},
  editor    = {Dreuw, Philippe and Efthimiou, Eleni and Hanke, Thomas and Johnston, Trevor and Mart{\'i}nez Ruiz, Gregorio and Schembri, Adam},
  booktitle = {Proceedings of the {LREC2010} 4th Workshop on the Representation and Processing of Sign Languages: Corpora and Sign Language Technologies},
  maintitle = {7th International Conference on Language Resources and Evaluation ({LREC} 2010)},
  pages     = {92--97},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Valletta, Malta},
  day       = {22--23},
  month     = may,
  year      = {2010},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/10038.pdf},
}