% De Sisto et al. (2022): LREC 2022 paper on challenges with sign language
% datasets; the url field points to the ACL Anthology record.
% Fields are ordered: who/what, containing volume, publication details,
% date, identifiers, abstract. `day` is not a classic BibTeX field; styles
% that do not know it are expected to ignore it.
@inproceedings{sisto-etal-2022-challenges:lrec,
  author    = {De Sisto, Mirella and Vandeghinste, Vincent and Egea G{\'o}mez, Santiago and De Coster, Mathieu and Shterionov, Dimitar},
  title     = {Challenges with Sign Language Datasets for Sign Language Recognition and Translation},
  booktitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  editor    = {Calzolari, Nicoletta and B{\'e}chet, Fr{\'e}d{\'e}ric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Odijk, Jan and Piperidis, Stelios},
  pages     = {2478--2487},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  year      = {2022},
  month     = jun,
  day       = {20--25},
  isbn      = {979-10-95546-72-6},
  url       = {https://aclanthology.org/2022.lrec-1.264},
  language  = {english},
  abstract  = {Sign Languages (SLs) are the primary means of communication for at least half a million people in Europe alone. However, the development of SL recognition and translation tools is slowed down by a series of obstacles concerning resource scarcity and standardization issues in the available data. The former challenge relates to the volume of data available for machine learning as well as the time required to collect and process new data. The latter obstacle is linked to the variety of the data, i.e., annotation formats are not unified and vary amongst different resources. The available data formats are often not suitable for machine learning, obstructing the provision of automatic tools based on neural models. In the present paper, we give an overview of these challenges by comparing various SL corpora and SL machine learning datasets. Furthermore, we propose a framework to address the lack of standardization at format level, unify the available resources and facilitate SL research for different languages. Our framework takes ELAN files as inputs and returns textual and visual data ready to train SL recognition and translation models. We present a proof of concept, training neural translation models on the data produced by the proposed framework.}
}

% Masso and Badia (2010): paper in the proceedings of the LREC 2010 4th
% Workshop on the Representation and Processing of Sign Languages.
% `booktitle` holds the workshop proceedings title and `maintitle` the parent
% conference. `maintitle` and `day` are not classic BibTeX fields; styles
% that do not know them are expected to ignore them.
@inproceedings{masso:10004:sign-lang:lrec,
  author    = {Mass{\'o}, Guillem and Badia, Toni},
  title     = {Dealing with Sign Language Morphemes in Statistical Machine Translation},
  booktitle = {Proceedings of the {LREC2010} 4th Workshop on the Representation and Processing of Sign Languages: Corpora and Sign Language Technologies},
  maintitle = {7th International Conference on Language Resources and Evaluation ({LREC} 2010)},
  editor    = {Dreuw, Philippe and Efthimiou, Eleni and Hanke, Thomas and Johnston, Trevor and Mart{\'i}nez Ruiz, Gregorio and Schembri, Adam},
  pages     = {154--157},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
  day       = {22--23},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/10004.html},
  language  = {english},
  abstract  = {The aim of this research is to establish the role of linguistic information in data-scarce statistical machine translation for sign languages using freely available tools. The main challenge in statistical machine translation is the scarcity of suitable data, and this problem becomes more pronounced in sign languages. The available corpora are small, usually not domain-specific, and their annotation conventions can vary considerably. Elaborating our own corpus is a very time-consuming task and the amount of data that we can obtain is even more reduced. Under these conditions, morpho-syntactic information helps to improve statistical machine translation results, but there are not linguistic processing tools for sign languages. We have managed to improve translations from Catalan to Catalan Sign Language by using factored models in an open source translation system with basic linguistic information such as the lemma or an annotation tier tag. Furthermore, this allows us to deal with sign language morphemes in a more systematic way.}
}

