@inproceedings{czehmann:26064:sign-lang:lrec,
  author    = {Czehmann, Vera and Yazdani, Shakib and Hamidullah, Yasser and Nunnari, Fabrizio and Avramidis, Eleftherios},
  title     = {"A Sacred Bird Called the Phoenix". Auditing the most-used Parallel Corpus for {German} {Sign} {Language} Recognition and Translation},
  pages     = {80--92},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2026} 12th Workshop on the Representation and Processing of Sign Languages: Language in Motion},
  maintitle = {15th International Conference on Language Resources and Evaluation ({LREC} 2026)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Palma, Mallorca, Spain},
  day       = {16},
  month     = may,
  year      = {2026},
  isbn      = {978-2-493814-82-1},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/26064.html},
  abstract  = {This paper presents an empirical audit of the widely used RWTH-PHOENIX-2014T corpus, examining its suitability as a benchmark for sign language recognition and translation. Through human annotation of the training set and extensive sign-to-text back translation of the test set, we provide detailed statistics that indicate substantial quality issues, including information loss and lexical errors. Automatic scores comparing human sign-to-text back translations to the original speech-transcribed references are remarkably low, suggesting strong translationese effects and substantial paraphrasing, revealing limitations of lexical metrics in adequately scoring translation quality. Replacing the original speech-transcribed references with human sign-to-text back translations while scoring existing sign language translation systems reveals the lack of robustness of system evaluation with lexical metrics against this test set. Our findings highlight risks associated with relying on this corpus for model evaluation and call for more rigorous, linguistically grounded evaluation practices in sign language technology research. The back-translated test set and error annotations are made publicly available.}
}

@inproceedings{yazdani-etal-2026-critical:lrec,
  author    = {Yazdani, Shakib and Hamidullah, Yasser and Espa{\~n}a-Bonet, Cristina and Avramidis, Eleftherios and van Genabith, Josef},
  title     = {A Critical Study of Automatic Evaluation in Sign Language Translation},
  pages     = {9535--9548},
  editor    = {Piperidis, Stelios and Bel, N{\'u}ria and van den Heuvel, Henk and Ide, Nancy and Krek, Simon and Toral, Antonio},
  booktitle = {Proceedings of the 15th International Conference on Language Resources and Evaluation ({LREC} 2026)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Palma, Mallorca, Spain},
  day       = {11--16},
  month     = may,
  year      = {2026},
  isbn      = {978-2-493814-49-4},
  language  = {english},
  url       = {https://lrec.elra.info/lrec2026-main-749},
  doi       = {10.63317/4n2sooe4fb2i},
  abstract  = {Automatic evaluation metrics are crucial for advancing sign language translation (SLT). Current SLT evaluation metrics, such as BLEU and ROUGE, are only text-based, and it remains unclear to what extent text-based metrics can reliably capture the quality of SLT outputs. To address this gap, we investigate the limitations of text-based SLT evaluation metrics by analyzing six metrics, including BLEU, chrF, and ROUGE, as well as BLEURT on the one hand, and large language model (LLM)-based evaluators such as G-Eval and GEMBA zero-shot direct assessment on the other hand. Specifically, we assess the consistency and robustness of these metrics under three controlled conditions: paraphrasing, hallucinations in model outputs, and variations in sentence length. Our analysis highlights the limitations of lexical overlap metrics and demonstrates that while LLM-based evaluators better capture semantic equivalence often missed by conventional metrics, they can also exhibit bias toward LLM-paraphrased translations. Moreover, although all metrics are able to detect hallucinations, BLEU tends to be overly sensitive, whereas BLEURT and LLM-based evaluators are comparatively lenient toward subtle cases. This motivates the need for multimodal evaluation frameworks that extend beyond text-based metrics to enable a more holistic assessment of SLT outputs.}
}
