% Wang and Bono (2026): paper in the proceedings of the LREC 2026 12th Workshop
% on the Representation and Processing of Sign Languages (pages 491--500).
@inproceedings{wang:26054:sign-lang:lrec,
  author    = {Wang, Zirui and Bono, Mayumi},
  title     = {Beyond {BLEU}: Linguistic Invisibility and Interactional Repair Sequence in End-to-End Sign Language Translation},
  pages     = {491--500},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2026} 12th Workshop on the Representation and Processing of Sign Languages: Language in Motion},
  maintitle = {15th International Conference on Language Resources and Evaluation ({LREC} 2026)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Palma, Mallorca, Spain},
  day       = {16},
  month     = may,
  year      = {2026},
  isbn      = {978-2-493814-82-1},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/26054.html},
  abstract  = {Recent advances in end-to-end sign language translation (SLT) have achieved benchmark performance, yet little is known about whether these systems preserve the multi-channel linguistic structures that are essential for real-world communication. We argue that current optimization and evaluation practices create a form of linguistic invisibility, where interactionally decisive non-manual signals (NMS) are systematically underrepresented despite high translation scores. To empirically examine this issue, we analyze an interactional repair sequence from a Japanese Sign Language (JSL) conversational corpus as a diagnostic probe. Combining qualitative interactional analysis with kinematic measurements, we demonstrate a consistent manual--mouth decoupling pattern in which semantic resolution is carried primarily by mouthing while manual articulation remains largely constant. We show that such cross-channel contrast is unlikely to be preserved under current end-to-end training objectives that prioritize global motion similarity. Based on these findings, we argue that progress in SLT should be evaluated not only by sequence-level accuracy but also by the preservation of linguistically contrastive structures, motivating the development of diagnostic, multi-channel evaluation protocols for future SLT benchmarks. We therefore propose incorporating multi-channel diagnostic evaluation sets and decoupling-sensitive metrics into future SLT benchmarking frameworks, providing a pathway toward models that achieve both high performance and linguistic structural visibility.}
}

