@inproceedings{khan:26047:sign-lang:lrec,
  author    = {Khan, Sarmad and McLoughlin, Simon and Murtagh, Irene},
  title     = {A Comparative Analysis of Traditional and Contemporary Visual Features for Computational Annotation of {Irish} {Sign} {Language}},
  pages     = {239--247},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2026} 12th Workshop on the Representation and Processing of Sign Languages: Language in Motion},
  maintitle = {15th International Conference on Language Resources and Evaluation ({LREC} 2026)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Palma, Mallorca, Spain},
  day       = {16},
  month     = may,
  year      = {2026},
  isbn      = {978-2-493814-82-1},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/26047.html},
  abstract  = {Automatic annotation of sign language data is critical for advancing linguistic research and developing sign language technologies, yet it remains a major bottleneck due to the inherently motion-based and multi-modal nature of signing. Irish Sign Language, like many sign languages, presents challenges for computational annotation and sign language processing due to limited annotated corpora and the inherent difficulty of reliably annotating movement, trajectories, and coarticulation across manual and non-manual articulators. This paper presents an automated computational framework for gloss-level annotation support in Irish Sign Language, designed to assist scalable corpus annotation by learning motion-related cues directly from sign language videos. Using ELAN-aligned segments from the Signs of Ireland Corpus, we compare contemporary self-supervised visual representations with traditional pose-based features derived from explicit skeletal tracking, evaluating three feature configurations: DINOv2, MediaPipe, and multi-modal fusion. Our results show that self-supervised visual embeddings achieve the highest average accuracy (86.12{\%}), outperforming both multi-modal fusion (84.28{\%}) and pose-based representations (76.74{\%}). This indicates that recent visual models can implicitly encode linguistically relevant motion information, including articulator movement and transitional dynamics, reducing the need for explicit landmark extraction in practical annotation pipelines. Overall, this work provides empirical guidance and a deployable computational framework to support the computational annotation and enrichment of sign language corpora.}
}

