Looking at lexical frequency and, by extension, lexical variation is often among the first objectives after compiling a sign language corpus, since the only prerequisite is existing sign gloss annotations. However, measuring lexical frequency in a theoretically and statistically meaningful way can be a challenge. In this paper, I provide an overview of how to approach lexical variation in sign language corpora. The aim is to show ways of tackling lexical variation from different angles, from data collection to statistics and visualization, and how to motivate choices based on the data available and the research goals, thus serving as a practical guide for sign language corpus research. Drawing from previous work by different sign language corpus project teams, various approaches to measuring lexical variation are illustrated with data from the Swedish Sign Language (STS) Corpus, with examples that can easily be adapted to any sign language corpus.
@inproceedings{borstell:24026:sign-lang:lrec,
  author    = {B{\"o}rstell, Carl},
  title     = {How to Approach Lexical Variation in Sign Language Corpora},
  pages     = {222--229},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC-COLING} 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources},
  maintitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  publisher = {{ELRA Language Resources Association (ELRA) and the International Committee on Computational Linguistics (ICCL)}},
  address   = {Torino, Italy},
  day       = {25},
  month     = may,
  year      = {2024},
  isbn      = {978-2-493814-30-2},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/24026.pdf},
}