@inproceedings{chan:26055:sign-lang:lrec,
  author    = {Chan, Frederick and Levow, Gina-Anne and Cheng, Qi},
  title     = {A Small Model for Big Articulators: Sign Language Detection With a Tiny Machine Learning Model},
  pages     = {71--79},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2026} 12th Workshop on the Representation and Processing of Sign Languages: Language in Motion},
  maintitle = {15th International Conference on Language Resources and Evaluation ({LREC} 2026)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Palma, Mallorca, Spain},
  day       = {16},
  month     = may,
  year      = {2026},
  isbn      = {978-2-493814-82-1},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/26055.html},
  abstract  = {This paper introduces a small (1,013-parameter) machine learning model for sign language detection in videos of isolated American Sign Language (ASL) signs. Our model aims to alleviate the time-consuming nature of producing sign clips for psycholinguistic study stimuli, sign dictionaries, and sign databases. Given a video where the signer starts from a resting position, produces a sign, and returns to the resting position for an arbitrary number of repetitions, the model detects the frames in which signing occurs, which can then be used to segment the video into clips of individual signs. We train and evaluate our model on data with precise coding of signing onset and offset from ASL-LEX 2.0, so that our model's annotations are suitable for psycholinguistic research. The model works on both real signs and pseudosigns, two types of stimuli needed for certain psycholinguistic studies. Our model's small size compared to the state of the art (100K parameters or more) enables quick, bulk processing even on resource-constrained hardware. It achieves this by computing Instantaneous Visual Change (IVC), a 1D measure of changes in brightness in the input video, extracting features from the IVC-over-time signal with a convolution, and classifying the video frames as signing or non-signing with three neural layers.}
}
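
The abstract's final sentence outlines the detection pipeline: a 1D IVC signal computed from the video, convolutional feature extraction over that signal, and a three-layer classifier. As a minimal, hedged sketch (not the authors' implementation), the snippet below shows one plausible way such an IVC-style brightness-change signal could be computed; the function name ivc_signal and the use of OpenCV and NumPy are illustrative assumptions, and the paper's exact IVC definition may differ. BibTeX ignores this text, as it lies outside any entry.

# Hedged sketch: one plausible reading of an "Instantaneous Visual
# Change" (IVC) signal -- the mean absolute per-pixel brightness
# difference between consecutive frames, giving a 1D value per frame
# pair. This is an illustrative assumption, not the paper's method.
import cv2
import numpy as np

def ivc_signal(video_path: str) -> np.ndarray:
    """Return a 1D array of brightness-change values, one per frame pair."""
    cap = cv2.VideoCapture(video_path)
    values = []
    prev = None
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        # Reduce each frame to grayscale brightness; float avoids
        # uint8 wraparound when differencing.
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY).astype(np.float32)
        if prev is not None:
            values.append(float(np.abs(gray - prev).mean()))
        prev = gray
    cap.release()
    return np.asarray(values, dtype=np.float32)

A downstream classifier like the one the abstract describes would slide a 1D convolution over this signal to extract temporal features before labeling each frame as signing or non-signing.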

