Sign language is an effective non-verbal communication mode for hearing-impaired people. Since video-based sign language detection models require sufficient lighting and clear backgrounds, wearable glove-based sign language models are more robust to poor lighting and occlusion. In this paper, we annotate a new dataset of Word-based Wearable Chinese Sign Language (WW-CSL) gestures. Specifically, we propose a three-form (i.e., sequential sensor data, gesture video, and gesture text) scheme to represent dynamic CSL gestures. Guided by this scheme, we collected a total of 3,000 samples covering 100 word-based CSL gestures. Furthermore, we present a transformer-based baseline model that fuses two inertial measurement units (IMUs) and ten flex sensors for wearable CSL detection. To combine the advantages of video-based and wearable glove-based CSL gestures, we also propose a transformer-based Multi-Modal CSL Detection (MM-CSLD) framework, which integrates the local sequential sensor data from wearable glove-based CSL gestures with the global, fine-grained skeleton representations captured from video-based CSL gestures.
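To make the sensor-fusion baseline concrete, the following is a minimal sketch of one plausible transformer encoder over the glove's sensor streams. It is not the authors' released code: the per-timestep channel counts (six channels per IMU, one per flex sensor), the early fusion by concatenation, the mean pooling, and all hyperparameters are assumptions; only the sensor counts (two IMUs, ten flex sensors) and the 100-class output come from the abstract.

import torch
import torch.nn as nn

class SensorTransformer(nn.Module):
    """Hypothetical transformer baseline for wearable CSL detection."""
    def __init__(self, imu_dim=12, flex_dim=10, d_model=128,
                 n_heads=4, n_layers=4, n_classes=100, max_len=256):
        super().__init__()
        # Project each timestep's concatenated sensor reading
        # (12 assumed IMU channels + 10 flex channels) into d_model.
        self.proj = nn.Linear(imu_dim + flex_dim, d_model)
        # Learned positional embeddings (an assumption; sinusoidal would also work).
        self.pos = nn.Parameter(torch.zeros(1, max_len, d_model))
        layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=n_heads,
                                           batch_first=True)
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.cls = nn.Linear(d_model, n_classes)  # 100 word-based gestures

    def forward(self, imu, flex):
        # imu: (B, T, 12), flex: (B, T, 10) -- early fusion by concatenation.
        x = torch.cat([imu, flex], dim=-1)
        x = self.proj(x) + self.pos[:, :x.size(1)]
        h = self.encoder(x)
        # Temporal average pooling, then classify the word-level gesture.
        return self.cls(h.mean(dim=1))

model = SensorTransformer()
logits = model(torch.randn(8, 100, 12), torch.randn(8, 100, 10))
print(logits.shape)  # torch.Size([8, 100])

Under the same assumptions, the MM-CSLD framework could be sketched by running a second encoder over per-frame skeleton keypoints extracted from the gesture video and fusing the two sequence representations (e.g., by concatenation or cross-attention) before the classifier; the paper itself describes this multimodal integration, while the specific fusion operator here is an illustrative choice.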
@inproceedings{xu-etal-2024-wwcsl:lrec,
author = {Xu, Fan and Liu, Kai and Yang, Yifeng and Yan, Keyu},
title = {WW-CSL: A New Dataset for Word-Based Wearable Chinese Sign Language Detection},
pages = {17718--17724},
editor = {Calzolari, Nicoletta and Kan, Min-Yen and Hoste, Veronique and Lenci, Alessandro and Sakti, Sakriani and Xue, Nianwen},
booktitle = {Proceedings of the 2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
publisher = {{ELRA Language Resources Association (ELRA) and the International Committee on Computational Linguistics (ICCL)}},
address = {Torino, Italy},
day = {20--25},
month = may,
year = {2024},
isbn = {978-2-493814-10-4},
language = {english},
url = {https://aclanthology.org/2024.lrec-main.1541}
}