This paper presents a new dataset for Kazakh-Russian Sign Language (KRSL) created for the purposes of Sign Language Processing. In 2020, Kazakhstan's schools were quickly switched to online mode due to the COVID-19 pandemic. Every working day, the El-arna TV channel was broadcasting video lessons for grades from 1 to 11 with sign language translation. This opportunity allowed us to record a corpus with a large vocabulary and spontaneous SL interpretation. To this end, this corpus contains video recordings of Kazakhstan's online school translated to Kazakh-Russian sign language by 7 interpreters. At the moment we collected and cleaned 890 hours of video material. A custom annotation tool was created to make the process of data annotation simple and easy-to-use by the Deaf community. To date, around 325 hours of videos have been annotated with glosses and 4,009 lessons out of 4,547 were transcribed with automatic speech-to-text software. The KRSL-OnlineSchool dataset will be made publicly available at https://krslproject.github.io/online-school/
@inproceedings{mukushev:22031:sign-lang:lrec,
author = {Mukushev, Medet and Kydyrbekova, Aigerim and Kimmelman, Vadim and Sandygulova, Anara},
title = {Towards Large Vocabulary {Kazakh-Russian} {Sign} {Language} Dataset: {KRSL-OnlineSchool}},
pages = {154--158},
editor = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Schulder, Marc},
booktitle = {Proceedings of the {LREC2022} 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources},
maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
publisher = {{European Language Resources Association (ELRA)}},
address = {Marseille, France},
day = {25},
month = jun,
year = {2022},
isbn = {979-10-95546-86-3},
language = {english},
url = {https://www.sign-lang.uni-hamburg.de/lrec/pub/22031.pdf}
}