% Workshop paper, pp. 408--419 of the LREC-COLING 2024 sign language workshop proceedings.
@inproceedings{zhou:24015:sign-lang:lrec,
  author    = {Zhou, Yang and Xia, Zhaoyang and Chen, Yuxiao and Neidle, Carol and Metaxas, Dimitris},
  title     = {A Multimodal Spatio-Temporal {GCN} Model with Enhancements for Isolated Sign Recognition},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC-COLING} 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources},
  maintitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  publisher = {{ELRA Language Resources Association (ELRA) and the International Committee on Computational Linguistics (ICCL)}},
  address   = {Torino, Italy},
  year      = {2024},
  month     = may,
  day       = {25},
  pages     = {408--419},
  isbn      = {978-2-493814-30-2},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/24015.html},
  abstract  = {We propose a multimodal network using skeletons and handshapes as input to recognize individual signs and detect their boundaries in American Sign Language (ASL) videos. Our method integrates a spatio-temporal Graph Convolutional Network (GCN) architecture to estimate human skeleton keypoints; it uses a late-fusion approach for both forward and backward processing of video streams. Our (core) method is designed for the extraction---and analysis of features from---ASL videos, to enhance accuracy and efficiency of recognition of individual signs. A Gating module based on per-channel multi-layer convolutions is employed to evaluate significant frames for recognition of isolated signs. Additionally, an auxiliary multimodal branch network, integrated with a transformer, is designed to estimate the linguistic start and end frames of an isolated sign within a video clip. We evaluated performance of our approach on multiple datasets that include isolated, citation-form signs and signs pre-segmented from continuous signing based on linguistic annotations of start and end points of signs within sentences. We have achieved very promising results when using both types of sign videos combined for training, with overall sign recognition accuracy of 80.8{\%} Top-1 and 95.2{\%} Top-5 for citation-form signs, and 80.4{\%} Top-1 and 93.0{\%} Top-5 for signs pre-segmented from continuous signing.},
}

% Workshop paper, pp. 202--211 of the LREC 2022 sign language workshop proceedings.
@inproceedings{xia:22038:sign-lang:lrec,
  author    = {Xia, Zhaoyang and Chen, Yuxiao and Zhangli, Qilong and Huenerfauth, Matt and Neidle, Carol and Metaxas, Dimitris},
  title     = {Sign Language Video Anonymization},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2022} 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  year      = {2022},
  month     = jun,
  day       = {25},
  pages     = {202--211},
  isbn      = {979-10-95546-86-3},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/22038.html},
  abstract  = {Deaf signers who wish to communicate in their native language frequently share videos on the Web. However, videos cannot preserve privacy---as is often desirable for discussion of sensitive topics---since both hands and face convey critical linguistic information and therefore cannot be obscured without degrading communication. Deaf signers have expressed interest in video anonymization that would preserve linguistic content. However, attempts to develop such technology have thus far shown limited success. We are developing a new method for such anonymization, with input from ASL signers. We modify a motion-based image animation model to generate high-resolution videos with the signer identity changed, but with the preservation of linguistically significant motions and facial expressions. An asymmetric encoder-decoder structured image generator is used to generate the high-resolution target frame from the low-resolution source frame based on the optical flow and confidence map. We explicitly guide the model to attain a clear generation of hands and faces by using bounding boxes to improve the loss computation. FID and KID scores are used for the evaluation of the realism of the generated frames. This technology shows great potential for practical applications to benefit deaf signers.},
}

