@inproceedings{xia:24014:sign-lang:lrec,
  author    = {Xia, Zhaoyang and Zhou, Yang and Han, Ligong and Neidle, Carol and Metaxas, Dimitris},
  title     = {Diffusion Models for Sign Language Video Anonymization},
  pages     = {119--131},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC-COLING} 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources},
  maintitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  publisher = {{ELRA Language Resources Association (ELRA) and the International Committee on Computational Linguistics (ICCL)}},
  address   = {Torino, Italy},
  day       = {25},
  month     = may,
  year      = {2024},
  isbn      = {978-2-493814-30-2},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/24014.html},
  abstract  = {Since American Sign Language (ASL) has no standard written form, Deaf signers frequently share videos in order to communicate in their native language. However, this does not preserve privacy. Since critical linguistic information is transmitted through facial expressions, the face cannot be obscured. While signers have expressed interest, for a variety of applications, in sign language video anonymization that would effectively preserve linguistic content, attempts to develop such technology have had limited success and generally require pose estimation that cannot be readily carried out in the wild. To address current limitations, our research introduces DiffSLVA, a novel methodology that uses pre-trained large-scale diffusion models for text-guided sign language video anonymization. We incorporate ControlNet, which leverages low-level image features such as HED (Holistically-Nested Edge Detection) edges, to circumvent the need for pose estimation. Additionally, we develop a specialized module to capture linguistically essential facial expressions. We then combine the above methods to achieve anonymization that preserves the essential linguistic content of the original signer. This innovative methodology makes possible, for the first time, sign language video anonymization that could be used for real-world applications, which would offer significant benefits to the Deaf and Hard-of-Hearing communities.}
}

@inproceedings{zhou:24015:sign-lang:lrec,
  author    = {Zhou, Yang and Xia, Zhaoyang and Chen, Yuxiao and Neidle, Carol and Metaxas, Dimitris},
  title     = {A Multimodal Spatio-Temporal {GCN} Model with Enhancements for Isolated Sign Recognition},
  pages     = {132--143},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC-COLING} 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources},
  maintitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  publisher = {{ELRA Language Resources Association (ELRA) and the International Committee on Computational Linguistics (ICCL)}},
  address   = {Torino, Italy},
  day       = {25},
  month     = may,
  year      = {2024},
  isbn      = {978-2-493814-30-2},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/24015.html},
  abstract  = {We propose a multimodal network using skeletons and handshapes as input to recognize individual signs and detect their boundaries in American Sign Language (ASL) videos. Our method integrates a spatio-temporal Graph Convolutional Network (GCN) architecture to estimate human skeleton keypoints; it uses a late-fusion approach for both forward and backward processing of video streams. Our (core) method is designed for the extraction---and analysis of features from---ASL videos, to enhance accuracy and efficiency of recognition of individual signs. A Gating module based on per-channel multi-layer convolutions is employed to evaluate significant frames for recognition of isolated signs. Additionally, an auxiliary multimodal branch network, integrated with a transformer, is designed to estimate the linguistic start and end frames of an isolated sign within a video clip. We evaluated performance of our approach on multiple datasets that include isolated, citation-form signs and signs pre-segmented from continuous signing based on linguistic annotations of start and end points of signs within sentences. We have achieved very promising results when using both types of sign videos combined for training, with overall sign recognition accuracy of 80.8{\%} Top-1 and 95.2{\%} Top-5 for citation-form signs, and 80.4{\%} Top-1 and 93.0{\%} Top-5 for signs pre-segmented from continuous signing.}
}

@inproceedings{neidle:22037:sign-lang:lrec,
  author    = {Neidle, Carol and Opoku, Augustine and Ballard, Carey M. and Dafnis, Konstantinos M. and Chroni, Evgenia and Metaxas, Dimitris},
  title     = {Resources for Computer-Based Sign Recognition from Video, and the Criticality of Consistency of Gloss Labeling across Multiple Large {ASL} Video Corpora},
  pages     = {165--172},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2022} 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  day       = {25},
  month     = jun,
  year      = {2022},
  isbn      = {979-10-95546-86-3},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/22037.html},
  abstract  = {The WLASL purports to be ``the largest video dataset for Word-Level American Sign Language (ASL) recognition.'' It brings together various publicly shared video collections that could be quite valuable for sign recognition research, and it has been used extensively for such research. However, a critical problem with the accompanying annotations has heretofore not been recognized by the authors, nor by those who have exploited these data: There is no 1-1 correspondence between sign productions and gloss labels. Here we describe a large (and recently expanded and enhanced), linguistically annotated, downloadable, video corpus of citation-form ASL signs shared by the American Sign Language Linguistic Research Project (ASLLRP)---with 23,452 sign tokens and an online Sign Bank---in which such correspondences are enforced. We furthermore provide annotations for 19,672 of the WLASL video examples consistent with ASLLRP glossing conventions. For those wishing to use WLASL videos, this provides a set of annotations that makes it possible: (1) to use those data reliably for computational research; and/or (2) to combine the WLASL and ASLLRP datasets, creating a combined resource that is larger and richer than either of those datasets individually, with consistent gloss labeling for all signs. We also offer a summary of our own sign recognition research to date that exploits these data resources.}
}

@inproceedings{xia:22038:sign-lang:lrec,
  author    = {Xia, Zhaoyang and Chen, Yuxiao and Zhangli, Qilong and Huenerfauth, Matt and Neidle, Carol and Metaxas, Dimitris},
  title     = {Sign Language Video Anonymization},
  pages     = {202--211},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2022} 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  day       = {25},
  month     = jun,
  year      = {2022},
  isbn      = {979-10-95546-86-3},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/22038.html},
  abstract  = {Deaf signers who wish to communicate in their native language frequently share videos on the Web. However, videos cannot preserve privacy---as is often desirable for discussion of sensitive topics---since both hands and face convey critical linguistic information and therefore cannot be obscured without degrading communication. Deaf signers have expressed interest in video anonymization that would preserve linguistic content. However, attempts to develop such technology have thus far shown limited success. We are developing a new method for such anonymization, with input from ASL signers. We modify a motion-based image animation model to generate high-resolution videos with the signer identity changed, but with the preservation of linguistically significant motions and facial expressions. An asymmetric encoder-decoder structured image generator is used to generate the high-resolution target frame from the low-resolution source frame based on the optical flow and confidence map. We explicitly guide the model to attain a clear generation of hands and faces by using bounding boxes to improve the loss computation. FID and KID scores are used for the evaluation of the realism of the generated frames. This technology shows great potential for practical applications to benefit deaf signers.}
}

@inproceedings{dafnis:70028:sltat:lrec,
  author    = {Dafnis, Konstantinos M. and Chroni, Evgenia and Neidle, Carol and Metaxas, Dimitris},
  title     = {Isolated Sign Recognition using {ASL} Datasets with Consistent Text-based Gloss Labeling and Curriculum Learning},
  pages     = {13--20},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and McDonald, John C. and Shterionov, Dimitar and Wolfe, Rosalee},
  booktitle = {Proceedings of the 7th International Workshop on Sign Language Translation and Avatar Technology: The Junction of the Visual and the Textual: Challenges and Perspectives},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  day       = {24},
  month     = jun,
  year      = {2022},
  isbn      = {979-10-95546-82-5},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/2022.sltat-1.3.html},
  abstract  = {We present a new approach for isolated sign recognition, which combines a spatial-temporal Graph Convolution Network (GCN) architecture for modeling human skeleton keypoints with late fusion of both the forward and backward video streams, and we explore the use of curriculum learning. We employ a type of curriculum learning that dynamically estimates, during training, the order of difficulty of each input video for sign recognition; this involves learning a new family of data parameters that are dynamically updated during training. The research makes use of a large combined video dataset for American Sign Language (ASL), including data from both the American Sign Language Lexicon Video Dataset (ASLLVD) and the Word-Level American Sign Language (WLASL) dataset, with modified gloss labeling of the latter---to ensure 1-1 correspondence between gloss labels and distinct sign productions, as well as consistency in gloss labeling across the two datasets. This is the first time that these two datasets have been used in combination for isolated sign recognition research. We also compare the sign recognition performance on several different subsets of the combined dataset, varying in, e.g., the minimum number of samples per sign (and therefore also in the total number of sign classes and video examples).}
}

@inproceedings{saenz:70016:sltat:lrec,
  author    = {Saenz, Maria Del Carmen},
  title     = {Mouthing Recognition with {OpenPose} in Sign Language},
  pages     = {91--94},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and McDonald, John C. and Shterionov, Dimitar and Wolfe, Rosalee},
  booktitle = {Proceedings of the 7th International Workshop on Sign Language Translation and Avatar Technology: The Junction of the Visual and the Textual: Challenges and Perspectives},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  day       = {24},
  month     = jun,
  year      = {2022},
  isbn      = {979-10-95546-82-5},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/2022.sltat-1.14.html},
  abstract  = {Many avatars focus on the hands and how they express sign language. However, sign language also uses mouth and face gestures to modify verbs, adjectives, or adverbs; these are known as non-manual components of the sign. To have a translation system that the Deaf community will accept, we need to include these non-manual signs. Just as machine learning is being used on generating hand signs, the work we are focusing on will be doing the same, but with mouthing and mouth gestures. We will be using data from The National Center for Sign Language and Gesture Resources. The data from the center are videos of native signers focusing on different areas of signer movement, gesturing, and mouthing, and are annotated specifically for mouthing studies. With this data, we will run a pre-trained Neural Network application called OpenPose. After running through OpenPose, further analysis of the data is conducted using a Random Forest Classifier. This research looks at how well an algorithm can be trained to spot certain mouthing points and output the mouth annotations with a high degree of accuracy. With this, the appropriate mouthing for animated signs can be easily applied to avatar technologies.}
}

@inproceedings{dafnis-etal-2022-bidirectional:lrec,
  author    = {Dafnis, Konstantinos M. and Chroni, Evgenia and Neidle, Carol and Metaxas, Dimitris},
  title     = {Bidirectional Skeleton-Based Isolated Sign Recognition using Graph Convolution Networks and Transfer Learning},
  pages     = {7328--7338},
  editor    = {Calzolari, Nicoletta and B{\'e}chet, Fr{\'e}d{\'e}ric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Odijk, Jan and Piperidis, Stelios},
  booktitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  day       = {20--25},
  month     = jun,
  year      = {2022},
  isbn      = {979-10-95546-72-6},
  language  = {english},
  url       = {https://aclanthology.org/2022.lrec-1.797},
  abstract  = {To improve computer-based recognition from video of isolated signs from American Sign Language (ASL), we propose a new skeleton-based method that involves explicit detection of the start and end frames of signs, trained on the ASLLVD dataset; it uses linguistically relevant parameters based on the skeleton input. Our method employs a bidirectional learning approach within a Graph Convolutional Network (GCN) framework. We apply this method to the WLASL dataset, but with corrections to the gloss labeling to ensure consistency in the labels assigned to different signs; it is important to have a 1-1 correspondence between signs and text-based gloss labels. We achieve a success rate of 77.43{\%} for top-1 and 94.54{\%} for top-5 using this modified WLASL dataset. Our method, which does not require multi-modal data input, outperforms other state-of-the-art approaches on the same modified WLASL dataset, demonstrating the importance of both attention to the start and end frames of signs and the use of bidirectional data streams in the GCNs for isolated sign recognition.}
}

@inproceedings{kacorri:16007:sign-lang:lrec,
  author    = {Kacorri, Hernisa and Syed, Ali Raza and Huenerfauth, Matt and Neidle, Carol},
  title     = {Centroid-Based Exemplar Selection of {ASL} Non-Manual Expressions using Multidimensional Dynamic Time Warping and {MPEG4} Features},
  pages     = {105--110},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna},
  booktitle = {Proceedings of the {LREC2016} 7th Workshop on the Representation and Processing of Sign Languages: Corpus Mining},
  maintitle = {10th International Conference on Language Resources and Evaluation ({LREC} 2016)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Portoro{\v z}, Slovenia},
  day       = {28},
  month     = may,
  year      = {2016},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/16007.html},
  abstract  = {We investigate a method for selecting recordings of human face and head movements from a sign language corpus to serve as a basis for generating animations of novel sentences of American Sign Language (ASL). Drawing from a collection of recordings that have been categorized into various types of non-manual expressions (NMEs), we define a method for selecting an exemplar recording of a given type using a centroid-based selection procedure, using multivariate dynamic time warping (DTW) as the distance function. Through intra- and inter-signer methods of evaluation, we demonstrate the efficacy of this technique, and we note useful potential for the DTW visualizations generated in this study for linguistic researchers collecting and analyzing sign language corpora.}
}

@inproceedings{wolfe-etal-2014-expanding:lrec,
  author    = {Wolfe, Rosalee and McDonald, John C. and Berke, Larwan and Stumbo, Marie},
  title     = {Expanding n-gram analytics in {ELAN} and a case study for sign synthesis},
  pages     = {1880--1885},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Loftsson, Hrafn and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  booktitle = {9th International Conference on Language Resources and Evaluation ({LREC} 2014)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Reykjavik, Iceland},
  day       = {26--31},
  month     = may,
  year      = {2014},
  isbn      = {978-2-9517408-8-4},
  language  = {english},
  url       = {https://aclanthology.org/L14-1484},
  abstract  = {Corpus analysis is a powerful tool for signed language synthesis. A new extension to ELAN offers expanded n-gram analysis tools including improved search capabilities and an extensive library of statistical measures of association for n-grams. Uncovering and exploring coarticulatory timing effects via corpus analysis requires n-gram analysis to discover the most frequently occurring bigrams. This paper presents an overview of the new tools and a case study in American Sign Language synthesis that exploits these capabilities for computing more natural timing in generated sentences. The new extension provides a time-saving convenience for language researchers using ELAN.}
}

@inproceedings{liu-etal-2014-3d:lrec,
  author    = {Liu, Bo and Liu, Jingjing and Yu, Xiang and Metaxas, Dimitris and Neidle, Carol},
  title     = {3{D} Face Tracking and Multi-Scale, Spatio-temporal Analysis of Linguistically Significant Facial Expressions and Head Positions in {ASL}},
  pages     = {4512--4518},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Loftsson, Hrafn and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  booktitle = {9th International Conference on Language Resources and Evaluation ({LREC} 2014)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Reykjavik, Iceland},
  day       = {26--31},
  month     = may,
  year      = {2014},
  isbn      = {978-2-9517408-8-4},
  language  = {english},
  url       = {https://aclanthology.org/L14-1318},
  abstract  = {Essential grammatical information is conveyed in signed languages by clusters of events involving facial expressions and movements of the head and upper body. This poses a significant challenge for computer-based sign language recognition. Here, we present new methods for the recognition of nonmanual grammatical markers in American Sign Language (ASL) based on: (1) new 3D tracking methods for the estimation of 3D head pose and facial expressions to determine the relevant low-level features; (2) methods for higher-level analysis of component events (raised/lowered eyebrows, periodic head nods and head shakes) used in grammatical markings---with differentiation of temporal phases (onset, core, offset, where appropriate), analysis of their characteristic properties, and extraction of corresponding features; (3) a 2-level learning framework to combine low- and high-level features of differing spatio-temporal scales. This new approach achieves significantly better tracking and recognition results than our previous methods.}
}

@inproceedings{wolfe:12006:sign-lang:lrec,
  author    = {Wolfe, Rosalee and McDonald, John C. and Toro, Jorge and Schnepp, Jerry},
  title     = {A Proposal for Making Corpora More Accessible for Synthesis: A Case Study Involving Pointing and Agreement Verbs},
  pages     = {163--166},
  editor    = {Crasborn, Onno and Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Kristoffersen, Jette and Mesch, Johanna},
  booktitle = {Proceedings of the {LREC2012} 5th Workshop on the Representation and Processing of Sign Languages: Interactions between Corpus and Lexicon},
  maintitle = {8th International Conference on Language Resources and Evaluation ({LREC} 2012)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Istanbul, Turkey},
  day       = {27},
  month     = may,
  year      = {2012},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/12006.html},
  abstract  = {Sign language corpora serve many purposes, including linguistic analysis, curation of endangered languages, and evaluation of linguistic theories. They also have the potential to serve as an invaluable resource for improving sign language synthesis. Making corpora more accessible for synthesis requires geometric as well as linguistic data. We explore alternate approaches and analyze the tradeoffs for the case of synthesizing indexing and agreement verbs. We conclude with a series of questions exploring the feasibility of utilizing corpora for synthesis.}
}

