@inproceedings{xia:24014:sign-lang:lrec,
  author    = {Xia, Zhaoyang and Zhou, Yang and Han, Ligong and Neidle, Carol and Metaxas, Dimitris},
  title     = {Diffusion Models for Sign Language Video Anonymization},
  booktitle = {Proceedings of the {LREC-COLING} 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources},
  maintitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  publisher = {{ELRA Language Resources Association (ELRA) and the International Committee on Computational Linguistics (ICCL)}},
  address   = {Torino, Italy},
  year      = {2024},
  month     = may,
  day       = {25},
  pages     = {119--131},
  isbn      = {978-2-493814-30-2},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/24014.html},
  abstract  = {Since American Sign Language (ASL) has no standard written form, Deaf signers frequently share videos in order to communicate in their native language. However, this does not preserve privacy. Since critical linguistic information is transmitted through facial expressions, the face cannot be obscured. While signers have expressed interest, for a variety of applications, in sign language video anonymization that would effectively preserve linguistic content, attempts to develop such technology have had limited success and generally require pose estimation that cannot be readily carried out in the wild. To address current limitations, our research introduces DiffSLVA, a novel methodology that uses pre-trained large-scale diffusion models for text-guided sign language video anonymization. We incorporate ControlNet, which leverages low-level image features such as HED (Holistically-Nested Edge Detection) edges, to circumvent the need for pose estimation. Additionally, we develop a specialized module to capture linguistically essential facial expressions. We then combine the above methods to achieve anonymization that preserves the essential linguistic content of the original signer. This innovative methodology makes possible, for the first time, sign language video anonymization that could be used for real-world applications, which would offer significant benefits to the Deaf and Hard-of-Hearing communities.},
}

@inproceedings{zhou:24015:sign-lang:lrec,
  author    = {Zhou, Yang and Xia, Zhaoyang and Chen, Yuxiao and Neidle, Carol and Metaxas, Dimitris},
  title     = {A Multimodal Spatio-Temporal {GCN} Model with Enhancements for Isolated Sign Recognition},
  booktitle = {Proceedings of the {LREC-COLING} 2024 11th Workshop on the Representation and Processing of Sign Languages: Evaluation of Sign Language Resources},
  maintitle = {2024 Joint International Conference on Computational Linguistics, Language Resources and Evaluation ({LREC-COLING} 2024)},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Mesch, Johanna and Schulder, Marc},
  publisher = {{ELRA Language Resources Association (ELRA) and the International Committee on Computational Linguistics (ICCL)}},
  address   = {Torino, Italy},
  year      = {2024},
  month     = may,
  day       = {25},
  pages     = {132--143},
  isbn      = {978-2-493814-30-2},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/24015.html},
  abstract  = {We propose a multimodal network using skeletons and handshapes as input to recognize individual signs and detect their boundaries in American Sign Language (ASL) videos. Our method integrates a spatio-temporal Graph Convolutional Network (GCN) architecture to estimate human skeleton keypoints; it uses a late-fusion approach for both forward and backward processing of video streams. Our (core) method is designed for the extraction---and analysis of features from---ASL videos, to enhance accuracy and efficiency of recognition of individual signs. A Gating module based on per-channel multi-layer convolutions is employed to evaluate significant frames for recognition of isolated signs. Additionally, an auxiliary multimodal branch network, integrated with a transformer, is designed to estimate the linguistic start and end frames of an isolated sign within a video clip. We evaluated performance of our approach on multiple datasets that include isolated, citation-form signs and signs pre-segmented from continuous signing based on linguistic annotations of start and end points of signs within sentences. We have achieved very promising results when using both types of sign videos combined for training, with overall sign recognition accuracy of 80.8{\%} Top-1 and 95.2{\%} Top-5 for citation-form signs, and 80.4{\%} Top-1 and 93.0{\%} Top-5 for signs pre-segmented from continuous signing.},
}

@inproceedings{neidle:22037:sign-lang:lrec,
  author    = {Neidle, Carol and Opoku, Augustine and Ballard, Carey M. and Dafnis, Konstantinos M. and Chroni, Evgenia and Metaxas, Dimitris},
  title     = {Resources for Computer-Based Sign Recognition from Video, and the Criticality of Consistency of Gloss Labeling across Multiple Large {ASL} Video Corpora},
  booktitle = {Proceedings of the {LREC2022} 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Schulder, Marc},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  year      = {2022},
  month     = jun,
  day       = {25},
  pages     = {165--172},
  isbn      = {979-10-95546-86-3},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/22037.html},
  abstract  = {The WLASL purports to be ``the largest video dataset for Word-Level American Sign Language (ASL) recognition.'' It brings together various publicly shared video collections that could be quite valuable for sign recognition research, and it has been used extensively for such research. However, a critical problem with the accompanying annotations has heretofore not been recognized by the authors, nor by those who have exploited these data: There is no 1-1 correspondence between sign productions and gloss labels. Here we describe a large (and recently expanded and enhanced), linguistically annotated, downloadable, video corpus of citation-form ASL signs shared by the American Sign Language Linguistic Research Project (ASLLRP)---with 23,452 sign tokens and an online Sign Bank---in which such correspondences are enforced. We furthermore provide annotations for 19,672 of the WLASL video examples consistent with ASLLRP glossing conventions. For those wishing to use WLASL videos, this provides a set of annotations that makes it possible: (1) to use those data reliably for computational research; and/or (2) to combine the WLASL and ASLLRP datasets, creating a combined resource that is larger and richer than either of those datasets individually, with consistent gloss labeling for all signs. We also offer a summary of our own sign recognition research to date that exploits these data resources.},
}

@inproceedings{xia:22038:sign-lang:lrec,
  author    = {Xia, Zhaoyang and Chen, Yuxiao and Zhangli, Qilong and Huenerfauth, Matt and Neidle, Carol and Metaxas, Dimitris},
  title     = {Sign Language Video Anonymization},
  booktitle = {Proceedings of the {LREC2022} 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Schulder, Marc},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  year      = {2022},
  month     = jun,
  day       = {25},
  pages     = {202--211},
  isbn      = {979-10-95546-86-3},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/22038.html},
  abstract  = {Deaf signers who wish to communicate in their native language frequently share videos on the Web. However, videos cannot preserve privacy---as is often desirable for discussion of sensitive topics---since both hands and face convey critical linguistic information and therefore cannot be obscured without degrading communication. Deaf signers have expressed interest in video anonymization that would preserve linguistic content. However, attempts to develop such technology have thus far shown limited success. We are developing a new method for such anonymization, with input from ASL signers. We modify a motion-based image animation model to generate high-resolution videos with the signer identity changed, but with the preservation of linguistically significant motions and facial expressions. An asymmetric encoder-decoder structured image generator is used to generate the high-resolution target frame from the low-resolution source frame based on the optical flow and confidence map. We explicitly guide the model to attain a clear generation of hands and faces by using bounding boxes to improve the loss computation. FID and KID scores are used for the evaluation of the realism of the generated frames. This technology shows great potential for practical applications to benefit deaf signers.},
}

@inproceedings{metaxas:18005:sign-lang:lrec,
  author    = {Metaxas, Dimitris and Dilsizian, Mark and Neidle, Carol},
  title     = {Scalable {ASL} Sign Recognition using Model-based Machine Learning and Linguistically Annotated Corpora},
  booktitle = {Proceedings of the {LREC2018} 8th Workshop on the Representation and Processing of Sign Languages: Involving the Language Community},
  maintitle = {11th International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Bono, Mayumi and Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Osugi, Yutaka},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Miyazaki, Japan},
  year      = {2018},
  month     = may,
  day       = {12},
  pages     = {127--132},
  isbn      = {979-10-95546-01-6},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/18005.html},
  abstract  = {We report on the high success rates of our new, scalable, signer-independent, computational approach for sign recognition from monocular video, exploiting linguistically annotated ASL data sets. We recognize signs using a hybrid framework that combines state-of-the-art learning methods with features based on what is known about the linguistic composition of lexical signs. We model and recognize the sub-components of sign production, with attention to hand shape, orientation, location, motion trajectories, as well as facial features, and we combine these within a CRF framework. The effect is to make the sign recognition problem robust, scalable, and feasible with relatively smaller datasets than are required for purely data-driven methods. From a 350-sign vocabulary of isolated, citation-form lexical signs from the American Sign Language Lexicon Video Dataset (ASLLVD), including both 1- and 2-handed signs, we achieve a top-1 accuracy of 93.6{\%} and a top-5 accuracy of 97.9{\%}. The high probability with which we can produce 5 sign candidates that contain the correct result opens the door to potential applications, as it is reasonable to provide a sign lookup functionality that offers the user 5 possible signs, in decreasing order of likelihood, with the user then asked to select the desired sign.},
}

@inproceedings{neidle:18001:sign-lang:lrec,
  author    = {Neidle, Carol and Opoku, Augustine and Dimitriadis, Gregory and Metaxas, Dimitris},
  title     = {New shared {\&} interconnected {ASL} resources: {SignStream{\textregistered}} 3 software; {DAI} 2 for web access to linguistically annotated video corpora; and a sign bank},
  booktitle = {Proceedings of the {LREC2018} 8th Workshop on the Representation and Processing of Sign Languages: Involving the Language Community},
  maintitle = {11th International Conference on Language Resources and Evaluation ({LREC} 2018)},
  editor    = {Bono, Mayumi and Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Osugi, Yutaka},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Miyazaki, Japan},
  year      = {2018},
  month     = may,
  day       = {12},
  pages     = {147--154},
  isbn      = {979-10-95546-01-6},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/18001.html},
  abstract  = {2017 marked the release of a new version of SignStream{\textregistered} software, designed to facilitate linguistic analysis of ASL video. SignStream{\textregistered} provides an intuitive interface for labeling and time-aligning manual and non-manual components of the signing. Version 3 has many new features. For example, it enables representation of morpho-phonological information, including display of handshapes. An expanding ASL video corpus, annotated through use of SignStream{\textregistered}, is shared publicly on the Web. This corpus (video plus annotations) is Web-accessible---browsable, searchable, and downloadable---thanks to a new, improved version of our Data Access Interface: DAI 2. DAI 2 also offers Web access to a brand new Sign Bank, containing about 10,000 examples of about 3,000 distinct signs, as produced by up to 9 different ASL signers. This Sign Bank is also directly accessible from within SignStream{\textregistered}, thereby boosting the efficiency and consistency of annotation; new items can also be added to the Sign Bank. Soon to be integrated into SignStream{\textregistered} 3 and DAI 2 are visualizations of computer-generated analyses of the video: graphical display of eyebrow height, eye aperture, and head position. These resources are publicly available, for linguistic and computational research and for those who use or study ASL.},
}

@inproceedings{dilsizian:16031:sign-lang:lrec,
  author    = {Dilsizian, Mark and Tang, Zhiqiang and Metaxas, Dimitris and Huenerfauth, Matt and Neidle, Carol},
  title     = {The Importance of {3D} Motion Trajectories for Computer-based Sign Recognition},
  booktitle = {Proceedings of the {LREC2016} 7th Workshop on the Representation and Processing of Sign Languages: Corpus Mining},
  maintitle = {10th International Conference on Language Resources and Evaluation ({LREC} 2016)},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Portoro{\v z}, Slovenia},
  year      = {2016},
  month     = may,
  day       = {28},
  pages     = {53--58},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/16031.html},
  abstract  = {Computer-based sign language recognition from video is a challenging problem because of the spatiotemporal complexities inherent in sign production and the variations within and across signers. However, linguistic information can help constrain sign recognition to make it a more feasible classification problem. We have previously explored recognition of linguistically significant 3D hand configurations, as start and end handshapes represent one major component of signs; others include hand orientation, place of articulation in space, and movement. Thus, although recognition of handshapes (on one or both hands) at the start and end of a sign is essential for sign identification, it is not sufficient. Analysis of hand and arm movement trajectories can provide additional information critical for sign identification. In order to test the discriminative potential of the hand motion analysis, we performed sign recognition based exclusively on hand trajectories while holding the handshape constant. To facilitate this evaluation, we captured a collection of videos involving signs with a constant handshape produced by multiple subjects; and we automatically annotated the 3D motion trajectories. 3D hand locations are normalized in accordance with invariant properties of ASL movements. We trained time-series learning-based models for different signs of constant handshape in our dataset using the normalized 3D motion trajectories. Results show significant computer-based sign recognition accuracy across subjects and across a diverse set of signs. Our framework demonstrates the discriminative power and importance of 3D hand motion trajectories for sign recognition, given known handshapes.},
}

@inproceedings{kacorri:16007:sign-lang:lrec,
  author    = {Kacorri, Hernisa and Syed, Ali Raza and Huenerfauth, Matt and Neidle, Carol},
  title     = {Centroid-Based Exemplar Selection of {ASL} Non-Manual Expressions using Multidimensional Dynamic Time Warping and {MPEG4} Features},
  booktitle = {Proceedings of the {LREC2016} 7th Workshop on the Representation and Processing of Sign Languages: Corpus Mining},
  maintitle = {10th International Conference on Language Resources and Evaluation ({LREC} 2016)},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Portoro{\v z}, Slovenia},
  year      = {2016},
  month     = may,
  day       = {28},
  pages     = {105--110},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/16007.html},
  abstract  = {We investigate a method for selecting recordings of human face and head movements from a sign language corpus to serve as a basis for generating animations of novel sentences of American Sign Language (ASL). Drawing from a collection of recordings that have been categorized into various types of non-manual expressions (NMEs), we define a method for selecting an exemplar recording of a given type using a centroid-based selection procedure, using multivariate dynamic time warping (DTW) as the distance function. Through intra- and inter-signer methods of evaluation, we demonstrate the efficacy of this technique, and we note useful potential for the DTW visualizations generated in this study for linguistic researchers collecting and analyzing sign language corpora.},
}

@inproceedings{neidle:14004:sign-lang:lrec,
  author    = {Neidle, Carol and Liu, Jingjing and Liu, Bo and Peng, Xi and Vogler, Christian and Metaxas, Dimitris},
  title     = {Computer-based tracking, analysis, and visualization of linguistically significant non-manual events in {American} {Sign} {Language} ({ASL})},
  booktitle = {Proceedings of the {LREC2014} 6th Workshop on the Representation and Processing of Sign Languages: Beyond the Manual Channel},
  maintitle = {9th International Conference on Language Resources and Evaluation ({LREC} 2014)},
  editor    = {Crasborn, Onno and Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Reykjavik, Iceland},
  year      = {2014},
  month     = may,
  day       = {31},
  pages     = {127--134},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/14004.html},
  abstract  = {Our linguistically annotated American Sign Language (ASL) corpora have formed a basis for research to automate detection by computer of essential linguistic information conveyed through facial expressions and head movements. We have tracked head position and facial deformations, and used computational learning to discern specific grammatical markings. Our ability to detect, identify, and temporally localize the occurrence of such markings in ASL videos has recently been improved by incorporation of (1) new techniques for deformable model-based 3D tracking of head position and facial expressions, which provide significantly better tracking accuracy and recover quickly from temporary loss of track due to occlusion; and (2) a computational learning approach incorporating 2-level Conditional Random Fields (CRFs), suited to the multi-scale spatio-temporal characteristics of the data, which analyses not only low-level appearance characteristics, but also the patterns that enable identification of significant gestural components, such as periodic head movements and raised or lowered eyebrows. Here we summarize our linguistically motivated computational approach and the results for detection and recognition of nonmanual grammatical markings; demonstrate our data visualizations, and discuss the relevance for linguistic research; and describe work underway to enable such visualizations to be produced over large corpora and shared publicly on the Web.},
}

@inproceedings{neidle:12027:sign-lang:lrec,
  author    = {Neidle, Carol and Vogler, Christian},
  title     = {A New Web Interface to Facilitate Access to Corpora: Development of the {ASLLRP} Data Access Interface},
  booktitle = {Proceedings of the {LREC2012} 5th Workshop on the Representation and Processing of Sign Languages: Interactions between Corpus and Lexicon},
  maintitle = {8th International Conference on Language Resources and Evaluation ({LREC} 2012)},
  editor    = {Crasborn, Onno and Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Kristoffersen, Jette and Mesch, Johanna},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Istanbul, Turkey},
  year      = {2012},
  month     = may,
  day       = {27},
  pages     = {137--142},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/12027.html},
  abstract  = {A significant obstacle to broad utilization of corpora is the difficulty in gaining access to the specific subsets of data and annotations that may be relevant for particular types of research. With that in mind, we have developed a web-based Data Access Interface (DAI), to provide access to the expanding datasets of the American Sign Language Linguistic Research Project (ASLLRP). The DAI facilitates browsing the corpora, viewing videos and annotations, searching for phenomena of interest, and downloading selected materials from the website. The web interface, compared to providing videos and annotation files off-line, also greatly increases access by people that have no prior experience in working with linguistic annotation tools, and it opens the door to integrating the data with third-party applications on the desktop and in the mobile space. In this paper we give an overview of the available videos, annotations, and search functionality of the DAI, as well as plans for future enhancements. We also summarize best practices and key lessons learned that are crucial to the success of similar projects.},
}

@inproceedings{neidle:12011:sign-lang:lrec,
  author    = {Neidle, Carol and Thangali, Ashwin and Sclaroff, Stan},
  title     = {Challenges in Development of the {American} {Sign} {Language} Lexicon Video Dataset ({ASLLVD}) Corpus},
  booktitle = {Proceedings of the {LREC2012} 5th Workshop on the Representation and Processing of Sign Languages: Interactions between Corpus and Lexicon},
  maintitle = {8th International Conference on Language Resources and Evaluation ({LREC} 2012)},
  editor    = {Crasborn, Onno and Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Kristoffersen, Jette and Mesch, Johanna},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Istanbul, Turkey},
  year      = {2012},
  month     = may,
  day       = {27},
  pages     = {143--150},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/12011.html},
  abstract  = {The American Sign Language Lexicon Video Dataset (ASLLVD) consists of videos of {\textgreater}3,300 ASL signs in citation form, each produced by 1--6 native ASL signers, for a total of almost 9,800 tokens. This dataset, including multiple synchronized videos showing the signing from different angles, will be shared publicly once the linguistic annotations and verifications are complete. Linguistic annotations include gloss labels, sign start and end time codes, start and end handshape labels for both hands, morphological and articulatory classifications of sign type. For compound signs, the dataset includes annotations for each morpheme. To facilitate computer vision-based sign language recognition, the dataset also includes numeric ID labels for sign variants, video sequences in uncompressed-raw format, camera calibration sequences, and software for skin region extraction. We discuss here some of the challenges involved in the linguistic annotations and categorizations. We also report an example computer vision application that leverages the ASLLVD: the formulation employs a HandShapes Bayesian Network (HSBN), which models the transition probabilities between start and end handshapes in monomorphemic lexical signs. Further details and statistics for the ASLLVD dataset, as well as information about annotation conventions, are available from http://www.bu.edu/asllrp/lexicon.},
}

@inproceedings{athitsos:10022:sign-lang:lrec,
  author    = {Athitsos, Vassilis and Neidle, Carol and Sclaroff, Stan and Nash, Joan and Stefan, Alexandra and Thangali, Ashwin and Wang, Haijing and Yuan, Quan},
  title     = {Large Lexicon Project: {American} {Sign} {Language} Video Corpus and Sign Language Indexing/Retrieval Algorithms},
  booktitle = {Proceedings of the {LREC2010} 4th Workshop on the Representation and Processing of Sign Languages: Corpora and Sign Language Technologies},
  maintitle = {7th International Conference on Language Resources and Evaluation ({LREC} 2010)},
  editor    = {Dreuw, Philippe and Efthimiou, Eleni and Hanke, Thomas and Johnston, Trevor and Mart{\'i}nez Ruiz, Gregorio and Schembri, Adam},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
  day       = {22--23},
  pages     = {11--14},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/10022.html},
  abstract  = {When we encounter a word that we do not understand in a written language, we can look it up in a dictionary. However, looking up the meaning of an unknown sign in American Sign Language (ASL) is not nearly as straightforward. This paper describes progress in an ongoing project aiming to build a computer vision system that helps users look up the meaning of an unknown ASL sign. When a user encounters an unknown ASL sign, the user submits a video of that sign as a query to the system. The system evaluates the similarity between the query and video examples of all signs in the known lexicon, and presents the most similar signs to the user. The user can then look at the retrieved signs and determine if any of them matches the query sign.
\par
An important part of the project is building a video database containing examples of a large number of signs. So far we have recorded at least two video examples for almost all of the 3,000 signs contained in the Gallaudet dictionary. Each video sequence is captured simultaneously from four different cameras, providing two frontal views, a side view, and a view zoomed in on the face of the signer. Our entire video dataset is publicly available on the Web.
\par
Automatic computer vision-based evaluation of similarity between signs is a challenging task. In order to improve accuracy, we manually annotate the hand locations in each frame of each sign in the database. While this is a time-consuming process, this process incurs a one-time preprocessing cost that is invisible to the end-user of the system. At runtime, once the user has submitted the query video, the current version of the system asks the user to specify hand locations in the first frame, and then the system automatically tracks the location of the hands in the rest of the query video. The user can review and correct the hand location results. Every correction that the user makes on a specific frame is used by the system to further improve the hand location estimates in other frames.
\par
Once hand locations have been estimated for the query video, the system evaluates the similarity between the query video and every sign video in the database. Similarity is measured using the Dynamic Time Warping (DTW) algorithm, a well-known algorithm for comparing time series. The performance of the system has been evaluated in experiments where 933 signs from 921 distinct sign classes are used as the dataset of known signs, and 193 signs are used as a test set. In those experiments, only a single frontal view was used for all test and training examples. For 68{\%} of the test signs, the correct sign is included in the 20 most similar signs retrieved by the system.
\par
In ongoing work, we are manually annotating hand locations in the remainder of our collected videos, so as to gradually incorporate more signs into our system. We are also investigating better ways for measuring similarity between signs, and for making the system more automatic, reducing or eliminating the need for the user to manually provide information to the system about hand locations.},
}

@inproceedings{michael:10029:sign-lang:lrec,
  author    = {Michael, Nicholas and Neidle, Carol and Metaxas, Dimitris},
  title     = {Computer-based recognition of facial expressions in {ASL}: from face tracking to linguistic interpretation},
  booktitle = {Proceedings of the {LREC2010} 4th Workshop on the Representation and Processing of Sign Languages: Corpora and Sign Language Technologies},
  maintitle = {7th International Conference on Language Resources and Evaluation ({LREC} 2010)},
  editor    = {Dreuw, Philippe and Efthimiou, Eleni and Hanke, Thomas and Johnston, Trevor and Mart{\'i}nez Ruiz, Gregorio and Schembri, Adam},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Valletta, Malta},
  year      = {2010},
  month     = may,
  day       = {22--23},
  pages     = {164--167},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/10029.html},
  abstract  = {Most research in the field of sign language recognition has focused on the manual component of signing, despite the fact that there is critical grammatical information expressed through facial expressions and head gestures. We, therefore, propose a novel framework for robust tracking and analysis of nonmanual behaviors, with an application to sign language recognition.
\par
Our method uses computer vision techniques to track facial expressions and head movements from video, in order to recognize such linguistically significant expressions. The methods described here have relied crucially on the use of a linguistically annotated video corpus that is being developed, as the annotated video examples have served for training and testing our machine learning models. We apply our framework to continuous recognition of three classes of grammatical expressions, namely wh-questions, negative expressions, and topics.
\par
Our method is signer-independent, utilizing spatial pyramids and Hidden Markov Models (HMMs) to model the temporal variations of facial shape and appearance.},
}

@inproceedings{dafnis:70028:sltat:lrec,
  author    = {Dafnis, Konstantinos M. and Chroni, Evgenia and Neidle, Carol and Metaxas, Dimitris},
  title     = {Isolated Sign Recognition using {ASL} Datasets with Consistent Text-based Gloss Labeling and Curriculum Learning},
  booktitle = {Proceedings of the 7th International Workshop on Sign Language Translation and Avatar Technology: The Junction of the Visual and the Textual: Challenges and Perspectives},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and McDonald, John C. and Shterionov, Dimitar and Wolfe, Rosalee},
  pages     = {13--20},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  year      = {2022},
  month     = jun,
  day       = {24},
  isbn      = {979-10-95546-82-5},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/2022.sltat-1.3.html},
  abstract  = {We present a new approach for isolated sign recognition, which combines a spatial-temporal Graph Convolution Network (GCN) architecture for modeling human skeleton keypoints with late fusion of both the forward and backward video streams, and we explore the use of curriculum learning. We employ a type of curriculum learning that dynamically estimates, during training, the order of difficulty of each input video for sign recognition; this involves learning a new family of data parameters that are dynamically updated during training. The research makes use of a large combined video dataset for American Sign Language (ASL), including data from both the American Sign Language Lexicon Video Dataset (ASLLVD) and the Word-Level American Sign Language (WLASL) dataset, with modified gloss labeling of the latter---to ensure 1-1 correspondence between gloss labels and distinct sign productions, as well as consistency in gloss labeling across the two datasets. This is the first time that these two datasets have been used in combination for isolated sign recognition research. We also compare the sign recognition performance on several different subsets of the combined dataset, varying in, e.g., the minimum number of samples per sign (and therefore also in the total number of sign classes and video examples).},
}

@inproceedings{dafnis-etal-2022-bidirectional:lrec,
  author    = {Dafnis, Konstantinos M. and Chroni, Evgenia and Neidle, Carol and Metaxas, Dimitris},
  title     = {Bidirectional Skeleton-Based Isolated Sign Recognition using Graph Convolution Networks and Transfer Learning},
  booktitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  editor    = {Calzolari, Nicoletta and B{\'e}chet, Fr{\'e}d{\'e}ric and Blache, Philippe and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Odijk, Jan and Piperidis, Stelios},
  pages     = {7328--7338},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  year      = {2022},
  month     = jun,
  day       = {20--25},
  isbn      = {979-10-95546-72-6},
  language  = {english},
  url       = {https://aclanthology.org/2022.lrec-1.797},
  abstract  = {To improve computer-based recognition from video of isolated signs from American Sign Language (ASL), we propose a new skeleton-based method that involves explicit detection of the start and end frames of signs, trained on the ASLLVD dataset; it uses linguistically relevant parameters based on the skeleton input. Our method employs a bidirectional learning approach within a Graph Convolutional Network (GCN) framework. We apply this method to the WLASL dataset, but with corrections to the gloss labeling to ensure consistency in the labels assigned to different signs; it is important to have a 1-1 correspondence between signs and text-based gloss labels. We achieve a success rate of 77.43{\%} for top-1 and 94.54{\%} for top-5 using this modified WLASL dataset. Our method, which does not require multi-modal data input, outperforms other state-of-the-art approaches on the same modified WLASL dataset, demonstrating the importance of both attention to the start and end frames of signs and the use of bidirectional data streams in the GCNs for isolated sign recognition.},
}

@inproceedings{metaxas-etal-2018-linguistically:lrec,
  author    = {Metaxas, Dimitris and Dilsizian, Mark and Neidle, Carol},
  title     = {Linguistically-driven Framework for Computationally Efficient and Scalable Sign Recognition},
  pages     = {1711--1718},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Cieri, Christopher and Declerck, Thierry and Goggi, Sara and Hasida, Koiti and Isahara, Hitoshi and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios and Tokunaga, Takenobu},
  booktitle = {11th International Conference on Language Resources and Evaluation ({LREC} 2018)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Miyazaki, Japan},
  day       = {7--12},
  month     = may,
  year      = {2018},
  isbn      = {979-10-95546-00-9},
  language  = {english},
  url       = {https://aclanthology.org/L18-1271}
}

@inproceedings{yanovich-etal-2016-detection:lrec,
  author    = {Yanovich, Polina and Neidle, Carol and Metaxas, Dimitris},
  title     = {Detection of Major {ASL} Sign Types in Continuous Signing For {ASL} Recognition},
  booktitle = {10th International Conference on Language Resources and Evaluation ({LREC} 2016)},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Goggi, Sara and Grobelnik, Marko and Maegaard, Bente and Mariani, Joseph and Mazo, H{\'e}l{\`e}ne and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  pages     = {3067--3073},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Portoro{\v z}, Slovenia},
  year      = {2016},
  month     = may,
  day       = {23--28},
  isbn      = {978-2-9517408-9-1},
  language  = {english},
  url       = {https://aclanthology.org/L16-1490},
  abstract  = {In American Sign Language (ASL) as well as other signed languages, different classes of signs (e.g., lexical signs, fingerspelled signs, and classifier constructions) have different internal structural properties. Continuous sign recognition accuracy can be improved through use of distinct recognition strategies, as well as different training datasets, for each class of signs. For these strategies to be applied, continuous signing video needs to be segmented into parts corresponding to particular classes of signs. In this paper we present a multiple instance learning-based segmentation system that accurately labels 91.27{\%} of the video frames of 500 continuous utterances (including 7 different subjects) from the publicly accessible NCSLGR corpus (Neidle and Vogler, 2012). The system uses novel feature descriptors derived from both motion and shape statistics of the regions of high local motion. The system does not require a hand tracker.},
}

@inproceedings{dilsizian-etal-2014-new:lrec,
  author    = {Dilsizian, Mark and Yanovich, Polina and Wang, Shu and Neidle, Carol and Metaxas, Dimitris},
  title     = {A New Framework for Sign Language Recognition based on {3D} Handshape Identification and Linguistic Modeling},
  pages     = {1924--1929},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Loftsson, Hrafn and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  booktitle = {9th International Conference on Language Resources and Evaluation ({LREC} 2014)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Reykjavik, Iceland},
  day       = {26--31},
  month     = may,
  year      = {2014},
  isbn      = {978-2-9517408-8-4},
  language  = {english},
  url       = {https://aclanthology.org/L14-1096},
  abstract  = {Current approaches to sign recognition by computer generally have at least some of the following limitations: they rely on laboratory conditions for sign production, are limited to a small vocabulary, rely on 2D modeling (and therefore cannot deal with occlusions and off-plane rotations), and/or achieve limited success. Here we propose a new framework that (1) provides a new tracking method less dependent than others on laboratory conditions and able to deal with variations in background and skin regions (such as the face, forearms, or other hands); (2) allows for identification of 3D hand configurations that are linguistically important in American Sign Language (ASL); and (3) incorporates statistical information reflecting linguistic constraints in sign production. For purposes of large-scale computer-based sign language recognition from video, the ability to distinguish hand configurations accurately is critical. Our current method estimates the 3D hand configuration to distinguish among 77 hand configurations linguistically relevant for ASL. Constraining the problem in this way makes recognition of 3D hand configuration more tractable and provides the information specifically needed for sign recognition. Further improvements are obtained by incorporation of statistical information about linguistic dependencies among handshapes within a sign derived from an annotated corpus of almost 10,000 sign tokens.}
}

@inproceedings{liu-etal-2014-3d:lrec,
  author    = {Liu, Bo and Liu, Jingjing and Yu, Xiang and Metaxas, Dimitris and Neidle, Carol},
  title     = {{3D} Face Tracking and Multi-Scale, Spatio-temporal Analysis of Linguistically Significant Facial Expressions and Head Positions in {ASL}},
  pages     = {4512--4518},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Loftsson, Hrafn and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  booktitle = {9th International Conference on Language Resources and Evaluation ({LREC} 2014)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Reykjavik, Iceland},
  day       = {26--31},
  month     = may,
  year      = {2014},
  isbn      = {978-2-9517408-8-4},
  language  = {english},
  url       = {https://aclanthology.org/L14-1318},
  abstract  = {Essential grammatical information is conveyed in signed languages by clusters of events involving facial expressions and movements of the head and upper body. This poses a significant challenge for computer-based sign language recognition. Here, we present new methods for the recognition of nonmanual grammatical markers in American Sign Language (ASL) based on: (1) new 3D tracking methods for the estimation of 3D head pose and facial expressions to determine the relevant low-level features; (2) methods for higher-level analysis of component events (raised/lowered eyebrows, periodic head nods and head shakes) used in grammatical markings---with differentiation of temporal phases (onset, core, offset, where appropriate), analysis of their characteristic properties, and extraction of corresponding features; (3) a 2-level learning framework to combine low- and high-level features of differing spatio-temporal scales. This new approach achieves significantly better tracking and recognition results than our previous methods.}
}

@inproceedings{metaxas-etal-2012-recognition:lrec,
  author    = {Metaxas, Dimitris and Liu, Bo and Yang, Fei and Yang, Peng and Michael, Nicholas and Neidle, Carol},
  title     = {Recognition of Nonmanual Markers in {American} {Sign} {Language} ({ASL}) Using Non-Parametric Adaptive {2D}-{3D} Face Tracking},
  pages     = {2414--2420},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Do{\u g}an, Mehmet U{\u g}ur and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  booktitle = {8th International Conference on Language Resources and Evaluation ({LREC} 2012)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Istanbul, Turkey},
  day       = {21--27},
  month     = may,
  year      = {2012},
  isbn      = {978-2-9517408-7-7},
  language  = {english},
  url       = {https://aclanthology.org/L12-1123},
  abstract  = {This paper addresses the problem of automatically recognizing linguistically significant nonmanual expressions in American Sign Language from video. We develop a fully automatic system that is able to track facial expressions and head movements, and detect and recognize facial events continuously from video. The main contributions of the proposed framework are the following: (1) We have built a stochastic and adaptive ensemble of face trackers to address factors resulting in lost face track; (2) We combine 2D and 3D deformable face models to warp input frames, thus correcting for any variation in facial appearance resulting from changes in 3D head pose; (3) We use a combination of geometric features and texture features extracted from a canonical frontal representation. The proposed new framework makes it possible to detect grammatically significant nonmanual expressions from continuous signing and to differentiate successfully among linguistically significant expressions that involve subtle differences in appearance. We present results that are based on the use of a dataset containing 330 sentences from videos that were collected and linguistically annotated at Boston University.}
}

@inproceedings{gavrilov-etal-2012-detecting:lrec,
  author    = {Gavrilov, Zoya and Sclaroff, Stan and Neidle, Carol and Dickinson, Sven},
  title     = {Detecting Reduplication in Videos of {American} {Sign} {Language}},
  pages     = {3767--3773},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Declerck, Thierry and Do{\u g}an, Mehmet U{\u g}ur and Maegaard, Bente and Mariani, Joseph and Moreno, Asuncion and Odijk, Jan and Piperidis, Stelios},
  booktitle = {8th International Conference on Language Resources and Evaluation ({LREC} 2012)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Istanbul, Turkey},
  day       = {21--27},
  month     = may,
  year      = {2012},
  isbn      = {978-2-9517408-7-7},
  language  = {english},
  url       = {https://aclanthology.org/L12-1057},
  abstract  = {A framework is proposed for the detection of reduplication in digital videos of American Sign Language (ASL). In ASL, reduplication is used for a variety of linguistic purposes, including overt marking of plurality on nouns, aspectual inflection on verbs, and nominalization of verbal forms. Reduplication involves the repetition, often partial, of the articulation of a sign. In this paper, the apriori algorithm for mining frequent patterns in data streams is adapted for finding reduplication in videos of ASL. The proposed algorithm can account for varying weights on items in the apriori algorithm's input sequence. In addition, the apriori algorithm is extended to allow for inexact matching of similar hand motion subsequences and to provide robustness to noise. The formulation is evaluated on 105 lexical signs produced by two native signers. To demonstrate the formulation, overall hand motion direction and magnitude are considered; however, the formulation should be amenable to combining these features with others, such as hand shape, orientation, and place of articulation.}
}

@inproceedings{dreuw-etal-2008-benchmark:lrec,
  author    = {Dreuw, Philippe and Neidle, Carol and Athitsos, Vassilis and Sclaroff, Stan and Ney, Hermann},
  title     = {Benchmark Databases for Video-Based Automatic Sign Language Recognition},
  booktitle = {6th International Conference on Language Resources and Evaluation ({LREC} 2008)},
  editor    = {Calzolari, Nicoletta and Choukri, Khalid and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios and Tapias, Daniel},
  pages     = {1115--1120},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marrakech, Morocco},
  year      = {2008},
  month     = may,
  day       = {26},
  isbn      = {978-2-9517408-4-6},
  language  = {english},
  url       = {https://aclanthology.org/L08-1469},
  abstract  = {A new, linguistically annotated, video database for automatic sign language recognition is presented. The new RWTH-BOSTON-400 corpus, which consists of 843 sentences, several speakers and separate subsets for training, development, and testing is described in detail. For evaluation and benchmarking of automatic sign language recognition, large corpora are needed. Recent research has focused mainly on isolated sign language recognition methods using video sequences that have been recorded under lab conditions using special hardware like data gloves. Such databases have often consisted generally of only one speaker and thus have been speaker-dependent, and have had only small vocabularies. A new database access interface, which was designed and created to provide fast access to the database statistics and content, makes it possible to easily browse and retrieve particular subsets of the video database. Preliminary baseline results on the new corpora are presented. In contradistinction to other research in this area, all databases presented in this paper will be publicly available.},
}

