@InProceedings{athitsos:10022:sign-lang:lrec,
  author = {Athitsos, Vassilis and Neidle, Carol and Sclaroff, Stan and Nash, Joan and Stefan, Alexandra and Thangali, Ashwin and Wang, Haijing and Yuan, Quan},
  title = {Large Lexicon Project: {American} {Sign} {Language} Video Corpus and Sign Language Indexing/Retrieval Algorithms},
  editor = {Dreuw, Philippe and Efthimiou, Eleni and Hanke, Thomas and Johnston, Trevor and Mart{\'i}nez Ruiz, Gregorio and Schembri, Adam},
  booktitle = {Proceedings of the {LREC2010} 4th Workshop on the Representation and Processing of Sign Languages: Corpora and Sign Language Technologies},
  maintitle = {7th International Conference on Language Resources and Evaluation ({LREC} 2010)},
  pages = {11--14},
  publisher = {{European Language Resources Association (ELRA)}},
  address = {Valletta, Malta},
  day = {22--23},
  month = may,
  year = {2010},
  language = {english},
  url = {https://www.sign-lang.uni-hamburg.de/lrec/pub/10022.html},
  abstract = {When we encounter a word that we do not understand in a written language, we can look it up in a dictionary. However, looking up the meaning of an unknown sign in American Sign Language (ASL) is not nearly as straightforward. This paper describes progress in an ongoing project aiming to build a computer vision system that helps users look up the meaning of an unknown ASL sign. When a user encounters an unknown ASL sign, the user submits a video of that sign as a query to the system. The system evaluates the similarity between the query and video examples of all signs in the known lexicon, and presents the most similar signs to the user. The user can then look at the retrieved signs and determine if any of them matches the query sign.
\par
An important part of the project is building a video database containing examples of a large number of signs. So far we have recorded at least two video examples for almost all of the 3,000 signs contained in the Gallaudet dictionary. Each video sequence is captured simultaneously from four different cameras, providing two frontal views, a side view, and a view zoomed in on the face of the signer. Our entire video dataset is publicly available on the Web.
\par
Automatic computer vision-based evaluation of similarity between signs is a challenging task. In order to improve accuracy, we manually annotate the hand locations in each frame of each sign in the database. While this is a time-consuming process, this process incurs a one-time preprocessing cost that is invisible to the end-user of the system. At runtime, once the user has submitted the query video, the current version of the system asks the user to specify hand locations in the first frame, and then the system automatically tracks the location of the hands in the rest of the query video. The user can review and correct the hand location results. Every correction that the user makes on a specific frame is used by the system to further improve the hand location estimates in other frames.
\par
Once hand locations have been estimated for the query video, the system evaluates the similarity between the query video and every sign video in the database. Similarity is measured using the Dynamic Time Warping (DTW) algorithm, a well-known algorithm for comparing time series. The performance of the system has been evaluated in experiments where 933 signs from 921 distinct sign classes are used as the dataset of known signs, and 193 signs are used as a test set. In those experiments, only a single frontal view was used for all test and training examples. For 68{\%} of the test signs, the correct sign is included in the 20 most similar signs retrieved by the system.
\par
In ongoing work, we are manually annotating hand locations in the remainder of our collected videos, so as to gradually incorporate more signs into our system. We are also investigating better ways for measuring similarity between signs, and for making the system more automatic, reducing or eliminating the need for the user to manually provide information to the system about hand locations.},
}

@InProceedings{dreuw-etal-2008-benchmark:lrec,
  author = {Dreuw, Philippe and Neidle, Carol and Athitsos, Vassilis and Sclaroff, Stan and Ney, Hermann},
  title = {Benchmark Databases for Video-Based Automatic Sign Language Recognition},
  editor = {Calzolari, Nicoletta and Choukri, Khalid and Maegaard, Bente and Mariani, Joseph and Odijk, Jan and Piperidis, Stelios and Tapias, Daniel},
  booktitle = {6th International Conference on Language Resources and Evaluation ({LREC} 2008)},
  pages = {1115--1120},
  publisher = {{European Language Resources Association (ELRA)}},
  address = {Marrakech, Morocco},
  day = {26},
  month = may,
  year = {2008},
  isbn = {978-2-9517408-4-6},
  language = {english},
  url = {https://aclanthology.org/L08-1469},
  abstract = {A new, linguistically annotated, video database for automatic sign language recognition is presented. The new RWTH-BOSTON-400 corpus, which consists of 843 sentences, several speakers and separate subsets for training, development, and testing is described in detail. For evaluation and benchmarking of automatic sign language recognition, large corpora are needed. Recent research has focused mainly on isolated sign language recognition methods using video sequences that have been recorded under lab conditions using special hardware like data gloves. Such databases have often consisted generally of only one speaker and thus have been speaker-dependent, and have had only small vocabularies. A new database access interface, which was designed and created to provide fast access to the database statistics and content, makes it possible to easily browse and retrieve particular subsets of the video database. Preliminary baseline results on the new corpora are presented. In contradistinction to other research in this area, all databases presented in this paper will be publicly available.},
}

