In this work we present methods for the automatic estimation of non-manual gestures in sign language videos. More specifically, we study the estimation of three head pose angles (yaw, pitch, roll) and of the state of facial elements (eyebrow position, eye openness, and mouth state). This kind of estimation facilitates the automatic annotation of sign language videos and promotes more prolific production of annotated sign language corpora. The proposed estimation methods are incorporated in our publicly available SLMotion software package for sign language video processing and analysis. Our method implements a model-based approach: for head pose we employ facial landmarks and skin masks as features, estimating the yaw and pitch angles by regression and the roll angle with a geometric measure; for the state of facial elements we use geometric information about the facial elements as features and estimate quantized states with a classification algorithm. We evaluate the proposed methods in both quantitative and qualitative experiments.
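As a concrete illustration of the geometric roll estimate, the sketch below derives a head roll angle from two eye-centre landmarks. The abstract only states that roll is obtained with a geometric measure over facial landmarks; the specific measure used here (the inclination of the inter-ocular line) and the function estimate_roll are assumptions for illustration, not SLMotion's actual implementation.

    # Illustrative sketch only: roll approximated as the inclination of the
    # line connecting the two eye-centre landmarks (an assumed geometric
    # measure, not necessarily the one used in SLMotion).
    import math

    def estimate_roll(left_eye, right_eye):
        """Return roll in degrees from two (x, y) eye-centre landmarks.

        Image coordinates are assumed, with the y axis pointing down, so a
        positive angle means the right eye sits lower than the left eye.
        """
        dx = right_eye[0] - left_eye[0]
        dy = right_eye[1] - left_eye[1]
        return math.degrees(math.atan2(dy, dx))

    if __name__ == "__main__":
        # Example: eyes 60 px apart, right eye 6 px lower -> roll of about 5.7 degrees
        print(round(estimate_roll((100.0, 120.0), (160.0, 126.0)), 1))

The yaw and pitch angles, by contrast, are estimated by regression over the landmark and skin-mask features, so they cannot be reduced to a single closed-form expression like the one above.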
@inproceedings{luzardo:14021:sign-lang:lrec,
author = {Luzardo, Marcos and Viitaniemi, Ville and Karppa, Matti and Laaksonen, Jorma and Jantunen, Tommi},
title = {Estimating head pose and state of facial elements for sign language video},
pages = {105--112},
editor = {Crasborn, Onno and Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna},
booktitle = {Proceedings of the {LREC2014} 6th Workshop on the Representation and Processing of Sign Languages: Beyond the Manual Channel},
maintitle = {9th International Conference on Language Resources and Evaluation ({LREC} 2014)},
publisher = {{European Language Resources Association (ELRA)}},
address = {Reykjavik, Iceland},
day = {31},
month = may,
year = {2014},
language = {english},
url = {https://www.sign-lang.uni-hamburg.de/lrec/pub/14021.pdf}
}