% Workshop paper on integrating Auslan resources into LDaCA.
% "Language Data Commons of Australia" is the name of a network (see abstract),
% so it is brace-protected as a whole to survive sentence-casing styles.
@inproceedings{smith:22017:sign-lang:lrec,
  author    = {Smith, River Tae and Willoughby, Louisa and Johnston, Trevor},
  title     = {Integrating {Auslan} Resources into the {Language Data Commons of Australia}},
  pages     = {181--186},
  editor    = {Efthimiou, Eleni and Fotinea, Stavroula-Evita and Hanke, Thomas and Hochgesang, Julie A. and Kristoffersen, Jette and Mesch, Johanna and Schulder, Marc},
  booktitle = {Proceedings of the {LREC2022} 10th Workshop on the Representation and Processing of Sign Languages: Multilingual Sign Language Resources},
  maintitle = {13th International Conference on Language Resources and Evaluation ({LREC} 2022)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marseille, France},
  day       = {25},
  month     = jun,
  year      = {2022},
  isbn      = {979-10-95546-86-3},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/22017.html},
  abstract  = {This paper describes a project to secure Auslan (Australian Sign Language) resources within a national language data network called the Language Data Commons of Australia (LDaCA). The resources are Auslan Signbank, a web-based multi-media dictionary, and the Auslan Corpus, a collection of video recordings of the language being used in various contexts with time-aligned ELAN annotation files. We aim to make these resources accessible to the language community, encourage community participation in the curation of the data, and facilitate and extend their uses in language teaching and linguistic research. The software platforms of both resources will be made compatible with other LDaCA resources; and the two will also be aggregated and linked so that (i) users of the dictionary can view attested corpus examples for an entry; and (ii) users of the corpus can instantly view the dictionary entry for an already glossed sign to check phonological, lexical and grammatical information about it, and/or to ensure that the correct annotation gloss (aka `ID-gloss') for a sign token has been chosen. This will enhance additions to annotations in the Auslan Corpus, entries in Auslan Signbank and the integrity of research based on both.}
}

% Workshop paper on lemmatisation as a prerequisite for signed language corpora.
% The title is stored in the sentence case used here; it contains no acronyms
% or proper nouns that would need brace protection.
@inproceedings{johnston:08031:sign-lang:lrec,
  author    = {Johnston, Trevor},
  title     = {Corpus linguistics and signed languages: no lemmata, no corpus},
  pages     = {82--87},
  editor    = {Crasborn, Onno and Efthimiou, Eleni and Hanke, Thomas and Thoutenhoofd, Ernst D. and Zwitserlood, Inge},
  booktitle = {Proceedings of the {LREC2008} 3rd Workshop on the Representation and Processing of Sign Languages: Construction and Exploitation of Sign Language Corpora},
  maintitle = {6th International Conference on Language Resources and Evaluation ({LREC} 2008)},
  publisher = {{European Language Resources Association (ELRA)}},
  address   = {Marrakech, Morocco},
  day       = {1},
  month     = jun,
  year      = {2008},
  language  = {english},
  url       = {https://www.sign-lang.uni-hamburg.de/lrec/pub/08031.html},
  abstract  = {A fundamental problem in the creation of signed language corpora is lemmatisation. Lemmatisation---the classification or identification of related word forms under a single label or lemma (the equivalent of headwords or headsigns in a dictionary)---is central to the process of corpus creation. The reason is that signed language corpora---as with all modern linguistic corpora---need to be machine-readable and this means that sign annotations should not only be informed by linguistic theory but also that tags appended to these annotations should be used consistently and systematically. In addition, a corpus must also be well documented (i.e., with accurate and relevant metadata) and representative of the language community (i.e., of relevant registers and sociolinguistic). All this requires dedicated technology (e.g., ELAN), standards and protocols (e.g., IMDI metadata descriptors), and transparent and agreed grammatical tags (e.g., grammatical class labels). However, it also requires the identification of lemmata and this presupposes the unique identification of sign forms. In other words, a successful corpus project presupposes the availability of a reference dictionary or lexical database to facilitate lemma identification and consistency in lemmatisation. Without lemmatisation a collection of recordings with various related appended annotation files will not be able to be used as a true linguistic corpus as the counting, sorting, tagging, etc. of types and tokens is rendered virtually impossible. This presentation draws on the Australian experience of corpus creation to show how a dictionary in the form of a computerized lexical database needs to be created and integrated into any signed language corpus project. Plans for the creation of new signed language corpora will be seriously flawed if they do not take this into account.}
}

