Entry Saharia:2014:SRP from talip.bib

Last update: Sun Oct 15 02:55:04 MDT 2017                Valid HTML 3.2!

Index sections

Top | Symbols | Numbers | Math | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z

BibTeX entry

@Article{Saharia:2014:SRP,
  author =       "Navanath Saharia and Utpal Sharma and Jugal Kalita",
  title =        "Stemming resource-poor {Indian} languages",
  journal =      j-TALIP,
  volume =       "13",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629670",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Sat Oct 4 06:09:41 MDT 2014",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  abstract =     "Stemming is a basic method for morphological
                 normalization of natural language texts. In this study,
                 we focus on the problem of stemming several
                 resource-poor languages from Eastern India, viz.,
                 Assamese, Bengali, Bishnupriya Manipuri and Bodo. While
                 Assamese, Bengali and Bishnupriya Manipuri are
                 Indo-Aryan, Bodo is a Tibeto-Burman language. We design
                 a rule-based approach to remove suffixes from words. To
                 reduce over-stemming and under-stemming errors, we
                 introduce a dictionary of frequent words. We observe
                 that, for these languages, a dominant number of
                 suffixes are single letters, creating problems during
                 suffix stripping. As a result, we introduce an HMM-based
                 hybrid approach to classify the mis-matched last
                 character. For each word, the stem is extracted by
                 calculating the most probable path in four HMM states.
                 At each step we measure the stemming accuracy for each
                 language. We obtain 94\% accuracy for Assamese and
                 Bengali, and 87\% and 82\% for Bishnupriya Manipuri and
                 Bodo, respectively, using the hybrid approach. We
                 compare our work with Morfessor [Creutz and Lagus
                 2005]. As of now, there is no reported work on stemming
                 for Bishnupriya Manipuri and Bodo. Our results on
                 Assamese and Bengali show significant improvement over
                 prior published work [Sarkar and Bandyopadhyay 2008;
                 Sharma et al. 2002, 2003].",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

Related entries