Entry Dolamic:2010:CSI from talip.bib

Last update: Sun Oct 15 02:55:04 MDT 2017                Valid HTML 3.2!

Index sections

Top | Symbols | Numbers | Math | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z

BibTeX entry

%%% Journal article record.  The bare names j-TALIP and ack-nhfb are string
%%% macros that this entry references but does not define; they must be
%%% defined earlier in the bibliography.  The DOI is stored as the bare
%%% identifier (no resolver prefix) so that styles can build the link.
@Article{Dolamic:2010:CSI,
  author =       "Ljiljana Dolamic and Jacques Savoy",
  title =        "Comparative Study of Indexing and Search Strategies
                 for the {Hindi}, {Marathi}, and {Bengali} Languages",
  journal =      j-TALIP,
  volume =       "9",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2010",
  CODEN =        "????",
  DOI =          "10.1145/1838745.1838748",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Sat Sep 18 15:58:58 MDT 2010",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  abstract =     "The main goal of this article is to describe and
                 evaluate various indexing and search strategies for the
                 Hindi, Bengali, and Marathi languages. These three
                 languages are ranked among the world's 20 most spoken
                 languages and they share similar syntax, morphology,
                 and writing systems. In this article we examine these
                 languages from an Information Retrieval (IR)
                 perspective through describing the key elements of
                 their inflectional and derivational morphologies, and
                 suggest a light and more aggressive stemming approach
                 based on them.\par

                 In our evaluation of these stemming strategies we make
                 use of the FIRE 2008 test collections, and then to
                 broaden our comparisons we implement and evaluate two
                 language independent indexing methods: the $n$-gram and
                 trunc-$n$ (truncation of the first $n$ letters). We
                 evaluate these solutions by applying our various IR
                 models, including the Okapi, Divergence from Randomness
                 (DFR) and statistical language models (LM) together
                 with two classical vector-space approaches: {\em tf
                 idf\/} and {\em Lnu-ltc}.\par

                 Experiments performed with all three languages
                 demonstrate that the I(n$_e$)C2 model derived from the
                 Divergence from Randomness paradigm tends to provide
                 the best mean average precision (MAP). Our own tests
                 suggest that improved retrieval effectiveness would be
                 obtained by applying more aggressive stemmers,
                 especially those accounting for certain derivational
                 suffixes, compared to those involving a light stemmer
                 or ignoring this type of word normalization procedure.
                 Comparisons between no stemming and stemming indexing
                 schemes shows that performance differences are almost
                 always statistically significant. When, for example, an
                 aggressive stemmer is applied, the relative
                 improvements obtained are $\approx$28\% for the Hindi
                 language, $\approx$42\% for Marathi, and $\approx$18\%
                 for Bengali, as compared to a no-stemming approach.
                 Based on a comparison of word-based and
                 language-independent approaches we find that the
                 trunc-4 indexing scheme tends to result in performance
                 levels statistically similar to those of an aggressive
                 stemmer, yet better than the 4-gram indexing scheme. A
                 query-by-query analysis reveals the reasons for this,
                 and also demonstrates the advantage of applying a
                 stemming or a trunc-4 indexing scheme.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
  keywords =     "Bengali language; Hindi language; Indic languages;
                 Marathi language; natural language processing with
                 Indo-European languages; search engines for Asian
                 languages; stemmer",
}

Related entries