Entry Fukunishi:2013:BAA from talip.bib

Last update: Sun Oct 15 02:55:04 MDT 2017                Valid HTML 3.2!

Index sections

Top | Symbols | Numbers | Math | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z

BibTeX entry

@Article{Fukunishi:2013:BAA,
  author =       "Takaaki Fukunishi and Andrew Finch and Seiichi
                 Yamamoto and Eiichiro Sumita",
  title =        "A {Bayesian} Alignment Approach to Transliteration
                 Mining",
  journal =      j-TALIP,
  volume =       "12",
  number =       "3",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499955.2499957",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Mon Aug 19 18:39:55 MDT 2013",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  abstract =     "In this article we present a technique for mining
                 transliteration pairs using a set of simple features
                 derived from a many-to-many bilingual forced-alignment
                 at the grapheme level to classify candidate
                 transliteration word pairs as correct transliterations
                 or not. We use a nonparametric Bayesian method for the
                 alignment process, as this process rewards the reuse of
                 parameters, resulting in compact models that align in a
                 consistent manner and tend not to over-fit. Our
                 approach uses the generative model resulting from
                 aligning the training data to force-align the test
                 data. We rely on the simple assumption that correct
                 transliteration pairs would be well modeled and
                 generated easily, whereas incorrect pairs---being more
                 random in character---would be more costly to model and
                 generate. Our generative model generates by
                 concatenating bilingual grapheme sequence pairs. The
                 many-to-many generation process is essential for
                 handling many languages with non-Roman scripts, and it
                 is hard to train well using maximum likelihood
                 techniques, as these tend to over-fit the data. Our
                 approach works on the principle that generation using
                 only grapheme sequence pairs that are in the model
                 results in a high probability derivation, whereas if
                 the model is forced to introduce a new parameter in
                 order to explain part of the candidate pair, the
                 derivation probability is substantially reduced and
                 severely reduced if the new parameter corresponds to a
                 sequence pair composed of a large number of graphemes.
                 The features we extract from the alignment of the test
                 data are not only based on the scores from the
                 generative model, but also on the relative proportions
                 of each sequence that are hard to generate. The
                 features are used in conjunction with a support vector
                 machine classifier trained on known positive examples
                 together with synthetic negative examples to determine
                 whether a candidate word pair is a correct
                 transliteration pair. In our experiments, we used all
                 data tracks from the 2010 Named-Entity Workshop
                 (NEWS'10) and used the performance of the best system
                 for each language pair as a reference point. Our
                 results show that the new features we propose are
                 powerfully predictive, enabling our approach to achieve
                 levels of performance on this task that are comparable
                 to the state of the art.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

Related entries