Entry Wang:2012:IGD from talip.bib

Last update: Sun Oct 15 02:55:04 MDT 2017                Valid HTML 3.2!

Index sections

Top | Symbols | Numbers | Math | A | B | C | D | E | F | G | H | I | J | K | L | M | N | O | P | Q | R | S | T | U | V | W | X | Y | Z

BibTeX entry

@Article{Wang:2012:IGD,
  author =       "Kun Wang and Chengqing Zong and Keh-Yih Su",
  title =        "Integrating Generative and Discriminative
                 Character-Based Models for {Chinese} Word
                 Segmentation",
  journal =      j-TALIP,
  volume =       "11",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2012",
  DOI =          "https://doi.org/10.1145/2184436.2184440",
  ISSN =         "1530-0226 (print), 1558-3430 (electronic)",
  ISSN-L =       "1530-0226",
  bibdate =      "Tue Jun 12 11:20:16 MDT 2012",
  bibsource =    "http://portal.acm.org/;
                 http://www.math.utah.edu/pub/tex/bib/talip.bib",
  abstract =     "Among statistical approaches to Chinese word
                 segmentation, the word-based n-gram (generative)
                 model and the character-based tagging (discriminative)
                 model are two dominant approaches in the literature.
                 The former gives excellent performance for the
                 in-vocabulary (IV) words; however, it handles
                 out-of-vocabulary (OOV) words poorly. On the other
                 hand, though the latter is more robust for OOV words,
                 it fails to deliver satisfactory performance for IV
                 words. These two approaches behave differently due to
                 the unit they use (word vs. character) and the model
                 form they adopt (generative vs. discriminative). In
                 general, character-based approaches are more robust
                 than word-based ones, as the vocabulary of characters
                 is a closed set; and discriminative models are more
                 robust than generative ones, since they can flexibly
                 include all kinds of available information, such as
                 future context. This article first proposes a
                 character-based n-gram model to enhance the robustness
                 of the generative approach. Then the proposed
                 generative model is further integrated with the
                 character-based discriminative model to take advantage
                 of both approaches. Our experiments show that this
                 integrated approach outperforms all the existing
                 approaches reported in the literature. Afterwards, a
                 complete and detailed error analysis is conducted.
                 Since a significant portion of the critical errors is
                 related to numerical/foreign strings, character-type
                 information is then incorporated into the model to
                 further improve its performance. Last, the proposed
                 integrated approach is tested on cross-domain corpora,
                 and a semi-supervised domain adaptation algorithm is
                 proposed and shown to be effective in our
                 experiments.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian Language Information
                 Processing (TALIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?&idx=J820",
}

Related entries