%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.00",
%%%     date            = "03 April 2017",
%%%     time            = "08:32:22 MST",
%%%     filename        = "tallip.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "38156 3447 19567 184394",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Asian and Low-Resource
%%%                        Language Information Processing (TALLIP);
%%%                        bibliography; BibTeX; TALLIP",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Asian and Low-Resource
%%%                        Language Information Processing (TALLIP)
%%%                        (CODEN none, ISSN 2375-4699 (print),
%%%                        2375-4702 (electronic)).  Publication began
%%%                        with volume 14, number 1, in 2015 as a
%%%                        continuation of the predecessor journal,
%%%                        ACM Transactions on Asian Language
%%%                        Information Processing (TALIP), which is
%%%                        covered in a separate bibliography, talip.bib.
%%%
%%%                        The journal has a World Wide Web site at
%%%
%%%                            http://portal.acm.org/browse_dl.cfm?&idx=J1521
%%%
%%%                        At version 1.00, the year coverage looked
%%%                        like this:
%%%
%%%                             2015 (  19)    2016 (  43)    2017 (   7)
%%%
%%%                             Article:         69
%%%
%%%                             Total entries:   69
%%%
%%%                        This bibliography has been constructed
%%%                        primarily from the publisher Web site.
%%%
%%%                        Numerous errors in the sources noted above
%%%                        have been corrected.  Spelling has been
%%%                        verified with the UNIX spell and GNU ispell
%%%                        programs using the exception dictionary
%%%                        stored in the companion file with extension
%%%                        .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen as
%%%                        name:year:abbrev, where name is the family
%%%                        name of the first author or editor, year is a
%%%                        4-digit number, and abbrev is a 3-letter
%%%                        condensation of important title words.
%%%                        Citation labels were automatically generated
%%%                        by software developed for the BibNet Project.
%%%
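%%%                        For example, the first entry below, for the
%%%                        2015 article ``Integrating Multiple
%%%                        Dependency Corpora ...'' by Sumire Uematsu
%%%                        and co-authors, carries the citation tag
%%%                        Uematsu:2015:IMD.
%%%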
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, with the help of
%%%                        ``bibsort -byvolume''.  The bibsort utility
%%%                        is available from ftp.math.utah.edu in
%%%                        /pub/tex/bib.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
@Preamble{
    "\hyphenation{ }"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-TALLIP                = "ACM Transactions on Asian and Low-Resource
                                  Language Information Processing (TALLIP)"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Uematsu:2015:IMD,
  author =       "Sumire Uematsu and Takuya Matsuzaki and Hiroki Hanaoka
                 and Yusuke Miyao and Hideki Mima",
  title =        "Integrating Multiple Dependency Corpora for Inducing
                 Wide-Coverage {Japanese} {CCG} Resources",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2658997",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "A novel method to induce wide-coverage Combinatory
                 Categorial Grammar (CCG) resources for Japanese is
                 proposed in this article. For some languages including
                 English, the availability of large annotated corpora
                 and the development of data-based induction of
                 lexicalized grammar have enabled deep parsing, i.e.,
                 parsing based on lexicalized grammars. However, deep
                 parsing for Japanese has not been widely studied. This
                 is mainly because most Japanese syntactic resources are
                 represented in chunk-based dependency structures, while
                 previous methods for inducing grammars are dependent on
                 tree corpora. To translate syntactic information
                 presented in chunk-based dependencies to phrase
                 structures as accurately as possible, integration of
                 annotation from multiple dependency-based corpora is
                 proposed. Our method first integrates dependency
                 structures and predicate-argument information and
                 converts them into phrase structure trees. The trees
                 are then transformed into CCG derivations in a similar
                 way to previously proposed methods. The quality of the
                 conversion is empirically evaluated in terms of the
                 coverage of the obtained CCG lexicon and the accuracy
                 of the parsing with the grammar. While the transforming
                 process used in this study is specialized for Japanese,
                 the framework of our method would be applicable to
                 other languages for which dependency-based analysis has
                 been regarded as more appropriate than phrase
                 structure-based analysis due to morphosyntactic
                 features.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ramrakhiyani:2015:ATE,
  author =       "Nitin Ramrakhiyani and Prasenjit Majumder",
  title =        "Approaches to Temporal Expression Recognition in
                 {Hindi}",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629574",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Temporal annotation of plain text is considered a
                 useful component of modern information retrieval tasks.
                 In this work, different approaches for identification
                 and classification of temporal expressions in Hindi are
                 developed and analyzed. First, a rule-based approach is
                 developed, which takes plain text as input and, based on
                 a set of hand-crafted rules, produces a tagged output
                 with identified temporal expressions. This approach
                 performs with a strict F1-measure of 0.83. In another
                 approach, a CRF-based classifier is trained with human
                 tagged data and is then tested on a test dataset. The
                 trained classifier identifies the time expressions from
                 plain text and further classifies them to various
                 classes. This approach performs with a strict
                 F1-measure of 0.78. Next, the CRF is replaced by an
                 SVM-based classifier and the same experiment is
                 performed with the same features. This approach is
                 shown to be comparable to the CRF and performs with a
                 strict F1-measure of 0.77. Using the rule base
                 information as an additional feature enhances the
                 performance to 0.86 and 0.84 for the CRF and SVM,
                 respectively. With three different comparable systems
                 performing the extraction task, merging them to take
                 advantage of their positives is the next step. As the
                 first merge experiment, rule-based tagged data is fed
                 to the CRF and SVM classifiers as additional training
                 data. Evaluation results report an increase in
                 F1-measure of the CRF from 0.78 to 0.8. Second, a
                 voting-based approach is implemented, which chooses the
                 best class for each token from the outputs of the three
                 approaches. This approach results in the best
                 performance for this task with a strict F1-measure of
                 0.88. In this process a reusable gold standard dataset
                 for temporal tagging in Hindi is also developed. Named
                 the ILTIMEX2012 corpus, it consists of 300 manually
                 tagged Hindi news documents.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kumari:2015:ITD,
  author =       "B. Venkata Seshu Kumari and Ramisetty Rajeshwara Rao",
  title =        "Improving {Telugu} Dependency Parsing using
                 Combinatory Categorial Grammar Supertags",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2693190.2693191",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "We show that Combinatory Categorial Grammar (CCG)
                 supertags can improve Telugu dependency parsing. In
                 this process, we first extract a CCG lexicon from the
                 dependency treebank. Using both the CCG lexicon and the
                 dependency treebank, we create a CCG treebank using a
                 chart parser. Exploring different morphological
                 features of Telugu, we develop a supertagger using
                 maximum entropy models. We provide CCG supertags as
                 features to the Telugu dependency parser (MST parser).
                 We get an improvement of 1.8\% in the unlabelled
                 attachment score and 2.2\% in the labelled attachment
                 score. Our results show that CCG supertags improve the
                 MST parser, especially on verbal arguments for which it
                 has weak rates of recovery.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ketui:2015:EBA,
  author =       "Nongnuch Ketui and Thanaruk Theeramunkong and
                 Chutamanee Onsuwan",
  title =        "An {EDU}-Based Approach for {Thai} Multi-Document
                 Summarization and Its Application",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641567",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Due to the lack of word/phrase/sentence boundaries,
                 summarization of Thai multiple documents has several
                 challenges in unit segmentation, unit selection,
                 duplication elimination, and evaluation dataset
                 construction. In this article, we introduce Thai
                 Elementary Discourse Units (TEDUs) and their
                 derivatives, called Combined TEDUs (CTEDUs), and then
                 present our three-stage method of Thai multi-document
                 summarization, that is, unit segmentation, unit-graph
                 formulation, and unit selection and summary generation.
                 To examine performance of our proposed method, a number
                 of experiments are conducted using 50 sets of Thai news
                 articles with their manually constructed reference
                 summaries. Based on measures of ROUGE-1, ROUGE-2, and
                 ROUGE-SU4, the experimental results show that: (1) the
                 TEDU-based summarization outperforms paragraph-based
                 summarization; (2) our proposed graph-based TEDU
                 weighting with importance-based selection achieves the
                 best performance; and (3) unit duplication
                 consideration and weight recalculation help improve
                 summary quality.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sproat:2015:TPE,
  author =       "Richard Sproat",
  title =        "{TALLIP} Perspectives: Editorial Commentary: The
                 Broadened Focus of the Journal",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710043",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Shen:2015:MGA,
  author =       "Han-ping Shen and Chung-hsien Wu and Pei-shan Tsai",
  title =        "Model Generation of Accented Speech using Model
                 Transformation and Verification for Bilingual Speech
                 Recognition",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2661637",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Nowadays, bilingual or multilingual speech recognition
                 is confronted with the accent-related problem caused by
                 non-native speech in a variety of real-world
                 applications. Accent modeling of non-native speech is
                 definitely challenging, because the acoustic properties
                 in highly-accented speech pronounced by non-native
                 speakers are quite divergent. The aim of this study is
                 to generate highly Mandarin-accented English models for
                 speakers whose mother tongue is Mandarin. First, a
                 two-stage, state-based verification method is proposed
                 to extract the state-level, highly-accented speech
                 segments automatically. Acoustic features and
                 articulatory features are successively used for robust
                 verification of the extracted speech segments. Second,
                 Gaussian components of the highly-accented speech
                 models are generated from the corresponding Gaussian
                 components of the native speech models using a linear
                 transformation function. A decision tree is constructed
                 to categorize the transformation functions and used for
                 transformation function retrieval to deal with the data
                 sparseness problem. Third, a discrimination function is
                 further applied to verify the generated accented
                 acoustic models. Finally, the successfully verified
                 accented English models are integrated into the native
                 bilingual phone model set for Mandarin-English
                 bilingual speech recognition. Experimental results show
                 that the proposed approach can effectively alleviate
                 recognition performance degradation due to accents and
                 can obtain absolute improvements of 4.1\%, 1.8\%, and
                 2.7\% in word accuracy for bilingual speech recognition
                 compared to that using traditional ASR approaches,
                 MAP-adapted, and MLLR-adapted ASR methods,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Awajan:2015:KEA,
  author =       "Arafat Awajan",
  title =        "Keyword Extraction from {Arabic} Documents using Term
                 Equivalence Classes",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2665077",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The rapid growth of the Internet and other computing
                 facilities in recent years has resulted in the creation
                 of a large amount of text in electronic form, which has
                 increased the interest in and importance of different
                 automatic text processing applications, including
                 keyword extraction and term indexing. Although keywords
                 are very useful for many applications, most documents
                 available online are not provided with keywords. We
                 describe a method for extracting keywords from Arabic
                 documents. This method identifies the keywords by
                 combining linguistics and statistical analysis of the
                 text without using prior knowledge from its domain or
                 information from any related corpus. The text is
                 preprocessed to extract the main linguistic
                 information, such as the roots and morphological
                 patterns of derivative words. A cleaning phase is then
                 applied to eliminate the meaningless words from the
                 text. The most frequent terms are clustered into
                 equivalence classes in which the derivative words
                 generated from the same root and the non-derivative
                 words generated from the same stem are placed together,
                 and their count is accumulated. A vector space model is
                 then used to capture the most frequent N-gram in the
                 text. Experiments carried out using a real-world
                 dataset show that the proposed method achieves good
                 results with an average precision of 31\% and average
                 recall of 53\% when tested against manually assigned
                 keywords.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sundaram:2015:BLM,
  author =       "Suresh Sundaram and A. G. Ramakrishnan",
  title =        "Bigram Language Models and Reevaluation Strategy for
                 Improved Recognition of Online Handwritten {Tamil}
                 Words",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2671014",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article describes a postprocessing strategy for
                 online, handwritten, isolated Tamil words.
                 Contributions have been made with regard to two issues
                 hardly addressed in the online Indic word recognition
                 literature, namely, use of (1) language models
                 exploiting the idiosyncrasies of Indic scripts and (2)
                 expert classifiers for the disambiguation of confused
                 symbols. The input word is first segmented into its
                 individual symbols, which are recognized using a
                 primary support vector machine (SVM) classifier.
                 Thereafter, we enhance the recognition accuracy by
                 utilizing (i) a bigram language model at the symbol or
                 character level and (ii) expert classifiers for
                 reevaluating and disambiguating the different sets of
                 confused symbols. The symbol-level bigram model is used
                 in a traditional Viterbi framework. The concept of a
                 character comprising multiple symbols is unique to
                 Dravidian languages such as Tamil. This multi-symbol
                 feature of Tamil characters has been exploited in
                 proposing a novel, prefix-tree-based character-level
                 bigram model that does not use Viterbi search; rather
                 it reduces the search space for each input symbol based
                 on its left context. For disambiguating confused
                 symbols, a dynamic time-warping approach is proposed to
                 automatically identify the parts of the online trace
                 that discriminate between the confused classes. Fine
                 classification of these regions by dedicated expert
                 SVMs reduces the extent of confusions between such
                 symbols. The integration of segmentation,
                 prefix-tree-based language model and disambiguation of
                 confused symbols is presented on a set of 15,000
                 handwritten isolated online Tamil words. Our results
                 show recognition accuracies of 93.0\% and 81.6\% at the
                 symbol and word level, respectively, as compared to the
                 baseline classifier performance of 88.4\% and 65.1\%,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Zhang:2015:TMT,
  author =       "Jiajun Zhang and Shujie Liu and Mu Li and Ming Zhou
                 and Chengqing Zong",
  title =        "Towards Machine Translation in Semantic Vector Space",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "9:1--9:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699927",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Measuring the quality of the translation rules and
                 their composition is an essential issue in the
                 conventional statistical machine translation (SMT)
                 framework. To express the translation quality, the
                 previous lexical and phrasal probabilities are
                 calculated only according to the co-occurrence
                 statistics in the bilingual corpus and may not be
                 reliable due to the data sparseness problem. To address
                 this issue, we propose measuring the quality of the
                 translation rules and their composition in the semantic
                 vector embedding space (VES). We present a recursive
                 neural network (RNN)-based translation framework, which
                 includes two submodels. One is the
                 bilingually-constrained recursive auto-encoder, which
                 is proposed to convert the lexical translation rules
                 into compact real-valued vectors in the semantic VES.
                 The other is a type-dependent recursive neural network,
                 which is proposed to perform the decoding process by
                 minimizing the semantic gap (meaning distance) between
                 the source language string and its translation
                 candidates at each state in a bottom-up structure. The
                 RNN-based translation model is trained using a
                 max-margin objective function that maximizes the margin
                 between the reference translation and the n-best
                 translations in forced decoding. In the experiments, we
                 first show that the proposed vector representations for
                 the translation rules are very reliable for application
                 in translation modeling. We further show that the
                 proposed type-dependent, RNN-based model can
                 significantly improve the translation quality in the
                 large-scale, end-to-end Chinese-to-English translation
                 evaluation.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Na:2015:CRF,
  author =       "Seung-Hoon Na",
  title =        "Conditional Random Fields for {Korean} Morpheme
                 Segmentation and {POS} Tagging",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700051",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "There has been recent interest in statistical
                 approaches to Korean morphological analysis. However,
                 previous studies have been based mostly on generative
                 models, including a hidden Markov model (HMM), without
                 utilizing discriminative models such as a conditional
                 random field (CRF). We present a two-stage
                 discriminative approach based on CRFs for Korean
                 morphological analysis. Similar to methods used for
                 Chinese, we perform two disambiguation procedures based
                 on CRFs: (1) morpheme segmentation and (2) POS tagging.
                 In morpheme segmentation, an input sentence is
                 segmented into sequences of morphemes, where a morpheme
                 unit is either atomic or compound. In the POS tagging
                 procedure, each morpheme (atomic or compound) is
                 assigned a POS tag. Once POS tagging is complete, we
                 carry out a post-processing of the compound morphemes,
                 where each compound morpheme is further decomposed into
                 atomic morphemes, which is based on pre-analyzed
                 patterns and generalized HMMs obtained from the given
                 tagged corpus. Experimental results show the promise of
                 our proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2015:MTM,
  author =       "Xiaodong Liu and Kevin Duh and Yuji Matsumoto",
  title =        "Multilingual Topic Models for Bilingual Dictionary
                 Extraction",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699939",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "A machine-readable bilingual dictionary plays a
                 crucial role in many natural language processing tasks,
                 such as statistical machine translation and
                 cross-language information retrieval. In this article,
                 we propose a framework for extracting a bilingual
                 dictionary from comparable corpora by exploiting a
                 novel combination of topic modeling and word aligners
                 such as the IBM models. Using a multilingual topic
                 model, we first convert a comparable document-aligned
                 corpus into a parallel topic-aligned corpus. This
                 novel topic-aligned corpus is similar in structure to
                 the sentence-aligned corpus frequently employed in
                 statistical machine translation and allows us to
                 extract a bilingual dictionary using a word alignment
                 model. The main advantages of our framework are that (1)
                 no seed dictionary is necessary for bootstrapping the
                 process, and (2) multilingual comparable corpora in
                 more than two languages can also be exploited. In our
                 experiments on a large-scale Wikipedia dataset, we
                 demonstrate that our approach can extract higher
                 precision dictionaries compared to previous approaches
                 and that our method improves further as we add more
                 languages to the dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2015:UMS,
  author =       "Xiaoqing Li and Chengqing Zong and Keh-yih Su",
  title =        "A Unified Model for Solving the {OOV} Problem of
                 {Chinese} Word Segmentation",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699940",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article proposes a unified, character-based,
                 generative model to incorporate additional resources
                 for solving the out-of-vocabulary (OOV) problem of
                 Chinese word segmentation, within which different types
                 of additional information can be utilized independently
                 in corresponding submodels. This article mainly
                 addresses the following three types of OOV: unseen
                 dictionary words, named entities, and suffix-derived
                 words, none of which are handled well by current
                 approaches. The results show that our approach can
                 effectively improve the performance of the first two
                 types with positive interaction in F-score.
                 Additionally, we also analyze the reason that suffix
                 information is not helpful. After integrating the
                 proposed generative model with the corresponding
                 discriminative approach, our evaluation on various
                 corpora---including SIGHAN-2005, CIPS-SIGHAN-2010, and
                 the Chinese Treebank (CTB)---shows that our integrated
                 approach achieves the best performance reported in the
                 literature on all testing sets when additional
                 information and resources are allowed.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Goto:2015:PUT,
  author =       "Isao Goto and Masao Utiyama and Eiichiro Sumita and
                 Sadao Kurohashi",
  title =        "Preordering using a Target-Language Parser via
                 Cross-Language Syntactic Projection for Statistical
                 Machine Translation",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699925",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "When translating between languages with widely
                 different word orders, word reordering can present a
                 major challenge. Although some word reordering methods
                 do not employ source-language syntactic structures,
                 such structures are inherently useful for word
                 reordering. However, high-quality syntactic parsers are
                 not available for many languages. We propose a
                 preordering method using a target-language syntactic
                 parser to process source-language syntactic structures
                 without a source-language syntactic parser. To train
                 our preordering model based on ITG, we produced
                 syntactic constituent structures for source-language
                 training sentences by (1) parsing target-language
                 training sentences, (2) projecting constituent
                 structures of the target-language sentences to the
                 corresponding source-language sentences, (3) selecting
                 parallel sentences with highly synchronized parallel
                 structures, (4) producing probabilistic models for
                 parsing using the projected partial structures and the
                 Pitman-Yor process, and (5) parsing to produce full
                 binary syntactic structures maximally synchronized with
                 the corresponding target-language syntactic structures,
                 using the constraints of the projected partial
                 structures and the probabilistic models. Our ITG-based
                 preordering model is trained using the produced binary
                 syntactic structures and word alignments. The proposed
                 method facilitates the learning of ITG by producing
                 highly synchronized parallel syntactic structures based
                 on cross-language syntactic projection and sentence
                 selection. The preordering model jointly parses input
                 sentences and identifies their reordered structures.
                 Experiments with Japanese--English and Chinese--English
                 patent translation indicate that our method outperforms
                 existing methods, including string-to-tree syntax-based
                 SMT, a preordering method that does not require a
                 parser, and a preordering method that uses a
                 source-language dependency parser.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Costa-Jussa:2016:DCS,
  author =       "Marta R. Costa-Juss{\`a} and Jordi Centelles",
  title =        "Description of the {Chinese}-to-{Spanish} Rule-Based
                 Machine Translation System Developed Using a Hybrid
                 Combination of Human Annotation and Statistical
                 Techniques",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738045",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Two of the most popular Machine Translation (MT)
                 paradigms are rule-based (RBMT) and corpus-based, which
                 include the statistical systems (SMT). When only a scarce
                 parallel corpus is available, RBMT becomes particularly
                 attractive. This is the case of the Chinese--Spanish
                 language pair. This article presents the first RBMT
                 system for Chinese to Spanish. We describe a hybrid
                 method for constructing this system taking advantage of
                 available resources such as parallel corpora that are
                 used to extract dictionaries and lexical and structural
                 transfer rules. The final system is freely available
                 online and open source. Although performance lags
                 behind standard SMT systems for an in-domain test set,
                 the results show that the RBMT's coverage is
                 competitive and it outperforms the SMT system in an
                 out-of-domain test set. This RBMT system is available
                 to the general public, it can be further enhanced, and
                 it opens up the possibility of creating future hybrid
                 MT systems.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Khanduja:2016:HFE,
  author =       "Deepti Khanduja and Neeta Nain and Subhash Panwar",
  title =        "A Hybrid Feature Extraction Algorithm for {Devanagari}
                 Script",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710018",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The efficiency of any character recognition technique
                 is directly dependent on the accuracy of the generated
                 feature set that could uniquely represent a character
                 and hence correctly recognize it. This article proposes
                 a hybrid approach combining the structural features of
                 the character and a mathematical model of curve fitting
                 to simulate the best features of a character. As a
                 preprocessing step, skeletonization of the character is
                 performed using an iterative thinning algorithm based
                 on Raster scan of the character image. Then, a
                 combination of structural features of the character
                 like number of endpoints, loops, and intersection
                 points is calculated. Further, the thinned character
                 image is statistically zoned into partitions, and a
                 quadratic curve-fitting model is applied to each
                 partition, forming a feature vector of the coefficients
                 of the optimally fitted curve. This vector is combined
                 with the spatial distribution of the foreground pixels
                 for each zone, hence providing a script-independent
                 feature representation. The approach has been evaluated
                 experimentally on Devanagari scripts. The algorithm
                 achieves an average recognition accuracy of 93.4\%.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Shatnawi:2016:IHA,
  author =       "Maad Shatnawi and Sherief Abdallah",
  title =        "Improving Handwritten {Arabic} Character Recognition
                 by Modeling Human Handwriting Distortions",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2764456",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Handwritten Arabic character recognition systems face
                 several challenges, including the unlimited variation
                 in human handwriting and the unavailability of large
                 public databases of handwritten characters and words.
                 The use of synthetic data for training and testing
                 handwritten character recognition systems is one of the
                 possible solutions to provide several variations for
                 these characters and to overcome the lack of large
                 databases. While this can be done using arbitrary
                 distortions, such as image noise and randomized affine
                 transformations, such distortions are not realistic. In
                 this work, we model real distortions in handwriting
                 using real handwritten Arabic character examples and
                 then use these distortion models to synthesize
                 handwritten examples that are more realistic. We show
                 that the use of our proposed approach leads to
                 significant improvements across different
                 machine-learning classification algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wushouer:2016:CAP,
  author =       "Mairidan Wushouer and Donghui Lin and Toru Ishida and
                 Katsutoshi Hirayama",
  title =        "A Constraint Approach to Pivot-Based Bilingual
                 Dictionary Induction",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2723144",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "High-quality bilingual dictionaries are very useful,
                 but such resources are rarely available for
                 lower-density language pairs, especially for those that
                 are closely related. Using a third language to link two
                 other languages is a well-known solution and usually
                 requires only two input bilingual dictionaries A-B and
                 B-C to automatically induce the new one, A-C. This
                 approach, however, has never been demonstrated to
                 utilize the complete structures of the input bilingual
                 dictionaries, and this is a key failing because the
                 dropped meanings negatively influence the result. This
                 article proposes a constraint approach to pivot-based
                 dictionary induction where languages A and C are closely
                 related. We create constraints from language similarity
                 and model the structures of the input dictionaries as a
                 Boolean optimization problem, which is then formulated
                 within the Weighted Partial Max-SAT framework, an
                 extension of Boolean Satisfiability (SAT). All of the
                 encoded CNF (Conjunctive Normal Form) formulas, CNF
                 being the predominant input language of modern
                 SAT/Max-SAT solvers, are evaluated by a solver to
                 produce the target
                 (output) bilingual dictionary. Moreover, we discuss
                 alternative formalizations as a comparison study. We
                 designed a tool that uses the Sat4j library as the
                 default solver to implement our method and conducted an
                 experiment in which the output bilingual dictionary
                 achieved better quality than the baseline method.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yeh:2016:SAI,
  author =       "Jui-Feng Yeh",
  title =        "Speech Act Identification Using Semantic Dependency
                 Graphs with Probabilistic Context-Free Grammars",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786978",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "We propose an approach for identifying the speech acts
                 of speakers' utterances in conversational spoken
                 dialogue that involves using semantic dependency graphs
                 with probabilistic context-free grammars (PCFGs). The
                 semantic dependency graph based on the HowNet knowledge
                 base is adopted to model the relationships between
                 words in an utterance parsed by PCFG. Dependency
                 relationships between words within the utterance are
                 extracted by decomposing the semantic dependency graph
                 according to predefined events. The corresponding
                 values of semantic slots are subsequently extracted
                 from the speaker's utterances according to the
                 corresponding identified speech act. The experimental
                 results obtained when using the proposed approach
                 indicated that the accuracy rates of speech act
                 detection and task completion were 95.6\% and 77.4\%
                 for human-generated transcription (REF) and
                 speech-to-text recognition output (STT), respectively,
                 and the average numbers of turns of each dialogue were
                 8.3 and 11.8 for REF and STT, respectively. Compared
                 with Bayes classifier, partial pattern tree, and
                 Bayesian-network-based approaches, we obtained 14.1\%,
                 9.2\%, and 3\% improvements in the accuracy of speech
                 act identification, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2016:CCSa,
  author =       "Ting-Xuan Wang and Wen-Hsiang Lu",
  title =        "Constructing Complex Search Tasks with Coherent
                 Subtask Search Goals",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742547",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Nowadays, due to the explosive growth of web content
                 and usage, users deal with their complex search tasks
                 using web search engines. However, conventional search
                 engines consider a search query to correspond only to a
                 simple search task. In order to accomplish a complex
                 search task, which consists of multiple subtask search
                 goals, users usually have to issue a series of queries.
                 For example, the complex search task ``travel to
                 Dubai'' may involve several subtask search goals,
                 including reserving a hotel room, surveying Dubai
                 landmarks, booking flights, and so forth. Therefore, a
                 user can efficiently accomplish his or her complex
                 search task if search engines can predict the complex
                 search task with a variety of subtask search goals. In
                 this work, we propose a complex search task model
                 (CSTM) to deal with this problem. The CSTM first groups
                 queries into complex search task clusters, and then
                 generates subtask search goals from each complex search
                 task cluster. To raise the performance of CSTM, we
                 exploit four web resources including community question
                 answering, query logs, search engine result pages, and
                 clicked pages. Experimental results show that our CSTM
                 is effective in identifying the comprehensive subtask
                 search goals of a complex search task.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Tsai:2016:CWB,
  author =       "Richard Tzong-Han Tsai",
  title =        "Collective {Web}-Based Parenthetical Translation
                 Extraction Using {Markov} Logic Networks",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2794399",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Parenthetical translations are translations of terms
                 in otherwise monolingual text that appear inside
                 parentheses. Parenthetical translations extraction
                 (PTE) is the task of extracting parenthetical
                 translations from natural language documents. One of
                 the main difficulties in PTE is to detect the left
                 boundary of the translated term in preparenthetical
                 text. In this article, we propose a collective approach
                 that employs Markov logic to model multiple constraints
                 used in the PTE task. We show how various constraints
                 can be formulated and combined in a Markov logic
                 network (MLN). Our experimental results show that the
                 proposed collective PTE approach significantly
                 outperforms a current state-of-the-art method,
                 improving the average F-measure by up to 27.11\% compared
                 to the previous word alignment approach. It also
                 outperforms an individual MLN-based system by 8.2\% and
                 a system based on conditional random fields by 5.9\%.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Jain:2016:FHW,
  author =       "Amita Jain and D. K. Lobiyal",
  title =        "Fuzzy {Hindi} {WordNet} and Word Sense Disambiguation
                 Using Fuzzy Graph Connectivity Measures",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790079",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we propose Fuzzy Hindi WordNet, which
                 is an extended version of Hindi WordNet. The proposed
                 idea of fuzzy relations and their role in modeling
                 Fuzzy Hindi WordNet is explained. We mathematically
                 define fuzzy relations and the composition of these
                 fuzzy relations for this extended version. We show that
                 the concept of composition of fuzzy relations can be
                 used to infer a relation between two words that
                 otherwise are not directly related in Hindi WordNet.
                 Then we propose fuzzy graph connectivity measures that
                 include both local and global measures. These measures
                 are used in determining the significance of a concept
                 (which is represented as a vertex in the fuzzy graph)
                 in a specific context. Finally, we show how these
                 extended measures solve the problem of word sense
                 disambiguation (WSD) effectively, which is useful in
                 many natural language processing applications to
                 improve their performance. Experiments on standard
                 sense tagged corpus for WSD show better results when
                 Fuzzy Hindi WordNet is used in place of Hindi
                 WordNet.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kertkeidkachorn:2016:AFH,
  author =       "Natthawut Kertkeidkachorn and Proadpran Punyabukkana
                 and Atiwong Suchato",
  title =        "Acoustic Features for Hidden Conditional Random
                 Fields-Based {Thai} Tone Classification",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2833088",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In the Thai language, tone information is necessary
                 for Thai speech recognition systems. Previous studies
                 show that many acoustic cues are attributed to shapes
                 of tones. Nevertheless, most Thai tone classification
                 studies mainly adopted F$_0$ values and their
                 derivatives without considering other acoustic
                 features. In this article, other acoustic features for
                 Thai tone classification are investigated. In the
                 experiment, energy values and spectral information
                 represented by three spectral-based features including
                 the LPC-based feature, PLP-based feature, and
                 MFCC-based feature are applied to the HCRF-based Thai
                 tone classification, which was reported as the best
                 approach for Thai tone classification. The energy
                 values provide an error rate reduction of 22.40\% in
                  the isolated-word scenario, while there are slight
                  improvements in the continuous-speech scenario. In
                  contrast, spectral-based features greatly contribute to
                 Thai tone classification in the continuous-speech
                 scenario, whereas spectral-based features slightly
                 degrade performances in the isolated-word scenario. The
                 best achievement in the continuous-speech scenario is
                 obtained from the PLP-based feature, which yields an
                 error rate reduction of 13.90\%. Therefore, findings in
                 this article are that energy values and spectral-based
                 features, especially the PLP-based feature, are the
                 main contributors to the improvement of the
                 performances of Thai tone classification in the
                 isolated-word scenario and the continuous-speech
                 scenario, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Chu:2016:IPS,
  author =       "Chenhui Chu and Toshiaki Nakazawa and Sadao
                 Kurohashi",
  title =        "Integrated Parallel Sentence and Fragment Extraction
                 from Comparable Corpora: a Case Study on
                 {Chinese--Japanese} {Wikipedia}",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2833089",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Parallel corpora are crucial for statistical machine
                 translation (SMT); however, they are quite scarce for
                 most language pairs and domains. As comparable corpora
                 are far more available, many studies have been
                 conducted to extract either parallel sentences or
                 fragments from them for SMT. In this article, we
                 propose an integrated system to extract both parallel
                 sentences and fragments from comparable corpora. We
                 first apply parallel sentence extraction to identify
                 parallel sentences from comparable sentences. We then
                 extract parallel fragments from the comparable
                 sentences. Parallel sentence extraction is based on a
                 parallel sentence candidate filter and classifier for
                 parallel sentence identification. We improve it by
                 proposing a novel filtering strategy and three novel
                 feature sets for classification. Previous studies have
                 found it difficult to accurately extract parallel
                 fragments from comparable sentences. We propose an
                 accurate parallel fragment extraction method that uses
                 an alignment model to locate the parallel fragment
                 candidates and an accurate lexicon-based filter to
                 identify the truly parallel fragments. A case study on
                 the Chinese--Japanese Wikipedia indicates that our
                 proposed methods outperform previously proposed
                 methods, and the parallel data extracted by our system
                 significantly improves SMT performance.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2016:CCSb,
  author =       "Rui Wang and Masao Utiyama and Isao Goto and Eiichiro
                 Sumita and Hai Zhao and Bao-Liang Lu",
  title =        "Converting Continuous-Space Language Models into
                 {$N$}-gram Language Models with Efficient Bilingual
                 Pruning for Statistical Machine Translation",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "11:1--11:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2843942",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The Language Model (LM) is an essential component of
                 Statistical Machine Translation (SMT). In this article,
                 we focus on developing efficient methods for LM
                 construction. Our main contribution is that we propose
                  a Natural N-gram-based Converting (NNGC) method for
                  transforming a Continuous-Space Language Model (CSLM)
                  into a Back-off N-gram Language Model (BNLM).
                 Furthermore, a Bilingual LM Pruning (BLMP) approach is
                 developed for enhancing LMs in SMT decoding and
                 speeding up CSLM converting. The proposed pruning and
                 converting methods can convert a large LM efficiently
                  by working jointly. That is, an LM can be effectively
                  pruned before it is converted from the CSLM without
                 sacrificing performance, and further improved if an
                 additional corpus contains out-of-domain information.
                 For different SMT tasks, our experimental results
                 indicate that the proposed NNGC and BLMP methods
                 outperform the existing counterpart approaches
                 significantly in BLEU and computational cost.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Chakrabarty:2016:BBL,
  author =       "Abhisek Chakrabarty and Utpal Garain",
  title =        "{BenLem} (A {Bengali} Lemmatizer) and Its Role in
                 {WSD}",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "12:1--12:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2835494",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "A lemmatization algorithm for Bengali has been
                 developed and evaluated. Its effectiveness for word
                 sense disambiguation (WSD) is also investigated. One of
                 the key challenges for computer processing of highly
                 inflected languages is to deal with the frequent
                 morphological variations of the root words appearing in
                 the text. Therefore, a lemmatizer is essential for
                 developing natural language processing (NLP) tools for
                 such languages. In this experiment, Bengali, which is
                 the national language of Bangladesh and the second most
                 popular language in the Indian subcontinent, has been
                 taken as a reference. In order to design the Bengali
                  lemmatizer (named BenLem), possible transformations
                 through which surface words are formed from lemmas are
                 studied so that appropriate reverse transformations can
                 be applied on a surface word to get the corresponding
                 lemma back. BenLem is found to be capable of handling
                 both inflectional and derivational morphology in
                 Bengali. It is evaluated on a set of 18 news articles
                 taken from the FIRE Bengali News Corpus consisting of
                 3,342 surface words (excluding proper nouns) and found
                 to be 81.95\% accurate. The role of the lemmatizer is
                 then investigated for Bengali WSD. Ten highly
                 polysemous Bengali words are considered for sense
                 disambiguation. The FIRE corpus and a collection of
                 Tagore's short stories are considered for creating the
                 WSD dataset. Different WSD systems are considered for
                 this experiment, and it is noticed that BenLem improves
                 the performance of all the WSD systems and the
                 improvements are statistically significant.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Zhou:2016:ESR,
  author =       "Hao Zhou and Shujian Huang and Junsheng Zhou and Yue
                 Zhang and Huadong Chen and Xinyu Dai and Chuan Cheng
                 and Jiajun Chen",
  title =        "Enhancing Shift--Reduce Constituent Parsing with
                 Action {$N$}-Gram Model",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "13:1--13:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2820902",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Current shift-reduce parsers ``understand'' the
                 context by embodying a large number of binary indicator
                 features with a discriminative model. In this article,
                 we propose the action n-gram model, which utilizes the
                 action sequence to help parsing disambiguation. The
                 action n-gram model is trained on action sequences
                 produced by parsers with the n-gram estimation method,
                 which gives a smoothed maximum likelihood estimation of
                 the action probability given a specific action history.
                 We show that incorporating action n-gram models into a
                 state-of-the-art parsing framework could achieve
                 parsing accuracy improvements on three datasets across
                 two languages.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sadek:2016:EAC,
  author =       "Jawad Sadek and Farid Meziane",
  title =        "Extracting {Arabic} Causal Relations Using Linguistic
                 Patterns",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "14:1--14:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2800786",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Identifying semantic relations is a crucial step in
                 discourse analysis and is useful for many applications
                 in both language and speech technology. Automatic
                 detection of Causal relations therefore has gained
                 popularity in the literature within different
                 frameworks. The aim of this article is the automatic
                 detection and extraction of Causal relations that are
                 explicitly expressed in Arabic texts. To fulfill this
                 goal, a Pattern Recognizer model was developed to
                 signal the presence of cause--effect information within
                 sentences from nonspecific domain texts. This model
                 incorporates approximately 700 linguistic patterns so
                 that parts of the sentence representing the cause and
                 those representing the effect can be distinguished. The
                 patterns were constructed based on different sets of
                 syntactic features by analyzing a large untagged Arabic
                 corpus. In addition, the model was boosted with three
                 independent algorithms to deal with certain types of
                 grammatical particles that indicate causation. With
                 this approach, the proposed model achieved an overall
                 recall of 81\% and a precision of 78\%. Evaluation
                 results revealed that the justification particles play
                 a key role in detecting Causal relations. To the best
                 of our knowledge, no previous studies have been
                 dedicated to dealing with this type of relation in the
                 Arabic language.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yang:2016:BSR,
  author =       "Haitong Yang and Yu Zhou and Chengqing Zong",
  title =        "Bilingual Semantic Role Labeling Inference via Dual
                 Decomposition",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2835493",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article focuses on bilingual Semantic Role
                 Labeling (SRL); its goal is to annotate semantic roles
                 on both sides of the parallel bilingual texts
                 (bi-texts). Since rich bilingual information is
                 encoded, bilingual SRL has been applied in many
                 natural-language processing (NLP) tasks such as machine
                 translation (MT), cross-lingual information retrieval
                 (IR), and the like. A feasible way of performing
                 bilingual SRL is using monolingual SRL systems to
                 perform SRL on each side of bi-texts separately.
                 However, it is difficult to obtain consistent SRL
                 results on both sides of bi-texts in this way. Some
                 works have tried to jointly infer bilingual SRL because
                 there are many complementary language cues on both
                 sides of bi-texts and they reported better performance
                  than monolingual systems. However, the existing methods
                  have two limitations. First, the existing methods
                 often require high inference costs due to the complex
                 objective function. Second, the existing methods fully
                 adopt the candidates generated by monolingual SRL
                 systems, but many candidates are discarded in the
                 argument pruning or identification stage of monolingual
                 systems. In this article, we propose two strategies to
                  overcome these limitations. First, we utilize a simple but
                  efficient technique, Dual Decomposition, to search for
                  consistent results for both sides of bi-texts. Second,
                  we propose a method called Bi-Directional
                 Projection (BDP) to recover arguments discarded in
                 monolingual SRL systems. We evaluate our method on a
                 standard parallel benchmark: the OntoNotes dataset. The
                 experimental results show that our method yields
                 significant improvements over the state-of-the-art
                 monolingual systems. In addition, our approach is also
                 better and faster than existing methods due to BDP and
                 Dual Decomposition.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2016:MMC,
  author =       "Maoxi Li and Mingwen Wang and Hanxi Li and Fan Xu",
  title =        "Modeling Monolingual Character Alignment for Automatic
                 Evaluation of {Chinese} Translation",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2815619",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Automatic evaluation of machine translations is an
                 important task. Most existing evaluation metrics rely
                  on matching the same word or letter n-grams. This
                 strategy leads to poor results on Chinese translations
                 because one has to rely merely on matching identical
                 characters. In this article, we propose a new
                 evaluation metric that allows different characters with
                 the same or similar meaning to match. An Indirect
                 Hidden Markov Model (IHMM) is proposed to align the
                 Chinese translation with human references at the
                 character level. In the model, the emission
                 probabilities are estimated by character similarity,
                 including character semantic similarity and character
                 surface similarity, and transition probabilities are
                 estimated by a heuristic distance-based distortion
                 model. When evaluating the submitted output of
                 English-to-Chinese translation systems in the IWSLT'08
                 CT-EC and NIST'08 EC tasks, the experimental results
                 indicate that the proposed metric has a significantly
                 better correlation with human evaluation than the
                 state-of-the-art machine translation metrics (i.e.,
                 BLEU, Meteor Universal, and TESLA-CELAB). This study
                 shows that it is important to allow different
                 characters to match in the evaluation of Chinese
                 translations and that the IHMM is a reasonable approach
                 for the alignment of Chinese characters.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Abuaiadah:2016:UBM,
  author =       "Diab Abuaiadah",
  title =        "Using Bisect {$K$}-Means Clustering Technique in the
                 Analysis of {Arabic} Documents",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2812809",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, I have investigated the performance
                 of the bisect K-means clustering algorithm compared to
                 the standard K-means algorithm in the analysis of
                 Arabic documents. The experiments included five
                 commonly used similarity and distance functions
                 (Pearson correlation coefficient, cosine, Jaccard
                 coefficient, Euclidean distance, and averaged
                 Kullback--Leibler divergence) and three leading
                 stemmers. Using the purity measure, the bisect K-means
                 clearly outperformed the standard K-means in all
                 settings with varying margins. For the bisect K-means,
                 the best purity reached 0.927 when using the Pearson
                 correlation coefficient function, while for the
                 standard K-means, the best purity reached 0.884 when
                 using the Jaccard coefficient function. Removing stop
                 words significantly improved the results of the bisect
                 K-means but produced minor improvements in the results
                 of the standard K-means. Stemming provided additional
                 minor improvement in all settings except the
                 combination of the averaged Kullback--Leibler
                 divergence function and the root-based stemmer, where
                  the purity deteriorated by more than 10\%. These
                 experiments were conducted using a dataset with nine
                 categories, each of which contains 300 documents.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Elayeb:2016:ACL,
  author =       "Bilel Elayeb and Ibrahim Bounhas",
  title =        "{Arabic} Cross-Language Information Retrieval: a
                 Review",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2789210",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Cross-language information retrieval (CLIR) deals with
                 retrieving relevant documents in one language using
                 queries expressed in another language. As CLIR tools
                 rely on translation techniques, they are challenged by
                  the properties of highly derivational and inflectional
                 languages like Arabic. Much work has been done on CLIR
                 for different languages including Arabic. In this
                 article, we introduce the reader to the motivations for
                 solving some problems related to Arabic CLIR
                 approaches. The evaluation of these approaches is
                 discussed starting from the 2001 and 2002 TREC Arabic
                 CLIR tracks, which aim to objectively evaluate CLIR
                 systems. We also study many other research works to
                 highlight the unresolved problems or those that require
                 further investigation. These works are discussed in the
                 light of a deep study of the specificities and the
                 tasks of Arabic information retrieval (IR). Particular
                 attention is given to translation techniques and CLIR
                 resources, which are key issues challenging Arabic
                 CLIR. To push research in this field, we discuss how a
                 new standard collection can improve Arabic IR and CLIR
                 tracks.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Zhao:2016:ALM,
  author =       "Yinggong Zhao and Shujian Huang and Xin-Yu Dai and
                 Jiajun Chen",
  title =        "Adaptation of Language Models for {SMT} Using Neural
                 Networks with Topic Information",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2816816",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Neural network language models (LMs) are shown to be
                 effective in improving the performance of statistical
                 machine translation (SMT) systems. However,
                 state-of-the-art neural network LMs usually use words
                 before the current position as context and neglect
                 global topic information, which can help machine
                 translation (MT) systems to select better translation
                 candidates from a higher perspective. In this work, we
                  propose to improve the state-of-the-art feedforward
                 neural language model with topic information. Two main
                 issues need to be tackled when adding topics into
                 neural network LMs for SMT: one is how to incorporate
                  topics into the neural network; the other is how to get
                 target-side topic distribution before translation. We
                 incorporate topics by appending topic distribution to
                 the input layer of a feedforward LM. We adopt a
                 multinomial logistic-regression (MLR) model to predict
                  the target-side topic distribution based on source-side
                 information. Moreover, we propose a feedforward neural
                 network model to learn joint representations on the
                 source side for topic prediction. LM experiments
                  demonstrate that the perplexity on the validation set can
                 be greatly reduced by the topic-enhanced feedforward
                 LM, and the prediction of target-side topics can be
                 improved dramatically with the MLR model equipped with
                 the joint source representations. A final MT
                 experiment, conducted on a large-scale Chinese--English
                 dataset, shows that our feedforward LM with predicted
                 topics improves the translation performance against a
                 strong baseline.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ding:2016:IIE,
  author =       "Chenchen Ding and Keisuke Sakanushi and Hirona Touji
                 and Mikio Yamamoto",
  title =        "Inter-, Intra-, and Extra-Chunk Pre-Ordering for
                 Statistical {Japanese}-to-{English} Machine
                 Translation",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818381",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "A rule-based pre-ordering approach is proposed for
                 statistical Japanese-to-English machine translation
                 using the dependency structure of source-side
                 sentences. A Japanese sentence is pre-ordered to an
                 English-like order at the morpheme level for a
                 statistical machine translation system during the
                 training and decoding phase to resolve the reordering
                 problem. In this article, extra-chunk pre-ordering of
                 morphemes is proposed, which allows Japanese functional
                 morphemes to move across chunk boundaries. This
                 contrasts with the intra-chunk reordering used in
                 previous approaches, which restricts the reordering of
                 morphemes within a chunk. Linguistically oriented
                 discussions show that correct pre-ordering cannot be
                 realized without extra-chunk movement of morphemes. The
                 proposed approach is compared with five rule-based
                 pre-ordering approaches designed for
                  Japanese-to-English translation and with a
                  language-independent statistical pre-ordering approach on a
                 standard patent dataset and on a news dataset obtained
                 by crawling Internet news sites. Two state-of-the-art
                 statistical machine translation systems, one
                 phrase-based and the other hierarchical phrase-based,
                 are used in experiments. Experimental results show that
                 the proposed approach outperforms the compared
                 approaches on automatic reordering measures (Kendall's
                  $\tau$, Spearman's $\rho$, fuzzy reordering score, and
                 test set RIBES) and on the automatic translation
                 precision measure of test set BLEU score.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Lee:2015:ISI,
  author =       "Lung-Hao Lee and Gina-Anne Levow and Shih-Hung Wu and
                 Chao-Lin Liu",
  title =        "Introduction to the Special Issue on {Chinese} Spell
                 Checking",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818354",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "This special issue contains four articles based on and
                 expanded from systems presented at the SIGHAN-7 Chinese
                 Spelling Check Bakeoff. We provide an overview of the
                 approaches and designs for Chinese spelling checkers
                 presented in these articles. We conclude this
                 introductory article with a summary of possible future
                 directions.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Chen:2015:PFC,
  author =       "Kuan-Yu Chen and Hsin-Min Wang and Hsin-Hsi Chen",
  title =        "A Probabilistic Framework for {Chinese} Spelling
                 Check",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2826234",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "Chinese spelling check (CSC) is still an unsolved
                 problem today since there are many homonymous or
                 homomorphous characters. Recently, more and more CSC
                 systems have been proposed. To the best of our
                 knowledge, language modeling is one of the major
                 components among these systems because of its
                 simplicity and moderately good predictive power. After
                  deeply analyzing this line of research, we are aware
                  that most of the systems only employ conventional
                  n-gram language models. The contributions of this
                 article are threefold. First, we propose a novel
                 probabilistic framework for CSC, which naturally
                 combines several important components, such as the
                 substitution model and the language model, to inherit
                 their individual merits as well as to overcome their
                 limitations. Second, we incorporate the topic language
                 models into the CSC system in an unsupervised fashion.
                 The topic language models can capture the long-span
                 semantic information from a word (character) string
                  while conventional n-gram language models can only
                 preserve the local regularity information. Third, we
                 further integrate Web resources with the proposed
                 framework to enhance the overall performance. Our
                  rigorous empirical experiments demonstrate the
                  consistency and utility of the proposed
                  framework in the CSC task.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2015:HRA,
  author =       "Xiaodong Liu and Fei Cheng and Kevin Duh and Yuji
                 Matsumoto",
  title =        "A Hybrid Ranking Approach to {Chinese} Spelling
                 Check",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2822264",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "We propose a novel framework for Chinese Spelling
                 Check (CSC), which is an automatic algorithm to detect
                 and correct Chinese spelling errors. Our framework
                 contains two key components: candidate generation and
                 candidate ranking. Our framework differs from previous
                 research, such as Statistical Machine Translation (SMT)
                 based model or Language Model (LM) based model, in that
                 we use both SMT and LM models as components of our
                 framework for generating the correction candidates, in
                 order to obtain maximum recall; to improve the
                 precision, we further employ a Support Vector Machines
                 (SVM) classifier to rank the candidates generated by
                 the SMT and the LM. Experiments show that our framework
                 outperforms other systems, which adopted the same or
                 similar resources as ours in the SIGHAN 7 shared task;
                  even compared with the state-of-the-art systems, which
                  used more resources, such as a considerably large
                  dictionary, an idiom dictionary, and other semantic
                 information, our framework still obtains competitive
                 results. Furthermore, to address the resource
                 scarceness problem for training the SMT model, we
                 generate around 2 million artificial training sentences
                 using the Chinese character confusion sets, which
                 include a set of Chinese characters with similar shapes
                 and similar pronunciations, provided by the SIGHAN 7
                 shared task.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yeh:2015:CSC,
  author =       "Jui-Feng Yeh and Wen-Yi Chen and Mao-Chuan Su",
  title =        "{Chinese} Spelling Checker Based on an Inverted Index
                 List with a Rescoring Mechanism",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2826235",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "An approach is proposed for Chinese spelling error
                 detection and correction, in which an inverted index
                 list with a rescoring mechanism is used. The inverted
                 index list is a structure for mapping from word to
                 desired sentence, and for representing nodes in
                 lattices constructed through character expansion
                 (according to predefined phonologically and visually
                 similar character sets). Pruning based on a contextual
                 dependency confidence measure was used to markedly
                 reduce the search space and computational complexity.
                 Relevant mapping relations between the original input
                 and desired input were obtained using a scoring
                 mechanism composed of class-based language and maximum
                 entropy correction models containing character, word,
                 and contextual features. The proposed method was
                  evaluated using data sets provided by the SIGHAN 7 bakeoff.
                 The experimental results show that the proposed method
                 achieved acceptable performance in terms of recall rate
                 or precision rate in error sentence detection and error
                 location detection, and it outperformed other
                 approaches in error location detection and
                 correction.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Hsieh:2015:CCS,
  author =       "Yu-Ming Hsieh and Ming-Hong Bai and Shu-Ling Huang and
                 Keh-Jiann Chen",
  title =        "Correcting {Chinese} Spelling Errors with Word Lattice
                 Decoding",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2791389",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "Chinese spell checkers are more difficult to develop
                 because of two language features: (1) there are no word
                 boundaries, and a character may function as a word or a
                 word morpheme; and (2) the Chinese character set
                 contains more than ten thousand characters. The former
                 makes it difficult for a spell checker to detect
                 spelling errors, and the latter makes it difficult for
                 a spell checker to construct error models. We develop a
                 word lattice decoding model for a Chinese spell checker
                 that addresses these difficulties. The model performs
                 word segmentation and error correction simultaneously,
                 thereby solving the word boundary problem. The model
                 corrects nonword errors as well as real-word errors. In
                 order to better estimate the error distribution of
                 large character sets for error models, we also propose
                 a methodology to extract spelling error samples
                  automatically from the Google Web 1T corpus. Due to the
                  large quantity of data in the Google Web 1T corpus,
                 many spelling error samples can be extracted, better
                 reflecting spelling error distributions in the real
                 world. Finally, in order to improve the spell checker
                 for real applications, we produce $n$-best suggestions
                 for spelling error corrections. We test our proposed
                 approach with the Bakeoff 2013 CSC Datasets; the
                 results show that the proposed methods with the error
                 model significantly outperform the performance of
                 Chinese spell checkers that do not use error models.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Anonymous:2015:TPE,
  author =       "Anonymous",
  title =        "{TALLIP} Perspectives: Editorial Commentary: The State
                 of the Journal",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2823512",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Hakro:2016:PTI,
  author =       "Dil Nawaz Hakro and Abdullah Zawawi Talib",
  title =        "Printed Text Image Database for {Sindhi} {OCR}",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "21:1--21:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2846093",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Document Image Understanding (DIU) and Electronic
                 Document Management are active fields of research
                 involving image understanding, interpretation,
                 efficient handling, and routing of documents as well as
                 their retrieval. Research on most of the noncursive
                 scripts (Latin) has matured, whereas research on the
                 cursive (connected) scripts is still moving toward
                 perfection. Many researchers are currently working on
                 the cursive scripts (Arabic and other scripts adopting
                 it) around the world so that the difficulties and
                 challenges in document understanding and handling of
                 these scripts can be overcome. Sindhi script has the
                 largest extension of the original Arabic alphabet among
                 languages adopting the Arabic script; it contains 52
                 characters, compared to 28 characters in the original
                 Arabic alphabet, in order to accommodate more sounds
                 for the language. There are 24 differentiating
                  characters, some of which have four dots. For Sindhi
                 OCR research and development, a database is needed for
                 training and testing of Sindhi text images. We have
                 developed a large database containing over 4 billion
                  words and 15 billion characters in 150 different fonts with
                  four font weights and four styles. The database
                 contents were collected from various sources including
                 websites, books, and theses. A custom-built application
                 was also developed to create a text image from a text
                 document that supports various fonts and sizes. The
                 database considers words, characters, characters with
                 spaces, and lines. The database is freely available as
                 a partial or full database by sending an email to one
                 of the authors.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ding:2016:WSB,
  author =       "Chenchen Ding and Ye Kyaw Thu and Masao Utiyama and
                 Eiichiro Sumita",
  title =        "Word Segmentation for {Burmese} ({Myanmar})",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "22:1--22:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2846095",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Experiments on various word segmentation approaches
                 for the Burmese language are conducted and discussed in
                 this note. Specifically, dictionary-based, statistical,
                 and machine learning approaches are tested.
                 Experimental results demonstrate that statistical and
                 machine learning approaches perform significantly
                 better than dictionary-based approaches. We believe
                 that this note, based on an annotated corpus of
                  considerable size (containing approximately
                 a half million words), is the first systematic
                 comparison of word segmentation approaches for Burmese.
                  This work aims to discover the properties of, and proper
                  approaches to, Burmese textual processing and to promote
                  further research on this understudied language.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Zhang:2016:ITP,
  author =       "Tongtao Zhang and Aritra Chowdhury and Nimit Dhulekar
                 and Jinjing Xia and Kevin Knight and Heng Ji and
                 B{\"u}lent Yener and Liming Zhao",
  title =        "From Image to Translation: Processing the Endangered
                 {Nyushu} Script",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "23:1--23:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857052",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The lack of computational support has significantly
                 slowed down automatic understanding of endangered
                 languages. In this paper, we take Nyushu (simplified
                 Chinese: [Chinese characters]; literally: ``women's
                 writing'') as a case study to present the first
                 computational approach that combines Computer Vision
                 and Natural Language Processing techniques to deeply
                 understand an endangered language. We developed an
                 end-to-end system to read a scanned hand-written Nyushu
                 article, segment it into characters, link them to
                 standard characters, and then translate the article
                 into Mandarin Chinese. We propose several novel methods
                 to address the new challenges introduced by noisy input
                 and low resources, including Nyushu-specific feature
                 selection for character segmentation and linking, and
                 character linking lattice based Machine Translation.
                 The end-to-end system performance indicates that the
                 system is a promising approach and can serve as a
                 standard benchmark.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sarigil:2016:SPW,
  author =       "Erdem Sarigil and Oguz Yilmaz and Ismail Sengor
                  Altingovde and Rifat Ozcan and {\"O}zg{\"u}r Ulusoy",
  title =        "A ``Suggested'' Picture of {Web} Search in {Turkish}",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "24:1--24:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2891105",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Although query log analysis provides crucial insights
                 about Web users' search interests, conducting such
                 analyses is almost impossible for some languages, as
                 large-scale and public query logs are quite scarce. In
                 this study, we first survey the existing query
                 collections in Turkish and discuss their limitations.
                 Next, we adopt a novel strategy to obtain a set of
                 Turkish queries using the query autocompletion services
                 from the four major search engines and provide the
                 first large-scale analysis of Web queries and their
                 results in Turkish.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Goswami:2016:CPG,
  author =       "Mukesh M. Goswami and Suman K. Mitra",
  title =        "Classification of Printed Gujarati Characters Using
                 Low-Level Stroke Features",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "25:1--25:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2856105",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article presents an elegant technique for
                 extracting the low-level stroke features, such as
                 endpoints, junction points, line elements, and curve
                 elements, from offline printed text using a template
                 matching approach. The proposed features are used to
                 classify a subset of characters from Gujarati script.
                  The database consists of 16,782 samples
                 of 42 middle-zone symbols from the Gujarati character
                 set collected from three different sources: machine
                 printed books, newspapers, and laser printed documents.
                 The purpose of this division is to add variety in terms
                 of size, font type, style, ink variation, and boundary
                 deformation. The experiments are performed on the
                 database using a k-nearest neighbor (kNN) classifier
                 and results are compared with other widely used
                 structural features, namely Chain Codes (CC),
                 Directional Element Features (DEF), and Histogram of
                 Oriented Gradients (HoG). The results show that the
                 features are quite robust against the variations and
                 give comparable performance with other existing
                 works.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Choudhary:2016:FTA,
  author =       "Prakash Choudhary and Neeta Nain",
  title =        "A Four-Tier Annotated {Urdu} Handwritten Text Image
                 Dataset for Multidisciplinary Research on {Urdu}
                 Script",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "26:1--26:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857053",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article introduces a large handwritten text
                 document image corpus dataset for Urdu script named
                 CALAM (Cursive And Language Adaptive Methodologies).
                 The database contains unconstrained handwritten
                 sentences along with their structural annotations for
                 the offline handwritten text images with their XML
                 representation. Urdu is the fourth most frequently used
                 language in the world, but due to its complex cursive
                 writing script and low resources, it is still a thrust
                 area for document image analysis. Here, a unified
                 approach is applied in the development of an Urdu
                 corpus by collecting printed texts, handwritten texts,
                 and demographic information of writers on a single
                 form. CALAM contains 1,200 handwritten text images,
                 3,043 lines, 46,664 words, and 101,181 ligatures. For
                 capturing maximum variance among the words and
                 handwritten styles, data collection is distributed
                 among six categories and 14 subcategories. Handwritten
                 forms were filled out by 725 different writers
                 belonging to different geographical regions, ages, and
                 genders with diverse educational backgrounds. A
                 structure has been designed to annotate handwritten
                 Urdu script images at line, word, and ligature levels
                 with an XML standard to provide a ground truth of each
                 image at different levels of annotation. This corpus
                 would be very useful for linguistic research in
                 benchmarking and providing a testbed for evaluation of
                 handwritten text recognition techniques for Urdu
                 script, signature verification, writer identification,
                 digital forensics, classification of printed and
                 handwritten text, categorization of texts as per use,
                 and so on. The experimental results of some recently
                 developed handwritten text line segmentation techniques
                  evaluated on the proposed dataset are also presented
                  in the article to demonstrate its viability and
                  usability.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Norimatsu:2016:FCL,
  author =       "Jun-Ya Norimatsu and Makoto Yasuhara and Toru Tanaka
                 and Mikio Yamamoto",
  title =        "A Fast and Compact Language Model Implementation Using
                 Double-Array Structures",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "27:1--27:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2873068",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The language model is a widely used component in
                 fields such as natural language processing, automatic
                 speech recognition, and optical character recognition.
                 In particular, statistical machine translation uses
                 language models, and the translation speed and the
                 amount of memory required are greatly affected by the
                 performance of the language model implementation. We
                  propose a fast and compact implementation of n-gram
                 language models that increases query speed and reduces
                 memory usage by using a double-array structure, which
                 is known to be a fast and compact trie data structure.
                 We propose two types of implementation: one for
                 backward suffix trees and the other for reverse tries.
                 The data structure is optimized for space efficiency by
                 embedding model parameters into otherwise unused spaces
                 in the double-array structure. We show that the reverse
                 trie version of our method is among the smallest
                 state-of-the-art implementations in terms of model size
                 with almost the same speed as the implementation that
                 performs fastest on perplexity calculation tasks.
                 Similarly, we achieve faster decoding while keeping
                 compact model sizes, and we confirm that our method can
                 utilize the efficiency of the double-array structure to
                 achieve a balance between speed and size on translation
                 tasks.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yang:2016:LGF,
  author =       "Haitong Yang and Chengqing Zong",
  title =        "Learning Generalized Features for Semantic Role
                 Labeling",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "28:1--28:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890496",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article makes an effort to improve Semantic Role
                 Labeling (SRL) through learning generalized features.
                 The SRL task is usually treated as a supervised
                 problem. Therefore, a huge set of features are crucial
                 to the performance of SRL systems. But these features
                 often lack generalization powers when predicting an
                 unseen argument. This article proposes a simple
                 approach to relieve the issue. A strong intuition is
                 that arguments occurring in similar syntactic positions
                 are likely to bear the same semantic role, and,
                 analogously, arguments that are lexically similar are
                 likely to represent the same semantic role. Therefore,
                 it will be informative to SRL if syntactic or lexical
                 similar arguments can activate the same feature.
                 Inspired by this, we embed the information of
                 lexicalization and syntax into a feature vector for
                  each argument and then use K-means to cluster all
                  feature vectors of the training set. An unseen
                  argument to be predicted will belong to the same
                  cluster as its similar arguments in the training set.
                 Therefore, the clusters can be thought of as a kind of
                 generalized feature. We evaluate our method on several
                 benchmarks. The experimental results show that our
                 approach can significantly improve the SRL
                 performance.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Bhowmik:2016:BHC,
  author =       "Tapan Kumar Bhowmik and Swapan Kumar Parui and Utpal
                 Roy and Lambert Schomaker",
  title =        "{Bangla} Handwritten Character Segmentation Using
                 Structural Features: a Supervised and Bootstrapping
                 Approach",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890497",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we propose a new framework for
                 segmentation of Bangla handwritten word images into
                 meaningful individual symbols or pseudo-characters.
                  Existing approaches do not usually treat segmentation
                  as a classification problem. In the present study,
                  however, segmentation is treated as a two-class
                  supervised classification problem.
                 The method employs an SVM classifier to select the
                 segmentation points on the word image on the basis of
                 various structural features. For training of the SVM
                 classifier, an unannotated training set is prepared
                 first using candidate segmenting points. The training
                  set is then clustered, and each cluster is labeled
                  with minimal manual intervention. A
                 semi-automatic bootstrapping technique is also employed
                 to enlarge the training set from new samples. The
                 overall architecture describes a basic step toward
                 building an annotation system for the segmentation
                 problem, which has not so far been investigated. The
                 experimental results show that our segmentation method
                 is quite efficient in segmenting not only word images
                 but also handwritten texts. As a part of this work, a
                 database of Bangla handwritten word images has also
                 been developed. Considering our data collection method
                 and a statistical analysis of our lexicon set, we claim
                 that the relevant characteristics of an ideal lexicon
                 set are present in our handwritten word image
                 database.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Singh:2016:OHG,
  author =       "Sukhdeep Singh and Anuj Sharma and Indu Chhabra",
  title =        "Online Handwritten {Gurmukhi} Strokes Dataset Based on
                 Minimal Set of Words",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2896318",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The online handwriting data are an integral part of
                 data analysis and classification research, as collected
                 handwritten data offers many challenges to group
                 handwritten stroke classes. The present work has been
                 done for grouping handwritten strokes from the Indic
                 script Gurmukhi. Gurmukhi is the script of the popular
                 and widely spoken language Punjabi. The present work
                 includes development of the dataset of Gurmukhi words
                 in the context of online handwriting recognition for
                  real-life applications, such as map navigation. We
                 have collected the data of 100 writers from the largest
                 cities in the Punjab region. The writers' variations,
                 such as writing skill level (beginner, moderate, and
                 expert), gender, right or left handedness, and their
                 adaptability to digital handwriting, have been
                 considered in dataset development. We have introduced a
                 novel technique to form handwritten stroke classes
                 based on a limited set of words. The presence of all
                 alphabets including vowels of Gurmukhi script has been
                 considered before selection of a word. The developed
                 dataset includes 39,411 strokes from handwritten words
                 and forms 72 classes of strokes after using a k-means
                 clustering technique and manual verification through
                  expert and moderate writers. Using a Hidden Markov
                  Model, we achieved recognition accuracies of 87.10\%,
                  85.43\%, and 84.33\% for middle-zone strokes when
                  training on 66\%, 50\%, and 80\% of the developed
                  dataset, respectively. The present work is a step in a
                 direction to find groups for unknown handwriting
                 strokes with reasonably higher levels of accuracy.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{El-Fiqi:2016:PCC,
  author =       "Heba El-Fiqi and Eleni Petraki and Hussein A. Abbass",
  title =        "Pairwise Comparative Classification for Translator
                 Stylometric Analysis",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898997",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we present a new type of
                 classification problem, which we call Comparative
                 Classification Problem (CCP), where we use the term
                 data record to refer to a block of instances. Given a
                 single data record with n instances for n classes, the
                 CCP problem is to map each instance to a unique class.
                 This problem occurs in a wide range of applications
                 where the independent and identically distributed
                  assumption breaks down. The primary difference
                 between CCP and classical classification is that in the
                 latter, the assignment of a translator to one record is
                 independent of the assignment of a translator to a
                 different record. In CCP, however, the assignment of a
                 translator to one record within a block excludes this
                 translator from further assignments to any other record
                 in that block. The interdependency in the data poses
                 challenges for techniques relying on the independent
                 and identically distributed (iid) assumption. In the
                 Pairwise CCP (PWCCP), a pair of records is grouped
                 together. The key difference between PWCCP and
                 classical binary classification problems is that hidden
                 patterns can only be unmasked by comparing the
                 instances as pairs. In this article, we introduce a new
                 algorithm, PWC4.5, which is based on C4.5, to manage
                 PWCCP. We first show that a simple transformation-that
                 we call Gradient-Based Transformation (GBT)-can fix the
                 problem of iid in C4.5. We then evaluate PWC4.5 using
                 two real-world corpora to distinguish between
                 translators on Arabic-English and French-English
                 translations. While the traditional C4.5 failed to
                 distinguish between different translators, GBT
                 demonstrated better performance. Meanwhile, PWC4.5
                 consistently provided the best results over C4.5 and
                 GBT.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Qiao:2016:IUD,
  author =       "Xiuming Qiao and Hailong Cao and Tiejun Zhao",
  title =        "Improving Unsupervised Dependency Parsing with
                 Knowledge from Query Logs",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903720",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Unsupervised dependency parsing becomes more and more
                 popular in recent years because it does not need
                 expensive annotations, such as treebanks, which are
                 required for supervised and semi-supervised dependency
                 parsing. However, its accuracy is still far below that
                 of supervised dependency parsers, partly due to the
                  fact that the parsing model is insufficient to
                 capture linguistic phenomena underlying texts. The
                  performance of unsupervised dependency parsing can be
                 improved by mining knowledge from the texts and by
                 incorporating it into the model. In this article,
                 syntactic knowledge is acquired from query logs to help
                 estimate better probabilities in dependency models with
                 valence. The proposed method is language independent
                 and obtains an improvement of 4.1\% unlabeled accuracy
                 on the Penn Chinese Treebank by utilizing additional
                 dependency relations from the Sogou query logs and
                  Baidu query logs. Moreover, experiments show that the
                 proposed model achieves improvements of 8.07\% on CoNLL
                 2007 English using the AOL query logs. We believe query
                 logs are useful sources of syntactic knowledge for many
                 natural language processing (NLP) tasks.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Passban:2016:BNP,
  author =       "Peyman Passban and Qun Liu and Andy Way",
  title =        "Boosting Neural {POS} Tagger for {Farsi} Using
                 Morphological Information",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934676",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Farsi (Persian) is a low-resource language that
                 suffers from the data sparsity problem and a lack of
                 efficient processing tools. Due to their broad
                 application in natural language processing tasks,
                 part-of-speech (POS) taggers are one of those important
                 tools that should be considered in this respect.
                 Despite recent work on Farsi tagging, there is still
                 room for improvement. The best reported accuracy so far
                 is 96\%, which in special cases can rise to 96.9\%. The
                 main problem with existing taggers is their
                 inefficiency in coping with out-of-vocabulary (OOV)
                 words. Addressing both problems of accuracy and OOV
                 words, we developed a neural network-based POS tagger
                 (NPT) that performs efficiently on Farsi. Despite using
                 less data, NPT provides better results in comparison to
                 state-of-the-art systems. Our proposed tagger performs
                 with an accuracy of 97.4\%, with performance highly
                 influenced by morphological features. We carry out a
                 shallow morphological analysis and show considerable
                 improvement over the baseline configuration.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2016:SBM,
  author =       "Liangliang Liu and Cungen Cao",
  title =        "A Seed-Based Method for Generating {Chinese} Confusion
                 Sets",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2933396",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In natural language, people often misuse a word
                 (called a ``confused word'') in place of other words
                 (called ``confusing words''). In misspelling
                 corrections, many approaches to finding and correcting
                 misspelling errors are based on a simple notion called
                 a ``confusion set.'' The confusion set of a confused
                 word consists of confusing words. In this article, we
                 propose a new method of building Chinese character
                 confusion sets. Our method is composed of two major
                 phases. In the first phase, we build a list of seed
                 confusion sets for each Chinese character, which is
                 based on measuring similarity in character pinyin or
                 similarity in character shape. In this phase, all
                 confusion sets are constructed manually, and the
                 confusion sets are organized into a graph, called a
                 ``seed confusion graph'' (SCG), in which vertices
                 denote characters and edges are pairs of characters in
                 the form (confused character, confusing character). In
                 the second phase, we extend the SCG by acquiring more
                 pairs of (confused character, confusing character) from
                 a large Chinese corpus. For this, we use several word
                 patterns (or patterns) to generate new confusion pairs
                 and then verify the pairs before adding them into a
                 SCG. Comprehensive experiments show that our method of
                 extending confusion sets is effective. Also, we shall
                 use the confusion sets in Chinese misspelling
                 corrections to show the utility of our method.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2016:ISP,
  author =       "Junhui Li and Muhua Zhu and Wei Lu and Guodong Zhou",
  title =        "Improving Semantic Parsing with Enriched Synchronous
                 Context-Free Grammars in Statistical Machine
                 Translation",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2963099",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Semantic parsing maps a sentence in natural language
                 into a structured meaning representation. Previous
                 studies show that semantic parsing with synchronous
                 context-free grammars (SCFGs) achieves favorable
                 performance over most other alternatives. Motivated by
                 the observation that the performance of semantic
                 parsing with SCFGs is closely tied to the translation
                  rules, this article explores extending translation
                 rules with high quality and increased coverage in three
                 ways. First, we examine the difference between word
                 alignments for semantic parsing and statistical machine
                 translation (SMT) to better adapt word alignment in SMT
                 to semantic parsing. Second, we introduce both
                 structure and syntax informed nonterminals, better
                 guiding the parsing in favor of well-formed structure,
                  instead of using an uninformed nonterminal in SCFGs.
                 Third, we address the unknown word translation issue
                 via synthetic translation rules. Last but not least, we
                 use a filtering approach to improve performance via
                 predicting answer type. Evaluation on the standard
                 GeoQuery benchmark dataset shows that our approach
                 greatly outperforms the state of the art across various
                 languages, including English, Chinese, Thai, German,
                 and Greek.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Krishnamurthi:2016:UDS,
  author =       "Karthik Krishnamurthi and Vijayapal Reddy Panuganti
                 and Vishnu Vardhan Bulusu",
  title =        "Understanding Document Semantics from Summaries: a
                 Case Study on {Hindi} Texts",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "7:1--7:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2956236",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Summary of a document contains words that actually
                 contribute to the semantics of the document. Latent
                 Semantic Analysis (LSA) is a mathematical model that is
                 used to understand document semantics by deriving a
                 semantic structure based on patterns of word
                 correlations in the document. When using LSA to capture
                 semantics from summaries, it is observed that LSA
                 performs quite well despite being completely
                 independent of any external sources of semantics.
                 However, LSA can be remodeled to enhance its capability
                 to analyze correlations within texts. By taking
                 advantage of the model being language independent, this
                 article presents two stages of LSA remodeling to
                 understand document semantics in the Indian context,
                 specifically from Hindi text summaries. One stage of
                 remodeling is done by providing supplementary
                 information, such as document category and domain
                 information. The second stage of remodeling is done by
                 using a supervised term weighting measure in the
                 process. The remodeled LSA's performance is empirically
                 evaluated in a document classification application by
                 comparing the accuracies of classification to plain
                 LSA. An improvement in the performance of LSA in the
                  range of 4.7\% to 6.2\% is achieved by the remodeled
                  LSA compared to the plain model. The results suggest
                  that summaries of documents efficiently capture the
                  semantic structure of documents and are an alternative
                 to full-length documents for understanding document
                 semantics.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Tursun:2016:STT,
  author =       "Eziz Tursun and Debasis Ganguly and Turghun Osman and
                 Ya-Ting Yang and Ghalip Abdukerim and Jun-Lin Zhou and
                 Qun Liu",
  title =        "A Semisupervised Tag-Transition-Based {Markovian}
                 Model for {Uyghur} Morphology Analysis",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "8:1--8:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2968410",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Morphological analysis, which includes analysis of
                 part-of-speech (POS) tagging, stemming, and morpheme
                 segmentation, is one of the key components in natural
                 language processing (NLP), particularly for
                 agglutinative languages. In this article, we
                 investigate the morphological analysis of the Uyghur
                 language, which is the native language of the people in
                 the Xinjiang Uyghur autonomous region of western China.
                 Morphological analysis of Uyghur is challenging
                 primarily because of factors such as (1) ambiguities
                  arising due to the likelihood of associating multiple
                  POS tags with a word stem or multiple functional tags
                  with a word suffix,
                 (2) ambiguous morpheme boundaries, and (3) complex
                  morphophonology of the language. Further, the
                 unavailability of a manually annotated training set in
                 the Uyghur language for the purpose of word
                 segmentation makes Uyghur morphological analysis more
                 difficult. In our proposed work, we address these
                 challenges by undertaking a semisupervised approach of
                 learning a Markov model with the help of a manually
                 constructed dictionary of ``suffix to tag'' mappings in
                 order to predict the most likely tag transitions in the
                 Uyghur morpheme sequence. Due to the linguistic
                 characteristics of Uyghur, we incorporate a prior
                 belief in our model for favoring word segmentations
                 with a lower number of morpheme units. Empirical
                 evaluation of our proposed model shows an accuracy of
                 about 82\%. We further improve the effectiveness of the
                 tag transition model with an active learning paradigm.
                 In particular, we manually investigated a subset of
                 words for which the model prediction ambiguity was
                 within the top 20\%. Manually incorporating rules to
                 handle these erroneous cases resulted in an overall
                 accuracy of 93.81\%.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Nguyen:2016:ACN,
  author =       "Long H. B. Nguyen and Dien Dinh and Phuoc Tran",
  title =        "An Approach to Construct a Named Entity Annotated
                 {English--Vietnamese} Bilingual Corpus",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990191",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Manually constructing an annotated Named Entity (NE)
                 in a bilingual corpus is a time-consuming,
                  labor-intensive, and expensive process, but this is
                 necessary for natural language processing (NLP) tasks
                 such as cross-lingual information retrieval,
                 cross-lingual information extraction, machine
                 translation, etc. In this article, we present an
                 automatic approach to construct an annotated NE in
                 English-Vietnamese bilingual corpus from a bilingual
                 parallel corpus by proposing an aligned NE method.
                  Basing this corpus on a bilingual corpus in which the
                  initial NEs are extracted from each language
                  separately, the approach tries to correct unrecognized
                  or incorrectly recognized NEs before aligning the
                  NEs using a variety of bilingual constraints. The
                 generated corpus not only improves the NE recognition
                 results but also creates alignments between English NEs
                 and Vietnamese NEs, which are necessary for training NE
                 translation models. The experimental results show that
                 the approach outperforms the baseline methods
                 effectively. In the English-Vietnamese NE alignment
                 task, the F-measure increases from 68.58\% to 79.77\%.
                  Thanks to the improvement in NE recognition
                  quality, the proposed method also improves
                 significantly: the F-measure goes from 84.85\% to
                 88.66\% for the English side and from 75.71\% to
                 85.55\% for the Vietnamese side. By providing the
                 additional semantic information for the machine
                 translation systems, the BLEU score increases from
                 33.04\% to 45.11\%.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Chou:2016:BWN,
  author =       "Chien-Lung Chou and Chia-Hui Chang and Ya-Yun Huang",
  title =        "Boosted {Web} Named Entity Recognition via
                 Tri-Training",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2963100",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Named entity extraction is a fundamental task for many
                 natural language processing applications on the web.
                 Existing studies rely on annotated training data, which
                  is quite expensive to obtain in large quantities, limiting
                 the effectiveness of recognition. In this research, we
                 propose a semisupervised learning approach for web
                 named entity recognition (NER) model construction via
                 automatic labeling and tri-training. The former
                 utilizes structured resources containing known named
                 entities for automatic labeling, while the latter makes
                 use of unlabeled examples to improve the extraction
                 performance. Since this automatically labeled training
                 data may contain noise, a self-testing procedure is
                 used as a follow-up to remove low-confidence annotation
                 and prepare higher-quality training data. Furthermore,
                 we modify tri-training for sequence labeling and derive
                 a proper initialization for large dataset training to
                 improve entity recognition. Finally, we apply this
                 semisupervised learning framework for person name
                 recognition, business organization name recognition,
                 and location name extraction. In the task of Chinese
                 NER, an F-measure of 0.911, 0.849, and 0.845 can be
                 achieved, for person, business organization, and
                 location NER, respectively. The same framework is also
                 applied for English and Japanese business organization
                  name recognition and obtains models with F-measures of
                  0.832 and 0.803, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sadek:2016:DBA,
  author =       "Jawad Sadek and Farid Meziane",
  title =        "A Discourse-Based Approach for {Arabic} Question
                 Answering",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988238",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The treatment of complex questions with explanatory
                 answers involves searching for arguments in texts.
                 Because of the prominent role that discourse relations
                 play in reflecting text producers' intentions,
                  capturing the underlying structure of text provides
                  good guidance for this task. From our extensive
                 review, a system for automatic discourse analysis that
                 creates full rhetorical structures in large-scale
                 Arabic texts is currently unavailable. This is due to
                 the high computational complexity involved in
                 processing a large number of hypothesized relations
                 associated with large texts. Therefore, more practical
                 approaches should be investigated. This article
                 presents a new Arabic Text Parser oriented for
                 question-answering systems dealing with [Arabic
                 characters] ``why'' and [Arabic characters] ``how to''
                 questions. The Text Parser presented here considers the
                 sentence as the basic unit of text and incorporates a
                 set of heuristics to avoid computational explosion.
                 With this approach, the developed question-answering
                 system reached a significant improvement over the
                 baseline with a Recall of 68\% and MRR of 0.62.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Tran:2016:WRS,
  author =       "Phuoc Tran and Dien Dinh and Long H. B. Nguyen",
  title =        "Word Re-Segmentation in {Chinese--Vietnamese} Machine
                 Translation",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988237",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In isolated languages, such as Chinese and Vietnamese,
                 words are not separated by spaces, and a word may be
                 formed by one or more syllables. Therefore, word
                 segmentation (WS) is usually the first process that is
                 implemented in the machine translation process. WS in
                 the source and target languages is based on different
                 training corpora, and WS approaches may not be the
                  same. Therefore, the WS results in these two
                  languages are often not homologous, and thus word
                 alignment results in many 1-n and n-1 alignment pairs
                 in statistical machine translation, which degrades the
                 performance of machine translation. In this article, we
                 will adjust the WS for both Chinese and Vietnamese in
                  particular and for isolating language pairs in general
                 and make the word boundary of the two languages more
                 symmetric in order to strengthen 1-1 alignments and
                 enhance machine translation performance. We have tested
                 this method on the Computational Linguistics Center's
                 corpus, which consists of 35,623 sentence pairs. The
                 experimental results show that our method has
                 significantly improved the performance of machine
                 translation compared to the baseline translation
                 system, WS translation system, and anchor
                 language-based WS translation systems.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2016:MSC,
  author =       "Peifeng Li and Guodong Zhou and Qiaoming Zhu",
  title =        "Minimally Supervised {Chinese} Event Extraction from
                 Multiple Views",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2994600",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Although several semi-supervised learning models have
                 been proposed for English event extraction, there are
                  few success stories in Chinese due to its special
                 characteristics. In this article, we propose a novel
                 minimally supervised model for Chinese event extraction
                 from multiple views. Besides the traditional pattern
                 similarity view (PSV), a semantic relationship view
                 (SRV) is introduced to capture the relevant event
                 mentions from relevant documents. Moreover, a
                 morphological structure view (MSV) is incorporated to
                 both infer more positive patterns and help filter
                 negative patterns via morphological structure
                 similarity. An evaluation of the ACE 2005 Chinese
                 corpus shows that our minimally supervised model
                 significantly outperforms several strong baselines.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Atreya:2016:QER,
  author =       "Arjun {Atreya V} and Ashish Kankaria and Pushpak
                 Bhattacharyya and Ganesh Ramakrishnan",
  title =        "Query Expansion in Resource-Scarce Languages: a
                 Multilingual Framework Utilizing Document Structure",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997643",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Retrievals in response to queries to search engines in
                 resource-scarce languages often produce no results,
                 which annoys the user. In such cases, at least
                 partially relevant documents must be retrieved. We
                 propose a novel multilingual framework, MultiStructPRF,
                 which expands the query with related terms by (i) using
                 a resource-rich assisting language and (ii) giving
                 varied importance to the expansion terms depending on
                 their position of occurrence in the document. Our
                 system uses the help of an assisting language to expand
                 the query in order to improve system recall. We propose
                 a systematic expansion model for weighting the
                 expansion terms coming from different parts of the
                 document. To combine the expansion terms from query
                 language and assisting language, we propose a
                 heuristics-based fusion model. Our experimental results
                 show an improvement over other PRF techniques in both
                 precision and recall for multiple resource-scarce
                 languages like Marathi, Bengali, Odia, Finnish, and the
                 like. We study the effect of different assisting
                 languages on precision and recall for multiple query
                 languages. Our experiments reveal an interesting fact:
                 Precision is positively correlated with the typological
                 closeness of query language and assisting language,
                 whereas recall is positively correlated with the
                 resource richness of the assisting language.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Finch:2017:IBL,
  author =       "Andrew Finch and Taisuke Harada and Kumiko
                 Tanaka-Ishii and Eiichiro Sumita",
  title =        "Inducing a Bilingual Lexicon from Short Parallel
                 Multiword Sequences",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3003726",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article proposes a technique for mining bilingual
                 lexicons from pairs of parallel short word sequences.
                 The technique builds a generative model from a corpus
                 of training data consisting of such pairs. The model is
                 a hierarchical nonparametric Bayesian model that
                 directly induces a bilingual lexicon while training.
                 The model learns in an unsupervised manner and is
                 designed to exploit characteristics of the language
                 pairs being mined. The proposed model is capable of
                 utilizing commonly used word-pair frequency information
                 and additionally can employ the internal character
                 alignments within the words themselves. It is thereby
                 capable of mining transliterations and can use reliably
                 aligned transliteration pairs to support the mining of
                 other words in their context. The model is also capable
                 of performing word reordering and word deletion during
                 the alignment process, and it is furthermore capable of
                 operating in the absence of full segmentation
                 information. In this work, we study two mining tasks
                 based on English-Japanese and English--Chinese language
                 pairs, and compare the proposed approach to baselines
                  based on simpler models that use only word-pair
                 frequency information. Our results show that the
                 proposed method is able to mine bilingual word pairs at
                 higher levels of precision and recall than the
                 baselines.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2017:CSC,
  author =       "Shaonan Wang and Chengqing Zong",
  title =        "Comparison Study on Critical Components in Composition
                 Model for Phrase Representation",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3010088",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Phrase representation, an important step in many NLP
                 tasks, involves representing phrases as
                 continuous-valued vectors. This article presents
                 detailed comparisons concerning the effects of word
                 vectors, training data, and the composition and
                 objective function used in a composition model for
                 phrase representation. Specifically, we first discuss
                 how the augmented word representations affect the
                 performance of the composition model. Then, we
                 investigate whether different types of training data
                 influence the performance of the composition model and,
                 if so, how they influence it. Finally, we evaluate
                 combinations of different composition and objective
                 functions and discuss the factors related to
                 composition model performance. All evaluations were
                 conducted in both English and Chinese. Our main
                 findings are as follows: (1) The Additive model with
                 semantic enhanced word vectors performs comparably to
                  the state-of-the-art model; (2) The Additive model,
                  which updates augmented word vectors, and the Matrix
                  model with semantic enhanced word vectors
                  systematically outperform the state-of-the-art model
                  in the bigram and multi-word phrase similarity tasks,
                  respectively; (3) Representing high-frequency
                  phrases by estimating their surrounding contexts is a
                 good training objective for bigram phrase similarity
                 tasks; and (4) The performance gain of composition
                 model with semantic enhanced word vectors is due to the
                 composition function and the greater weight attached to
                 important words. Previous works focus on the
                 composition function; however, our findings indicate
                 that other components in the composition model
                 (especially word representation) make a critical
                 difference in phrase representation.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Bhat:2017:ITB,
  author =       "Riyaz Ahmad Bhat and Irshad Ahmad Bhat and Dipti Misra
                 Sharma",
  title =        "Improving Transition-Based Dependency Parsing of
                 {Hindi} and {Urdu} by Modeling Syntactically Relevant
                 Phenomena",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3005447",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In recent years, transition-based parsers have shown
                 promise in terms of efficiency and accuracy. Though
                 these parsers have been extensively explored for
                 multiple Indian languages, there is still considerable
                 scope for improvement by properly incorporating
                 syntactically relevant information. In this article, we
                 enhance transition-based parsing of Hindi and Urdu by
                 redefining the features and feature extraction
                 procedures that have been previously proposed in the
                 parsing literature of Indian languages. We propose and
                 empirically show that properly incorporating
                  syntactically relevant information such as case
                  marking, complex predication, and grammatical
                  agreement in an arc-eager parsing model can
                  significantly improve parsing accuracy. Our
                  experiments show an absolute improvement of roughly
                  2\% LAS for parsing both Hindi and Urdu over a
                  competitive baseline that uses rich features such as
                  part-of-speech (POS) tags, chunk tags, cluster IDs,
                  and lemmas. We also propose heuristics for
                  identifying ezafe constructions in Urdu text, which
                  show promising results in parsing these
                 constructions.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}
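
%%% The entry above improves arc-eager parsing of Hindi and Urdu
%%% mainly by adding syntactically relevant features such as case
%%% marking and grammatical agreement.  The sketch below is a
%%% hypothetical Python illustration, not the authors' feature set:
%%% it shows how such features might be read off an arc-eager
%%% configuration (stack top s0, buffer front b0) from per-token
%%% morphological annotations.
%%%
%%%   def extract_features(stack, buffer, morph):
%%%       # morph[token] is a dict such as
%%%       # {"lemma": "...", "case": "erg", "gender": "m",
%%%       #  "number": "sg"}  (illustrative attribute names)
%%%       feats = {}
%%%       if stack:
%%%           s0 = stack[-1]
%%%           feats["s0.case"] = morph[s0].get("case", "_")
%%%           feats["s0.lemma"] = morph[s0].get("lemma", "_")
%%%       if buffer:
%%%           b0 = buffer[0]
%%%           feats["b0.case"] = morph[b0].get("case", "_")
%%%           feats["b0.lemma"] = morph[b0].get("lemma", "_")
%%%       if stack and buffer:
%%%           s0, b0 = stack[-1], buffer[0]
%%%           # Agreement features between candidate attachment sites.
%%%           feats["agree.gender"] = str(
%%%               morph[s0].get("gender") == morph[b0].get("gender"))
%%%           feats["agree.number"] = str(
%%%               morph[s0].get("number") == morph[b0].get("number"))
%%%       return feats
%%%
%%% Features of this kind would feed the classifier that chooses the
%%% next arc-eager transition (SHIFT, REDUCE, LEFT-ARC, or RIGHT-ARC).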

@Article{Das:2017:NER,
  author =       "Arjun Das and Debasis Ganguly and Utpal Garain",
  title =        "Named Entity Recognition with Word Embeddings and
                 {Wikipedia} Categories for a Low-Resource Language",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3015467",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we propose a word embedding--based
                 named entity recognition (NER) approach. NER is
                 commonly approached as a sequence labeling task with
                 the application of methods such as conditional random
                 field (CRF). However, for low-resource languages
                 without the presence of sufficiently large training
                 data, methods such as CRF do not perform well. In our
                 work, we make use of the proximity of the vector
                 embeddings of words to approach the NER problem. The
                 hypothesis is that word vectors belonging to the same
                 name category, such as a person's name, occur in close
                 vicinity in the abstract vector space of the embedded
                 words. Assuming that this clustering hypothesis is
                 true, we apply a standard classification approach on
                 the vectors of words to learn a decision boundary
                 between the NER classes. Our NER experiments are
                 conducted on a morphologically rich and low-resource
                 language, namely Bengali. Our approach significantly
                 outperforms standard baseline CRF approaches that use
                 cluster labels of word embeddings and gazetteers
                  constructed from Wikipedia. Further, we propose an
                  unsupervised approach that uses a named entity (NE)
                  gazetteer automatically created from Wikipedia in
                  the absence of training data. For a low-resource
                  language, the word vectors obtained from Wikipedia
                  are not sufficient to train a classifier. We
                  therefore use the distance between the vector
                  embeddings of words to expand the set of Wikipedia
                  training examples with additional NEs extracted from
                  a monolingual corpus, which yields a significant
                  improvement in unsupervised NER performance. In
                  fact, our expansion method performs better than the
                  traditional CRF-based (supervised) approach (F-score
                  of 65.4\% vs. 64.2\%). Finally, we compare our
                  proposed approach to the official submissions for
                  the IJCNLP-2008 Bengali NER shared task and achieve
                  an overall F-score improvement of 11.26\% over the
                  best official system.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}
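
%%% The approach in the entry above treats NER as classification in
%%% word-embedding space: words of the same NE class are assumed to
%%% lie close together, so a standard classifier can separate them.
%%% The sketch below is a minimal, hypothetical illustration using
%%% scikit-learn; the embeddings and seed gazetteer are assumed
%%% inputs, not the authors' resources.
%%%
%%%   import numpy as np
%%%   from sklearn.linear_model import LogisticRegression
%%%
%%%   def train_embedding_ner(embeddings, gazetteer):
%%%       # embeddings: word -> vector; gazetteer: word -> NE class
%%%       # (e.g., seed names harvested from Wikipedia categories).
%%%       words = [w for w in gazetteer if w in embeddings]
%%%       X = np.array([embeddings[w] for w in words])
%%%       y = [gazetteer[w] for w in words]
%%%       return LogisticRegression(max_iter=1000).fit(X, y)
%%%
%%%   def tag(clf, embeddings, tokens, fallback="O"):
%%%       # Out-of-vocabulary tokens fall back to the non-entity tag.
%%%       return [clf.predict([embeddings[t]])[0] if t in embeddings
%%%               else fallback for t in tokens]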

@Article{Li:2017:IDR,
  author =       "Haoran Li and Jiajun Zhang and Chengqing Zong",
  title =        "Implicit Discourse Relation Recognition for {English}
                 and {Chinese} with Multiview Modeling and Effective
                 Representation Learning",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3028772",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Discourse relations between two text segments play an
                 important role in many Natural Language Processing
                  (NLP) tasks. Connectives strongly indicate the sense
                  of a discourse relation, yet a large proportion of
                  discourse relations contain no connective at all;
                  these are the implicit discourse relations. Compared
                  with explicit relations, implicit relations are much
                  harder to detect and have drawn significant
                  attention. Until now, many studies have focused on
                  English implicit discourse relations, while few
                  address implicit relation recognition in Chinese,
                  even though implicit discourse relations are more
                  common in Chinese than in English. In our work, we
                  focus on both English and Chinese. The key to
                  implicit relation prediction is to
                 properly model the semantics of the two discourse
                 arguments, as well as the contextual interaction
                 between them. To achieve this goal, we propose a neural
                 network based framework that consists of two
                 hierarchies. The first one is the model hierarchy, in
                 which we propose a max-margin learning method to
                 explore the implicit discourse relation from multiple
                 views. The second one is the feature hierarchy, in
                 which we learn multilevel distributed representations
                 from words, arguments, and syntactic structures to
                 sentences. We have conducted experiments on the
                  standard English and Chinese benchmarks, and the
                  results show that, compared with several methods,
                  our proposed method achieves the best performance in
                  most cases.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}
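
%%% The model hierarchy in the entry above uses max-margin learning to
%%% combine several views of an argument pair.  The sketch below shows
%%% a standard multiclass hinge (max-margin) loss and a simple
%%% averaging of per-view scores; it is a generic illustration under
%%% an assumed equal view weighting, not the authors' network.
%%%
%%%   import numpy as np
%%%
%%%   def combined_scores(view_scores):
%%%       # view_scores: list of per-view score vectors over the
%%%       # candidate discourse relations (equal weighting assumed).
%%%       return np.mean(np.array(view_scores), axis=0)
%%%
%%%   def max_margin_loss(scores, gold, margin=1.0):
%%%       # The gold relation must outscore every other relation by at
%%%       # least `margin`; the largest violation is penalized.
%%%       violations = [max(0.0, margin + s - scores[gold])
%%%                     for r, s in enumerate(scores) if r != gold]
%%%       return max(violations) if violations else 0.0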

@Article{Tholpadi:2017:CBT,
  author =       "Goutham Tholpadi and Chiranjib Bhattacharyya and
                 Shirish Shevade",
  title =        "Corpus-Based Translation Induction in {Indian}
                 Languages Using Auxiliary Language Corpora from
                 {Wikipedia}",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3038295",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Identifying translations from comparable corpora is a
                 well-known problem with several applications. Existing
                 methods rely on linguistic tools or high-quality
                 corpora. Absence of such resources, especially in
                 Indian languages, makes this problem hard; for example,
                 state-of-the-art techniques achieve a mean reciprocal
                 rank of 0.66 for English--Italian, and a mere 0.187 for
                  Telugu--Kannada. In this work, we address the problem of
                 comparable corpora-based translation correspondence
                 induction (CC-TCI) when the only resources available
                 are small noisy comparable corpora extracted from
                 Wikipedia. We observe that translations in the source
                 and target languages have many topically related words
                 in common in other ``auxiliary'' languages. To model
                 this, we define the notion of a translingual theme, a
                 set of topically related words from auxiliary language
                 corpora, and present a probabilistic framework for
                 CC-TCI. Extensive experiments on 35 comparable corpora
                 showed dramatic improvements in performance. We extend
                 these ideas to propose a method for measuring
                 cross-lingual semantic relatedness (CLSR) between
                 words. To stimulate further research in this area, we
                 make publicly available two new high-quality
                 human-annotated datasets for CLSR. Experiments on the
                 CLSR datasets show more than 200\% improvement in
                 correlation on the CLSR task. We apply the method to
                 the real-world problem of cross-lingual Wikipedia title
                 suggestion and build the WikiTSu system. A user study
                 on WikiTSu shows a 20\% improvement in the quality of
                 titles suggested.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}
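
%%% The entry above reports results as mean reciprocal rank (MRR).
%%% For reference, the sketch below computes this standard metric over
%%% ranked candidate translation lists; it is illustrative and not
%%% taken from the paper.
%%%
%%%   def mean_reciprocal_rank(ranked_candidates, gold):
%%%       # ranked_candidates: one ranked list of candidate
%%%       # translations per query word; gold: the correct translation
%%%       # for each query, aligned with the outer list.
%%%       total = 0.0
%%%       for cands, g in zip(ranked_candidates, gold):
%%%           for rank, c in enumerate(cands, start=1):
%%%               if c == g:
%%%                   total += 1.0 / rank
%%%                   break
%%%       return total / len(gold)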

@Article{Zhao:2017:HMC,
  author =       "Hai Zhao and Deng Cai and Yang Xin and Yuzhu Wang and
                 Zhongye Jia",
  title =        "A Hybrid Model for {Chinese} Spelling Check",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3047405",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Spelling check for Chinese has more challenging
                 difficulties than that for other languages. A hybrid
                 model for Chinese spelling check is presented in this
                 article. The hybrid model consists of three components:
                 one graph-based model for generic errors and two
                 independently trained models for specific errors. In
                 the graph model, a directed acyclic graph is generated
                 for each sentence, and the single-source shortest-path
                 algorithm is performed on the graph to detect and
                  correct general spelling errors at the same time.
                  Before that, two types of errors involving
                  functional words (characters) are resolved by
                  conditional random fields: the confusion of
                  ``[Chinese characters]'' (at) (pinyin: zai),
                  ``[Chinese characters]''
                 (again, more, then) (pinyin: zai) and ``[Chinese
                 characters]'' (of) (pinyin: de), ``[Chinese
                 characters]'' (- ly, adverb-forming particle) (pinyin:
                 de), and ``[Chinese characters]'' (so that, have to)
                 (pinyin: de). Finally, a rule-based model is exploited
                 to distinguish pronoun usage confusion: ``[Chinese
                 characters]'' (she) (pinyin: ta), ``[Chinese
                 characters]'' (he) (pinyin: ta), and some other common
                 collocation errors. The proposed model is evaluated on
                 the standard datasets released by the SIGHAN Bake-off
                 shared tasks, giving state-of-the-art results.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}
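
%%% The graph-based component in the entry above casts spelling
%%% correction as a single-source shortest-path search over a directed
%%% acyclic graph of correction candidates.  The sketch below is a
%%% simplified, hypothetical illustration: one candidate set per
%%% character position and a caller-supplied bigram-style edge cost
%%% (for example, a negative log probability); it is not the authors'
%%% implementation.
%%%
%%%   def best_correction(sentence, candidates, cost):
%%%       # sentence:   list of characters
%%%       # candidates: candidates[i] = possible replacements for
%%%       #             position i (including the original character)
%%%       # cost(p, c): non-negative edge weight from previous char p
%%%       #             to char c; p is None at the sentence start
%%%       best = [{} for _ in sentence]
%%%       for c in candidates[0]:
%%%           best[0][c] = (cost(None, c), None)
%%%       for i in range(1, len(sentence)):
%%%           for c in candidates[i]:
%%%               total, prev = min((best[i - 1][p][0] + cost(p, c), p)
%%%                                 for p in best[i - 1])
%%%               best[i][c] = (total, prev)
%%%       # Trace back from the cheapest final node.
%%%       last = min(best[-1], key=lambda c: best[-1][c][0])
%%%       out = [last]
%%%       for i in range(len(sentence) - 1, 0, -1):
%%%           last = best[i][last][1]
%%%           out.append(last)
%%%       return list(reversed(out))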