%%% Valid HTML 4.0! Valid CSS!
%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.05",
%%%     date            = "03 March 2020",
%%%     time            = "09:13:43 MST",
%%%     filename        = "tallip.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "49434 9369 51624 490842",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Asian and Low-Resource
%%%                        Language Information Processing (TALLIP);
%%%                        bibliography; BibTeX; TALLIP",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Asian and Low-Resource
%%%                        Language Information Processing (TALLIP)
%%%                        (CODEN none, ISSN 2375-4699 (print),
%%%                        2375-4702 (electronic)).  Publication began
%%%                        with volume 14, number 1, in 2015 as a
%%%                        continuation of the predecessor journal,
%%%                        ACM Transactions on Asian language
%%%                        information processing (TALIP), which is
%%%                        covered in a separate bibliography, talip.bib.
%%%
%%%                        The journal has a World Wide Web site at
%%%
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J1521
%%%
%%%                        At version 1.05, the year coverage looked
%%%                        like this:
%%%
%%%                             2015 (  19)    2017 (  23)    2019 (  48)
%%%                             2016 (  43)    2018 (  27)    2020 (  46)
%%%
%%%                             Article:        206
%%%
%%%                             Total entries:  206
%%%
%%%                        This bibliography has been constructed
%%%                        primarily from the publisher Web site.
%%%
%%%                        Numerous errors in the sources noted above
%%%                        have been corrected.  Spelling has been
%%%                        verified with the UNIX spell and GNU ispell
%%%                        programs using the exception dictionary
%%%                        stored in the companion file with extension
%%%                        .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen as
%%%                        name:year:abbrev, where name is the family
%%%                        name of the first author or editor, year is a
%%%                        4-digit number, and abbrev is a 3-letter
%%%                        condensation of important title words.
%%%                        Citation labels were automatically generated
%%%                        by software developed for the BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, with the help of
%%%                        ``bibsort -byvolume''.  The bibsort utility
%%%                        is available from ftp.math.utah.edu in
%%%                        /pub/tex/bib.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
@Preamble{
    "\hyphenation{ }"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-TALLIP                = "ACM Transactions on Asian and Low-Resource
                                  Language Information Processing (TALLIP)"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Uematsu:2015:IMD,
  author =       "Sumire Uematsu and Takuya Matsuzaki and Hiroki Hanaoka
                 and Yusuke Miyao and Hideki Mima",
  title =        "Integrating Multiple Dependency Corpora for Inducing
                 Wide-Coverage {Japanese} {CCG} Resources",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2658997",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "A novel method to induce wide-coverage Combinatory
                 Categorial Grammar (CCG) resources for Japanese is
                 proposed in this article. For some languages including
                 English, the availability of large annotated corpora
                 and the development of data-based induction of
                 lexicalized grammar have enabled deep parsing, i.e.,
                 parsing based on lexicalized grammars. However, deep
                 parsing for Japanese has not been widely studied. This
                 is mainly because most Japanese syntactic resources are
                 represented in chunk-based dependency structures, while
                 previous methods for inducing grammars are dependent on
                 tree corpora. To translate syntactic information
                 presented in chunk-based dependencies to phrase
                 structures as accurately as possible, integration of
                 annotation from multiple dependency-based corpora is
                 proposed. Our method first integrates dependency
                 structures and predicate-argument information and
                 converts them into phrase structure trees. The trees
                 are then transformed into CCG derivations in a similar
                 way to previously proposed methods. The quality of the
                 conversion is empirically evaluated in terms of the
                 coverage of the obtained CCG lexicon and the accuracy
                 of the parsing with the grammar. While the transforming
                 process used in this study is specialized for Japanese,
                 the framework of our method would be applicable to
                 other languages for which dependency-based analysis has
                 been regarded as more appropriate than phrase
                 structure-based analysis due to morphosyntactic
                 features.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ramrakhiyani:2015:ATE,
  author =       "Nitin Ramrakhiyani and Prasenjit Majumder",
  title =        "Approaches to Temporal Expression Recognition in
                 {Hindi}",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629574",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Temporal annotation of plain text is considered a
                 useful component of modern information retrieval tasks.
                 In this work, different approaches for identification
                 and classification of temporal expressions in Hindi are
                 developed and analyzed. First, a rule-based approach is
                 developed, which takes plain text as input and based on
                 a set of hand-crafted rules, produces a tagged output
                 with identified temporal expressions. This approach
                 performs with a strict F1-measure of 0.83. In another
                 approach, a CRF-based classifier is trained with human
                 tagged data and is then tested on a test dataset. The
                 trained classifier identifies the time expressions from
                 plain text and further classifies them to various
                 classes. This approach performs with a strict
                 F1-measure of 0.78. Next, the CRF is replaced by an
                 SVM-based classifier and the same experiment is
                 performed with the same features. This approach is
                 shown to be comparable to the CRF and performs with a
                 strict F1-measure of 0.77. Using the rule base
                 information as an additional feature enhances the
                 performances to 0.86 and 0.84 for the CRF and SVM
                 respectively. With three different comparable systems
                 performing the extraction task, merging them to take
                 advantage of their positives is the next step. As the
                 first merge experiment, rule-based tagged data is fed
                 to the CRF and SVM classifiers as additional training
                 data. Evaluation results report an increase in
                 F1-measure of the CRF from 0.78 to 0.8. Second, a
                 voting-based approach is implemented, which chooses the
                 best class for each token from the outputs of the three
                 approaches. This approach results in the best
                 performance for this task with a strict F1-measure of
                 0.88. In this process a reusable gold standard dataset
                 for temporal tagging in Hindi is also developed. Named
                 the ILTIMEX2012 corpus, it consists of 300 manually
                 tagged Hindi news documents.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kumari:2015:ITD,
  author =       "B. Venkata Seshu Kumari and Ramisetty Rajeshwara Rao",
  title =        "Improving {Telugu} Dependency Parsing using
                 Combinatory Categorial Grammar Supertags",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2693190.2693191",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "We show that Combinatory Categorial Grammar (CCG)
                 supertags can improve Telugu dependency parsing. In
                 this process, we first extract a CCG lexicon from the
                 dependency treebank. Using both the CCG lexicon and the
                 dependency treebank, we create a CCG treebank using a
                 chart parser. Exploring different morphological
                 features of Telugu, we develop a supertagger using
                 maximum entropy models. We provide CCG supertags as
                 features to the Telugu dependency parser (MST parser).
                 We get an improvement of 1.8\% in the unlabelled
                 attachment score and 2.2\% in the labelled attachment
                 score. Our results show that CCG supertags improve the
                 MST parser, especially on verbal arguments for which it
                 has weak rates of recovery.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ketui:2015:EBA,
  author =       "Nongnuch Ketui and Thanaruk Theeramunkong and
                 Chutamanee Onsuwan",
  title =        "An {EDU}-Based Approach for {Thai} Multi-Document
                 Summarization and Its Application",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641567",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Due to lack of a word/phrase/sentence boundary,
                 summarization of Thai multiple documents has several
                 challenges in unit segmentation, unit selection,
                 duplication elimination, and evaluation dataset
                 construction. In this article, we introduce Thai
                 Elementary Discourse Units (TEDUs) and their
                 derivatives, called Combined TEDUs (CTEDUs), and then
                 present our three-stage method of Thai multi-document
                 summarization, that is, unit segmentation, unit-graph
                 formulation, and unit selection and summary generation.
                 To examine performance of our proposed method, a number
                 of experiments are conducted using 50 sets of Thai news
                 articles with their manually constructed reference
                 summaries. Based on measures of ROUGE-1, ROUGE-2, and
                 ROUGE-SU4, the experimental results show that: (1) the
                 TEDU-based summarization outperforms paragraph-based
                 summarization; (2) our proposed graph-based TEDU
                 weighting with importance-based selection achieves the
                 best performance; and (3) unit duplication
                 consideration and weight recalculation help improve
                 summary quality.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sproat:2015:TPE,
  author =       "Richard Sproat",
  title =        "{TALLIP} Perspectives: Editorial Commentary: The
                 Broadened Focus of the Journal",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710043",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:48 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Shen:2015:MGA,
  author =       "Han-ping Shen and Chung-hsien Wu and Pei-shan Tsai",
  title =        "Model Generation of Accented Speech using Model
                 Transformation and Verification for Bilingual Speech
                 Recognition",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2661637",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Nowadays, bilingual or multilingual speech recognition
                 is confronted with the accent-related problem caused by
                 non-native speech in a variety of real-world
                 applications. Accent modeling of non-native speech is
                 definitely challenging, because the acoustic properties
                 in highly-accented speech pronounced by non-native
                 speakers are quite divergent. The aim of this study is
                 to generate highly Mandarin-accented English models for
                 speakers whose mother tongue is Mandarin. First, a
                 two-stage, state-based verification method is proposed
                 to extract the state-level, highly-accented speech
                 segments automatically. Acoustic features and
                 articulatory features are successively used for robust
                 verification of the extracted speech segments. Second,
                 Gaussian components of the highly-accented speech
                 models are generated from the corresponding Gaussian
                 components of the native speech models using a linear
                 transformation function. A decision tree is constructed
                 to categorize the transformation functions and used for
                 transformation function retrieval to deal with the data
                 sparseness problem. Third, a discrimination function is
                 further applied to verify the generated accented
                 acoustic models. Finally, the successfully verified
                 accented English models are integrated into the native
                 bilingual phone model set for Mandarin-English
                 bilingual speech recognition. Experimental results show
                 that the proposed approach can effectively alleviate
                 recognition performance degradation due to accents and
                 can obtain absolute improvements of 4.1\%, 1.8\%, and
                 2.7\% in word accuracy for bilingual speech recognition
                 compared to that using traditional ASR approaches,
                 MAP-adapted, and MLLR-adapted ASR methods,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Awajan:2015:KEA,
  author =       "Arafat Awajan",
  title =        "Keyword Extraction from {Arabic} Documents using Term
                 Equivalence Classes",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2665077",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The rapid growth of the Internet and other computing
                 facilities in recent years has resulted in the creation
                 of a large amount of text in electronic form, which has
                 increased the interest in and importance of different
                 automatic text processing applications, including
                 keyword extraction and term indexing. Although keywords
                 are very useful for many applications, most documents
                 available online are not provided with keywords. We
                 describe a method for extracting keywords from Arabic
                 documents. This method identifies the keywords by
                 combining linguistics and statistical analysis of the
                 text without using prior knowledge from its domain or
                 information from any related corpus. The text is
                 preprocessed to extract the main linguistic
                 information, such as the roots and morphological
                 patterns of derivative words. A cleaning phase is then
                 applied to eliminate the meaningless words from the
                 text. The most frequent terms are clustered into
                 equivalence classes in which the derivative words
                 generated from the same root and the non-derivative
                 words generated from the same stem are placed together,
                 and their count is accumulated. A vector space model is
                 then used to capture the most frequent N-gram in the
                 text. Experiments carried out using a real-world
                 dataset show that the proposed method achieves good
                 results with an average precision of 31\% and average
                 recall of 53\% when tested against manually assigned
                 keywords.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sundaram:2015:BLM,
  author =       "Suresh Sundaram and A. G. Ramakrishnan",
  title =        "Bigram Language Models and Reevaluation Strategy for
                 Improved Recognition of Online Handwritten {Tamil}
                 Words",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2671014",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article describes a postprocessing strategy for
                 online, handwritten, isolated Tamil words.
                 Contributions have been made with regard to two issues
                 hardly addressed in the online Indic word recognition
                 literature, namely, use of (1) language models
                 exploiting the idiosyncrasies of Indic scripts and (2)
                 expert classifiers for the disambiguation of confused
                 symbols. The input word is first segmented into its
                 individual symbols, which are recognized using a
                 primary support vector machine (SVM) classifier.
                 Thereafter, we enhance the recognition accuracy by
                 utilizing (i) a bigram language model at the symbol or
                 character level and (ii) expert classifiers for
                 reevaluating and disambiguating the different sets of
                 confused symbols. The symbol-level bigram model is used
                 in a traditional Viterbi framework. The concept of a
                 character comprising multiple symbols is unique to
                 Dravidian languages such as Tamil. This multi-symbol
                 feature of Tamil characters has been exploited in
                 proposing a novel, prefix-tree-based character-level
                 bigram model that does not use Viterbi search; rather
                 it reduces the search space for each input symbol based
                 on its left context. For disambiguating confused
                 symbols, a dynamic time-warping approach is proposed to
                 automatically identify the parts of the online trace
                 that discriminates between the confused classes. Fine
                 classification of these regions by dedicated expert
                 SVMs reduces the extent of confusions between such
                 symbols. The integration of segmentation,
                 prefix-tree-based language model and disambiguation of
                 confused symbols is presented on a set of 15,000
                 handwritten isolated online Tamil words. Our results
                 show recognition accuracies of 93.0\% and 81.6\% at the
                 symbol and word level, respectively, as compared to the
                 baseline classifier performance of 88.4\% and 65.1\%,
                 respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Zhang:2015:TMT,
  author =       "Jiajun Zhang and Shujie Liu and Mu Li and Ming Zhou
                 and Chengqing Zong",
  title =        "Towards Machine Translation in Semantic Vector Space",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "2",
  pages =        "9:1--9:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699927",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Measuring the quality of the translation rules and
                 their composition is an essential issue in the
                 conventional statistical machine translation (SMT)
                 framework. To express the translation quality, the
                 previous lexical and phrasal probabilities are
                 calculated only according to the co-occurrence
                 statistics in the bilingual corpus and may be not
                 reliable due to the data sparseness problem. To address
                 this issue, we propose measuring the quality of the
                 translation rules and their composition in the semantic
                 vector embedding space (VES). We present a recursive
                 neural network (RNN)-based translation framework, which
                 includes two submodels. One is the
                 bilingually-constrained recursive auto-encoder, which
                 is proposed to convert the lexical translation rules
                 into compact real-valued vectors in the semantic VES.
                 The other is a type-dependent recursive neural network,
                 which is proposed to perform the decoding process by
                 minimizing the semantic gap (meaning distance) between
                 the source language string and its translation
                 candidates at each state in a bottom-up structure. The
                 RNN-based translation model is trained using a
                 max-margin objective function that maximizes the margin
                 between the reference translation and the n-best
                 translations in forced decoding. In the experiments, we
                 first show that the proposed vector representations for
                 the translation rules are very reliable for application
                 in translation modeling. We further show that the
                 proposed type-dependent, RNN-based model can
                 significantly improve the translation quality in the
                 large-scale, end-to-end Chinese-to-English translation
                 evaluation.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Na:2015:CRF,
  author =       "Seung-Hoon Na",
  title =        "Conditional Random Fields for {Korean} Morpheme
                 Segmentation and {POS} Tagging",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700051",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "There has been recent interest in statistical
                 approaches to Korean morphological analysis. However,
                 previous studies have been based mostly on generative
                 models, including a hidden Markov model (HMM), without
                 utilizing discriminative models such as a conditional
                 random field (CRF). We present a two-stage
                 discriminative approach based on CRFs for Korean
                 morphological analysis. Similar to methods used for
                 Chinese, we perform two disambiguation procedures based
                 on CRFs: (1) morpheme segmentation and (2) POS tagging.
                 In morpheme segmentation, an input sentence is
                 segmented into sequences of morphemes, where a morpheme
                 unit is either atomic or compound. In the POS tagging
                 procedure, each morpheme (atomic or compound) is
                 assigned a POS tag. Once POS tagging is complete, we
                 carry out a post-processing of the compound morphemes,
                 where each compound morpheme is further decomposed into
                 atomic morphemes, which is based on pre-analyzed
                 patterns and generalized HMMs obtained from the given
                 tagged corpus. Experimental results show the promise of
                 our proposed method.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2015:MTM,
  author =       "Xiaodong Liu and Kevin Duh and Yuji Matsumoto",
  title =        "Multilingual Topic Models for Bilingual Dictionary
                 Extraction",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699939",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "A machine-readable bilingual dictionary plays a
                 crucial role in many natural language processing tasks,
                 such as statistical machine translation and
                 cross-language information retrieval. In this article,
                 we propose a framework for extracting a bilingual
                 dictionary from comparable corpora by exploiting a
                 novel combination of topic modeling and word aligners
                 such as the IBM models. Using a multilingual topic
                 model, we first convert a comparable document-aligned
                 corpus into a parallel topic-aligned corpus. This
                 novel topic-aligned corpus is similar in structure to
                 the sentence-aligned corpus frequently employed in
                 statistical machine translation and allows us to
                 extract a bilingual dictionary using a word alignment
                 model. The main advantages of our framework are that (1)
                 no seed dictionary is necessary for bootstrapping the
                 process, and (2) multilingual comparable corpora in
                 more than two languages can also be exploited. In our
                 experiments on a large-scale Wikipedia dataset, we
                 demonstrate that our approach can extract higher
                 precision dictionaries compared to previous approaches
                 and that our method improves further as we add more
                 languages to the dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2015:UMS,
  author =       "Xiaoqing Li and Chengqing Zong and Keh-yih Su",
  title =        "A Unified Model for Solving the {OOV} Problem of
                 {Chinese} Word Segmentation",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699940",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article proposes a unified, character-based,
                 generative model to incorporate additional resources
                 for solving the out-of-vocabulary (OOV) problem of
                 Chinese word segmentation, within which different types
                 of additional information can be utilized independently
                 in corresponding submodels. This article mainly
                 addresses the following three types of OOV: unseen
                 dictionary words, named entities, and suffix-derived
                 words, none of which are handled well by current
                 approaches. The results show that our approach can
                 effectively improve the performance of the first two
                 types with positive interaction in F-score.
                 Additionally, we also analyze the reason that suffix
                 information is not helpful. After integrating the
                 proposed generative model with the corresponding
                 discriminative approach, our evaluation on various
                 corpora---including SIGHAN-2005, CIPS-SIGHAN-2010, and
                 the Chinese Treebank (CTB)---shows that our integrated
                 approach achieves the best performance reported in the
                 literature on all testing sets when additional
                 information and resources are allowed.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Goto:2015:PUT,
  author =       "Isao Goto and Masao Utiyama and Eiichiro Sumita and
                 Sadao Kurohashi",
  title =        "Preordering using a Target-Language Parser via
                 Cross-Language Syntactic Projection for Statistical
                 Machine Translation",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2699925",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "When translating between languages with widely
                 different word orders, word reordering can present a
                 major challenge. Although some word reordering methods
                 do not employ source-language syntactic structures,
                 such structures are inherently useful for word
                 reordering. However, high-quality syntactic parsers are
                 not available for many languages. We propose a
                 preordering method using a target-language syntactic
                 parser to process source-language syntactic structures
                 without a source-language syntactic parser. To train
                 our preordering model based on ITG, we produced
                 syntactic constituent structures for source-language
                 training sentences by (1) parsing target-language
                 training sentences, (2) projecting constituent
                 structures of the target-language sentences to the
                 corresponding source-language sentences, (3) selecting
                 parallel sentences with highly synchronized parallel
                 structures, (4) producing probabilistic models for
                 parsing using the projected partial structures and the
                 Pitman-Yor process, and (5) parsing to produce full
                 binary syntactic structures maximally synchronized with
                 the corresponding target-language syntactic structures,
                 using the constraints of the projected partial
                 structures and the probabilistic models. Our ITG-based
                 preordering model is trained using the produced binary
                 syntactic structures and word alignments. The proposed
                 method facilitates the learning of ITG by producing
                 highly synchronized parallel syntactic structures based
                 on cross-language syntactic projection and sentence
                 selection. The preordering model jointly parses input
                 sentences and identifies their reordered structures.
                 Experiments with Japanese--English and Chinese--English
                 patent translation indicate that our method outperforms
                 existing methods, including string-to-tree syntax-based
                 SMT, a preordering method that does not require a
                 parser, and a preordering method that uses a
                 source-language dependency parser.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Costa-Jussa:2016:DCS,
  author =       "Marta R. Costa-Juss{\`a} and Jordi Centelles",
  title =        "Description of the {Chinese}-to-{Spanish} Rule-Based
                 Machine Translation System Developed Using a Hybrid
                 Combination of Human Annotation and Statistical
                 Techniques",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738045",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Two of the most popular Machine Translation (MT)
                 paradigms are rule based (RBMT) and corpus based, which
                 include the statistical systems (SMT). When scarce
                 parallel corpus is available, RBMT becomes particularly
                 attractive. This is the case of the Chinese--Spanish
                 language pair. This article presents the first RBMT
                 system for Chinese to Spanish. We describe a hybrid
                 method for constructing this system taking advantage of
                 available resources such as parallel corpora that are
                 used to extract dictionaries and lexical and structural
                 transfer rules. The final system is freely available
                 online and open source. Although performance lags
                 behind standard SMT systems for an in-domain test set,
                 the results show that the RBMT's coverage is
                 competitive and it outperforms the SMT system in an
                 out-of-domain test set. This RBMT system is available
                 to the general public, it can be further enhanced, and
                 it opens up the possibility of creating future hybrid
                 MT systems.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Khanduja:2016:HFE,
  author =       "Deepti Khanduja and Neeta Nain and Subhash Panwar",
  title =        "A Hybrid Feature Extraction Algorithm for {Devanagari}
                 Script",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710018",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The efficiency of any character recognition technique
                 is directly dependent on the accuracy of the generated
                 feature set that could uniquely represent a character
                 and hence correctly recognize it. This article proposes
                 a hybrid approach combining the structural features of
                 the character and a mathematical model of curve fitting
                 to simulate the best features of a character. As a
                 preprocessing step, skeletonization of the character is
                 performed using an iterative thinning algorithm based
                 on Raster scan of the character image. Then, a
                 combination of structural features of the character
                 like number of endpoints, loops, and intersection
                 points is calculated. Further, the thinned character
                 image is statistically zoned into partitions, and a
                 quadratic curve-fitting model is applied on each
                 partition forming a feature vector of the coefficients
                 of the optimally fitted curve. This vector is combined
                 with the spatial distribution of the foreground pixels
                 for each zone, hence yielding a script-independent feature
                 representation. The approach has been evaluated
                 experimentally on Devanagari scripts. The algorithm
                 achieves an average recognition accuracy of 93.4\%.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Shatnawi:2016:IHA,
  author =       "Maad Shatnawi and Sherief Abdallah",
  title =        "Improving Handwritten {Arabic} Character Recognition
                 by Modeling Human Handwriting Distortions",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2764456",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Handwritten Arabic character recognition systems face
                 several challenges, including the unlimited variation
                 in human handwriting and the unavailability of large
                 public databases of handwritten characters and words.
                 The use of synthetic data for training and testing
                 handwritten character recognition systems is one of the
                 possible solutions to provide several variations for
                 these characters and to overcome the lack of large
                 databases. While this can be done using arbitrary
                 distortions, such as image noise and randomized affine
                 transformations, such distortions are not realistic. In
                 this work, we model real distortions in handwriting
                 using real handwritten Arabic character examples and
                 then use these distortion models to synthesize
                 handwritten examples that are more realistic. We show
                 that the use of our proposed approach leads to
                 significant improvements across different
                 machine-learning classification algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wushouer:2016:CAP,
  author =       "Mairidan Wushouer and Donghui Lin and Toru Ishida and
                 Katsutoshi Hirayama",
  title =        "A Constraint Approach to Pivot-Based Bilingual
                 Dictionary Induction",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2723144",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "High-quality bilingual dictionaries are very useful,
                 but such resources are rarely available for
                 lower-density language pairs, especially for those that
                 are closely related. Using a third language to link two
                 other languages is a well-known solution and usually
                 requires only two input bilingual dictionaries A-B and
                 B-C to automatically induce the new one, A-C. This
                 approach, however, has never been demonstrated to
                 utilize the complete structures of the input bilingual
                 dictionaries, and this is a key failing because the
                 dropped meanings negatively influence the result. This
                 article proposes a constraint approach to pivot-based
                 dictionary induction where languages A and C are closely
                 related. We create constraints from language similarity
                 and model the structures of the input dictionaries as a
                 Boolean optimization problem, which is then formulated
                 within the Weighted Partial Max-SAT framework, an
                 extension of Boolean Satisfiability (SAT). All of the
                 encoded CNF (Conjunctive Normal Form), the predominant
                 input language of modern SAT/MAX-SAT solvers, formulas
                 are evaluated by a solver to produce the target
                 (output) bilingual dictionary. Moreover, we discuss
                 alternative formalizations as a comparison study. We
                 designed a tool that uses the Sat4j library as the
                 default solver to implement our method and conducted an
                 experiment in which the output bilingual dictionary
                 achieved better quality than the baseline method.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yeh:2016:SAI,
  author =       "Jui-Feng Yeh",
  title =        "Speech Act Identification Using Semantic Dependency
                 Graphs with Probabilistic Context-Free Grammars",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786978",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "We propose an approach for identifying the speech acts
                 of speakers' utterances in conversational spoken
                 dialogue that involves using semantic dependency graphs
                 with probabilistic context-free grammars (PCFGs). The
                 semantic dependency graph based on the HowNet knowledge
                 base is adopted to model the relationships between
                 words in an utterance parsed by PCFG. Dependency
                 relationships between words within the utterance are
                 extracted by decomposing the semantic dependency graph
                 according to predefined events. The corresponding
                 values of semantic slots are subsequently extracted
                 from the speaker's utterances according to the
                 corresponding identified speech act. The experimental
                 results obtained when using the proposed approach
                 indicated that the accuracy rates of speech act
                 detection and task completion were 95.6\% and 77.4\%
                 for human-generated transcription (REF) and
                 speech-to-text recognition output (STT), respectively,
                 and the average numbers of turns of each dialogue were
                 8.3 and 11.8 for REF and STT, respectively. Compared
                 with Bayes classifier, partial pattern tree, and
                 Bayesian-network-based approaches, we obtained 14.1\%,
                 9.2\%, and 3\% improvements in the accuracy of speech
                 act identification, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2016:CCSa,
  author =       "Ting-Xuan Wang and Wen-Hsiang Lu",
  title =        "Constructing Complex Search Tasks with Coherent
                 Subtask Search Goals",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742547",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Nowadays, due to the explosive growth of web content
                 and usage, users deal with their complex search tasks
                 by web search engines. However, conventional search
                 engines consider a search query corresponding only to a
                 simple search task. In order to accomplish a complex
                 search task, which consists of multiple subtask search
                 goals, users usually have to issue a series of queries.
                 For example, the complex search task ``travel to
                 Dubai'' may involve several subtask search goals,
                 including reserving hotel room, surveying Dubai
                 landmarks, booking flights, and so forth. Therefore, a
                 user can efficiently accomplish his or her complex
                 search task if search engines can predict the complex
                 search task with a variety of subtask search goals. In
                 this work, we propose a complex search task model
                 (CSTM) to deal with this problem. The CSTM first groups
                 queries into complex search task clusters, and then
                 generates subtask search goals from each complex search
                 task cluster. To raise the performance of CSTM, we
                 exploit four web resources including community question
                 answering, query logs, search engine result pages, and
                 clicked pages. Experimental results show that our CSTM
                 is effective in identifying the comprehensive subtask
                 search goals of a complex search task.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Tsai:2016:CWB,
  author =       "Richard Tzong-Han Tsai",
  title =        "Collective {Web}-Based Parenthetical Translation
                 Extraction Using {Markov} Logic Networks",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2794399",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Parenthetical translations are translations of terms
                 in otherwise monolingual text that appear inside
                 parentheses. Parenthetical translations extraction
                 (PTE) is the task of extracting parenthetical
                 translations from natural language documents. One of
                 the main difficulties in PTE is to detect the left
                 boundary of the translated term in preparenthetical
                 text. In this article, we propose a collective approach
                 that employs Markov logic to model multiple constraints
                 used in the PTE task. We show how various constraints
                 can be formulated and combined in a Markov logic
                 network (MLN). Our experimental results show that the
                 proposed collective PTE approach significantly
                 outperforms a current state-of-the-art method,
                 improving the average F-measure up to 27.11\% compared
                 to the previous word alignment approach. It also
                 outperforms an individual MLN-based system by 8.2\% and
                 a system based on conditional random fields by 5.9\%.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Jain:2016:FHW,
  author =       "Amita Jain and D. K. Lobiyal",
  title =        "Fuzzy {Hindi} {WordNet} and Word Sense Disambiguation
                 Using Fuzzy Graph Connectivity Measures",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2790079",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we propose Fuzzy Hindi WordNet, which
                 is an extended version of Hindi WordNet. The proposed
                 idea of fuzzy relations and their role in modeling
                 Fuzzy Hindi WordNet is explained. We mathematically
                 define fuzzy relations and the composition of these
                 fuzzy relations for this extended version. We show that
                 the concept of composition of fuzzy relations can be
                 used to infer a relation between two words that
                 otherwise are not directly related in Hindi WordNet.
                 Then we propose fuzzy graph connectivity measures that
                 include both local and global measures. These measures
                 are used in determining the significance of a concept
                 (which is represented as a vertex in the fuzzy graph)
                 in a specific context. Finally, we show how these
                 extended measures solve the problem of word sense
                 disambiguation (WSD) effectively, which is useful in
                 many natural language processing applications to
                 improve their performance. Experiments on standard
                 sense tagged corpus for WSD show better results when
                 Fuzzy Hindi WordNet is used in place of Hindi
                 WordNet.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kertkeidkachorn:2016:AFH,
  author =       "Natthawut Kertkeidkachorn and Proadpran Punyabukkana
                 and Atiwong Suchato",
  title =        "Acoustic Features for Hidden Conditional Random
                 Fields-Based {Thai} Tone Classification",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2833088",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In the Thai language, tone information is necessary
                 for Thai speech recognition systems. Previous studies
                 show that many acoustic cues are attributed to shapes
                 of tones. Nevertheless, most Thai tone classification
                 studies mainly adopted F$_0$ values and their
                 derivatives without considering other acoustic
                 features. In this article, other acoustic features for
                 Thai tone classification are investigated. In the
                 experiment, energy values and spectral information
                 represented by three spectral-based features including
                 the LPC-based feature, PLP-based feature, and
                 MFCC-based feature are applied to the HCRF-based Thai
                 tone classification, which was reported as the best
                 approach for Thai tone classification. The energy
                 values provide an error rate reduction of 22.40\% in
                 the isolated word scenario, while there are slight
                 improvements in the continuous speech scenario. On the
                 contrary, spectral-based features greatly contribute to
                 Thai tone classification in the continuous-speech
                 scenario, whereas spectral-based features slightly
                 degrade performances in the isolated-word scenario. The
                 best achievement in the continuous-speech scenario is
                 obtained from the PLP-based feature, which yields an
                 error rate reduction of 13.90\%. Therefore, findings in
                 this article are that energy values and spectral-based
                 features, especially the PLP-based feature, are the
                 main contributors to the improvement of the
                 performances of Thai tone classification in the
                 isolated-word scenario and the continuous-speech
                 scenario, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Chu:2016:IPS,
  author =       "Chenhui Chu and Toshiaki Nakazawa and Sadao
                 Kurohashi",
  title =        "Integrated Parallel Sentence and Fragment Extraction
                 from Comparable Corpora: a Case Study on
                 {Chinese--Japanese} {Wikipedia}",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2833089",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Parallel corpora are crucial for statistical machine
                 translation (SMT); however, they are quite scarce for
                 most language pairs and domains. As comparable corpora
                 are far more available, many studies have been
                 conducted to extract either parallel sentences or
                 fragments from them for SMT. In this article, we
                 propose an integrated system to extract both parallel
                 sentences and fragments from comparable corpora. We
                 first apply parallel sentence extraction to identify
                 parallel sentences from comparable sentences. We then
                 extract parallel fragments from the comparable
                 sentences. Parallel sentence extraction is based on a
                 parallel sentence candidate filter and classifier for
                 parallel sentence identification. We improve it by
                 proposing a novel filtering strategy and three novel
                 feature sets for classification. Previous studies have
                 found it difficult to accurately extract parallel
                 fragments from comparable sentences. We propose an
                 accurate parallel fragment extraction method that uses
                 an alignment model to locate the parallel fragment
                 candidates and an accurate lexicon-based filter to
                 identify the truly parallel fragments. A case study on
                 the Chinese--Japanese Wikipedia indicates that our
                 proposed methods outperform previously proposed
                 methods, and the parallel data extracted by our system
                 significantly improves SMT performance.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2016:CCSb,
  author =       "Rui Wang and Masao Utiyama and Isao Goto and Eiichiro
                  Sumita and Hai Zhao and Bao-Liang Lu",
  title =        "Converting Continuous-Space Language Models into
                  {$N$}-gram Language Models with Efficient Bilingual
                  Pruning for Statistical Machine Translation",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "11:1--11:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2843942",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The Language Model (LM) is an essential component of
                  Statistical Machine Translation (SMT). In this article,
                  we focus on developing efficient methods for LM
                  construction. Our main contribution is that we propose
                  a Natural N-grams based Converting (NNGC) method for
                  transforming a Continuous-Space Language Model (CSLM)
                  to a Back-off N-gram Language Model (BNLM).
                  Furthermore, a Bilingual LM Pruning (BLMP) approach is
                  developed for enhancing LMs in SMT decoding and
                  speeding up CSLM converting. The proposed pruning and
                  converting methods can convert a large LM efficiently
                  by working jointly. That is, a LM can be effectively
                  pruned before it is converted from CSLM without
                  sacrificing performance, and further improved if an
                  additional corpus contains out-of-domain information.
                  For different SMT tasks, our experimental results
                  indicate that the proposed NNGC and BLMP methods
                  outperform the existing counterpart approaches
                  significantly in BLEU and computational cost.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Chakrabarty:2016:BBL,
  author          = {Abhisek Chakrabarty and Utpal Garain},
  title           = {{BenLem} (A {Bengali} Lemmatizer) and Its Role in
                     {WSD}},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {12:1--12:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2835494},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {A lemmatization algorithm for Bengali has been
                     developed and evaluated. Its effectiveness for word
                     sense disambiguation (WSD) is also investigated. One of
                     the key challenges for computer processing of highly
                     inflected languages is to deal with the frequent
                     morphological variations of the root words appearing in
                     the text. Therefore, a lemmatizer is essential for
                     developing natural language processing (NLP) tools for
                     such languages. In this experiment, Bengali, which is
                     the national language of Bangladesh and the second most
                     popular language in the Indian subcontinent, has been
                     taken as a reference. In order to design the Bengali
                     lemmatizer (named as BenLem), possible transformations
                     through which surface words are formed from lemmas are
                     studied so that appropriate reverse transformations can
                     be applied on a surface word to get the corresponding
                     lemma back. BenLem is found to be capable of handling
                     both inflectional and derivational morphology in
                     Bengali. It is evaluated on a set of 18 news articles
                     taken from the FIRE Bengali News Corpus consisting of
                     3,342 surface words (excluding proper nouns) and found
                     to be 81.95\% accurate. The role of the lemmatizer is
                     then investigated for Bengali WSD. Ten highly
                     polysemous Bengali words are considered for sense
                     disambiguation. The FIRE corpus and a collection of
                     Tagore's short stories are considered for creating the
                     WSD dataset. Different WSD systems are considered for
                     this experiment, and it is noticed that BenLem improves
                     the performance of all the WSD systems and the
                     improvements are statistically significant.},
  acknowledgement = ack-nhfb,
  articleno       = {12},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Zhou:2016:ESR,
  author          = {Hao Zhou and Shujian Huang and Junsheng Zhou and Yue
                     Zhang and Huadong Chen and Xinyu Dai and Chuan Cheng
                     and Jiajun Chen},
  title           = {Enhancing Shift--Reduce Constituent Parsing with
                     Action {$N$}-Gram Model},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {13:1--13:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2820902},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {Current shift-reduce parsers ``understand'' the
                     context by embodying a large number of binary indicator
                     features with a discriminative model. In this article,
                     we propose the action n-gram model, which utilizes the
                     action sequence to help parsing disambiguation. The
                     action n-gram model is trained on action sequences
                     produced by parsers with the n-gram estimation method,
                     which gives a smoothed maximum likelihood estimation of
                     the action probability given a specific action history.
                     We show that incorporating action n-gram models into a
                     state-of-the-art parsing framework could achieve
                     parsing accuracy improvements on three datasets across
                     two languages.},
  acknowledgement = ack-nhfb,
  articleno       = {13},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Sadek:2016:EAC,
  author          = {Jawad Sadek and Farid Meziane},
  title           = {Extracting {Arabic} Causal Relations Using Linguistic
                     Patterns},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {14:1--14:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2800786},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {Identifying semantic relations is a crucial step in
                     discourse analysis and is useful for many applications
                     in both language and speech technology. Automatic
                     detection of Causal relations therefore has gained
                     popularity in the literature within different
                     frameworks. The aim of this article is the automatic
                     detection and extraction of Causal relations that are
                     explicitly expressed in Arabic texts. To fulfill this
                     goal, a Pattern Recognizer model was developed to
                     signal the presence of cause--effect information within
                     sentences from nonspecific domain texts. This model
                     incorporates approximately 700 linguistic patterns so
                     that parts of the sentence representing the cause and
                     those representing the effect can be distinguished. The
                     patterns were constructed based on different sets of
                     syntactic features by analyzing a large untagged Arabic
                     corpus. In addition, the model was boosted with three
                     independent algorithms to deal with certain types of
                     grammatical particles that indicate causation. With
                     this approach, the proposed model achieved an overall
                     recall of 81\% and a precision of 78\%. Evaluation
                     results revealed that the justification particles play
                     a key role in detecting Causal relations. To the best
                     of our knowledge, no previous studies have been
                     dedicated to dealing with this type of relation in the
                     Arabic language.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Yang:2016:BSR,
  author          = {Haitong Yang and Yu Zhou and Chengqing Zong},
  title           = {Bilingual Semantic Role Labeling Inference via Dual
                     Decomposition},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {15:1--15:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2835493},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {This article focuses on bilingual Semantic Role
                     Labeling (SRL); its goal is to annotate semantic roles
                     on both sides of the parallel bilingual texts
                     (bi-texts). Since rich bilingual information is
                     encoded, bilingual SRL has been applied in many
                     natural-language processing (NLP) tasks such as machine
                     translation (MT), cross-lingual information retrieval
                     (IR), and the like. A feasible way of performing
                     bilingual SRL is using monolingual SRL systems to
                     perform SRL on each side of bi-texts separately.
                     However, it is difficult to obtain consistent SRL
                     results on both sides of bi-texts in this way. Some
                     works have tried to jointly infer bilingual SRL because
                     there are many complementary language cues on both
                     sides of bi-texts and they reported better performance
                     than monolingual systems. However, there are two limits
                     in the existing methods. First, the existing methods
                     often require high inference costs due to the complex
                     objective function. Second, the existing methods fully
                     adopt the candidates generated by monolingual SRL
                     systems, but many candidates are discarded in the
                     argument pruning or identification stage of monolingual
                     systems. In this article, we propose two strategies to
                     overcome these limits. We utilize a simple but
                     efficient technique: Dual Decomposition to search for
                     consistent results for both sides of bi-texts. On the
                     other hand, we propose a method called Bi-Directional
                     Projection (BDP) to recover arguments discarded in
                     monolingual SRL systems. We evaluate our method on a
                     standard parallel benchmark: the OntoNotes dataset. The
                     experimental results show that our method yields
                     significant improvements over the state-of-the-art
                     monolingual systems. In addition, our approach is also
                     better and faster than existing methods due to BDP and
                     Dual Decomposition.},
  acknowledgement = ack-nhfb,
  articleno       = {15},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Li:2016:MMC,
  author =       "Maoxi Li and Mingwen Wang and Hanxi Li and Fan Xu",
  title =        "Modeling Monolingual Character Alignment for Automatic
                  Evaluation of {Chinese} Translation",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "3",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2815619",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:50 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Automatic evaluation of machine translations is an
                  important task. Most existing evaluation metrics rely
                  on matching the same word or letter n-grams. This
                  strategy leads to poor results on Chinese translations
                  because one has to rely merely on matching identical
                  characters. In this article, we propose a new
                  evaluation metric that allows different characters with
                  the same or similar meaning to match. An Indirect
                  Hidden Markov Model (IHMM) is proposed to align the
                  Chinese translation with human references at the
                  character level. In the model, the emission
                  probabilities are estimated by character similarity,
                  including character semantic similarity and character
                  surface similarity, and transition probabilities are
                  estimated by a heuristic distance-based distortion
                  model. When evaluating the submitted output of
                  English-to-Chinese translation systems in the IWSLT'08
                  CT-EC and NIST'08 EC tasks, the experimental results
                  indicate that the proposed metric has a significantly
                  better correlation with human evaluation than the
                  state-of-the-art machine translation metrics (i.e.,
                  BLEU, Meteor Universal, and TESLA-CELAB). This study
                  shows that it is important to allow different
                  characters to match in the evaluation of Chinese
                  translations and that the IHMM is a reasonable approach
                  for the alignment of Chinese characters.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Abuaiadah:2016:UBM,
  author          = {Diab Abuaiadah},
  title           = {Using Bisect {$K$}-Means Clustering Technique in the
                     Analysis of {Arabic} Documents},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {17:1--17:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2812809},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {In this article, I have investigated the performance
                     of the bisect K-means clustering algorithm compared to
                     the standard K-means algorithm in the analysis of
                     Arabic documents. The experiments included five
                     commonly used similarity and distance functions
                     (Pearson correlation coefficient, cosine, Jaccard
                     coefficient, Euclidean distance, and averaged
                     Kullback--Leibler divergence) and three leading
                     stemmers. Using the purity measure, the bisect K-means
                     clearly outperformed the standard K-means in all
                     settings with varying margins. For the bisect K-means,
                     the best purity reached 0.927 when using the Pearson
                     correlation coefficient function, while for the
                     standard K-means, the best purity reached 0.884 when
                     using the Jaccard coefficient function. Removing stop
                     words significantly improved the results of the bisect
                     K-means but produced minor improvements in the results
                     of the standard K-means. Stemming provided additional
                     minor improvement in all settings except the
                     combination of the averaged Kullback--Leibler
                     divergence function and the root-based stemmer, where
                     the purity was deteriorated by more than 10\%. These
                     experiments were conducted using a dataset with nine
                     categories, each of which contains 300 documents.},
  acknowledgement = ack-nhfb,
  articleno       = {17},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Elayeb:2016:ACL,
  author          = {Bilel Elayeb and Ibrahim Bounhas},
  title           = {{Arabic} Cross-Language Information Retrieval: a
                     Review},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {18:1--18:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2789210},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {Cross-language information retrieval (CLIR) deals with
                     retrieving relevant documents in one language using
                     queries expressed in another language. As CLIR tools
                     rely on translation techniques, they are challenged by
                     the properties of highly derivational and flexional
                     languages like Arabic. Much work has been done on CLIR
                     for different languages including Arabic. In this
                     article, we introduce the reader to the motivations for
                     solving some problems related to Arabic CLIR
                     approaches. The evaluation of these approaches is
                     discussed starting from the 2001 and 2002 TREC Arabic
                     CLIR tracks, which aim to objectively evaluate CLIR
                     systems. We also study many other research works to
                     highlight the unresolved problems or those that require
                     further investigation. These works are discussed in the
                     light of a deep study of the specificities and the
                     tasks of Arabic information retrieval (IR). Particular
                     attention is given to translation techniques and CLIR
                     resources, which are key issues challenging Arabic
                     CLIR. To push research in this field, we discuss how a
                     new standard collection can improve Arabic IR and CLIR
                     tracks.},
  acknowledgement = ack-nhfb,
  articleno       = {18},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Zhao:2016:ALM,
  author          = {Yinggong Zhao and Shujian Huang and Xin-Yu Dai and
                     Jiajun Chen},
  title           = {Adaptation of Language Models for {SMT} Using Neural
                     Networks with Topic Information},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {19:1--19:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2816816},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {Neural network language models (LMs) are shown to be
                     effective in improving the performance of statistical
                     machine translation (SMT) systems. However,
                     state-of-the-art neural network LMs usually use words
                     before the current position as context and neglect
                     global topic information, which can help machine
                     translation (MT) systems to select better translation
                     candidates from a higher perspective. In this work, we
                     propose improvement of the state-of-the-art feedforward
                     neural language model with topic information. Two main
                     issues need to be tackled when adding topics into
                     neural network LMs for SMT: one is how to incorporate
                     topics to the neural network; the other is how to get
                     target-side topic distribution before translation. We
                     incorporate topics by appending topic distribution to
                     the input layer of a feedforward LM. We adopt a
                     multinomial logistic-regression (MLR) model to predict
                     the target-side topic distribution based on source side
                     information. Moreover, we propose a feedforward neural
                     network model to learn joint representations on the
                     source side for topic prediction. LM experiments
                     demonstrate that the perplexity on validation set can
                     be greatly reduced by the topic-enhanced feedforward
                     LM, and the prediction of target-side topics can be
                     improved dramatically with the MLR model equipped with
                     the joint source representations. A final MT
                     experiment, conducted on a large-scale Chinese--English
                     dataset, shows that our feedforward LM with predicted
                     topics improves the translation performance against a
                     strong baseline.},
  acknowledgement = ack-nhfb,
  articleno       = {19},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Ding:2016:IIE,
  author          = {Chenchen Ding and Keisuke Sakanushi and Hirona Touji
                     and Mikio Yamamoto},
  title           = {Inter-, Intra-, and Extra-Chunk Pre-Ordering for
                     Statistical {Japanese}-to-{English} Machine
                     Translation},
  journal         = j-TALLIP,
  volume          = {15},
  number          = {3},
  pages           = {20:1--20:??},
  month           = mar,
  year            = {2016},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2818381},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:50 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract        = {A rule-based pre-ordering approach is proposed for
                     statistical Japanese-to-English machine translation
                     using the dependency structure of source-side
                     sentences. A Japanese sentence is pre-ordered to an
                     English-like order at the morpheme level for a
                     statistical machine translation system during the
                     training and decoding phase to resolve the reordering
                     problem. In this article, extra-chunk pre-ordering of
                     morphemes is proposed, which allows Japanese functional
                     morphemes to move across chunk boundaries. This
                     contrasts with the intra-chunk reordering used in
                     previous approaches, which restricts the reordering of
                     morphemes within a chunk. Linguistically oriented
                     discussions show that correct pre-ordering cannot be
                     realized without extra-chunk movement of morphemes. The
                     proposed approach is compared with five rule-based
                     pre-ordering approaches designed for
                     Japanese-to-English translation and with a language
                     independent statistical pre-ordering approach on a
                     standard patent dataset and on a news dataset obtained
                     by crawling Internet news sites. Two state-of-the-art
                     statistical machine translation systems, one
                     phrase-based and the other hierarchical phrase-based,
                     are used in experiments. Experimental results show that
                     the proposed approach outperforms the compared
                     approaches on automatic reordering measures (Kendall's
                     $ \tau $, Spearman's $ \rho $, fuzzy reordering score,
                     and test set RIBES) and on the automatic translation
                     precision measure of test set BLEU score.},
  acknowledgement = ack-nhfb,
  articleno       = {20},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Lee:2015:ISI,
  author          = {Lung-Hao Lee and Gina-Anne Levow and Shih-Hung Wu and
                     Chao-Lin Liu},
  title           = {Introduction to the Special Issue on {Chinese} Spell
                     Checking},
  journal         = j-TALLIP,
  volume          = {14},
  number          = {4},
  pages           = {14:1--14:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2818354},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:49 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/spell.bib;
                     http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  note            = {Special issue on Chinese spell checking.},
  abstract        = {This special issue contains four articles based on and
                     expanded from systems presented at the SIGHAN-7 Chinese
                     Spelling Check Bakeoff. We provide an overview of the
                     approaches and designs for Chinese spelling checkers
                     presented in these articles. We conclude this
                     introductory article with a summary of possible future
                     directions.},
  acknowledgement = ack-nhfb,
  articleno       = {14},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Chen:2015:PFC,
  author =       "Kuan-Yu Chen and Hsin-Min Wang and Hsin-Hsi Chen",
  title =        "A Probabilistic Framework for {Chinese} Spelling
                  Check",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2826234",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                  http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "Chinese spelling check (CSC) is still an unsolved
                  problem today since there are many homonymous or
                  homomorphous characters. Recently, more and more CSC
                  systems have been proposed. To the best of our
                  knowledge, language modeling is one of the major
                  components among these systems because of its
                  simplicity and moderately good predictive power. After
                  deeply analyzing the school of research, we are aware
                  that most of the systems only employ the conventional
                  n-gram language models. The contributions of this
                  article are threefold. First, we propose a novel
                  probabilistic framework for CSC, which naturally
                  combines several important components, such as the
                  substitution model and the language model, to inherit
                  their individual merits as well as to overcome their
                  limitations. Second, we incorporate the topic language
                  models into the CSC system in an unsupervised fashion.
                  The topic language models can capture the long-span
                  semantic information from a word (character) string
                  while the conventional n-gram language models can only
                  preserve the local regularity information. Third, we
                  further integrate Web resources with the proposed
                  framework to enhance the overall performance. Our
                  rigorously empirical experiments demonstrate the
                  consistent and utility performance of the proposed
                  framework in the CSC task.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Liu:2015:HRA,
  author          = {Xiaodong Liu and Fei Cheng and Kevin Duh and Yuji
                     Matsumoto},
  title           = {A Hybrid Ranking Approach to {Chinese} Spelling
                     Check},
  journal         = j-TALLIP,
  volume          = {14},
  number          = {4},
  pages           = {16:1--16:??},
  month           = oct,
  year            = {2015},
  coden           = {????},
  doi             = {https://doi.org/10.1145/2822264},
  issn            = {2375-4699 (print), 2375-4702 (electronic)},
  issn-l          = {2375-4699},
  bibdate         = {Mon Apr 3 08:15:49 MDT 2017},
  bibsource       = {http://www.math.utah.edu/pub/tex/bib/spell.bib;
                     http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  note            = {Special issue on Chinese spell checking.},
  abstract        = {We propose a novel framework for Chinese Spelling
                     Check (CSC), which is an automatic algorithm to detect
                     and correct Chinese spelling errors. Our framework
                     contains two key components: candidate generation and
                     candidate ranking. Our framework differs from previous
                     research, such as Statistical Machine Translation (SMT)
                     based model or Language Model (LM) based model, in that
                     we use both SMT and LM models as components of our
                     framework for generating the correction candidates, in
                     order to obtain maximum recall; to improve the
                     precision, we further employ a Support Vector Machines
                     (SVM) classifier to rank the candidates generated by
                     the SMT and the LM. Experiments show that our framework
                     outperforms other systems, which adopted the same or
                     similar resources as ours in the SIGHAN 7 shared task;
                     even comparing with the state-of-the-art systems, which
                     used more resources, such as a considerable large
                     dictionary, an idiom dictionary and other semantic
                     information, our framework still obtains competitive
                     results. Furthermore, to address the resource
                     scarceness problem for training the SMT model, we
                     generate around 2 million artificial training sentences
                     using the Chinese character confusion sets, which
                     include a set of Chinese characters with similar shapes
                     and similar pronunciations, provided by the SIGHAN 7
                     shared task.},
  acknowledgement = ack-nhfb,
  articleno       = {16},
  fjournal        = {ACM Transactions on Asian and Low-Resource Language
                     Information Processing},
  journal-url     = {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

%%% TALLIP 14(4), article 17 (October 2015); part of the special issue
%%% on Chinese spell checking (see also articles 16 and 18--19).
@Article{Yeh:2015:CSC,
  author =       "Jui-Feng Yeh and Wen-Yi Chen and Mao-Chuan Su",
  title =        "{Chinese} Spelling Checker Based on an Inverted Index
                 List with a Rescoring Mechanism",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2826235",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "An approach is proposed for Chinese spelling error
                 detection and correction, in which an inverted index
                 list with a rescoring mechanism is used. The inverted
                 index list is a structure for mapping from word to
                 desired sentence, and for representing nodes in
                 lattices constructed through character expansion
                 (according to predefined phonologically and visually
                 similar character sets). Pruning based on a contextual
                 dependency confidence measure was used to markedly
                 reduce the search space and computational complexity.
                 Relevant mapping relations between the original input
                 and desired input were obtained using a scoring
                 mechanism composed of class-based language and maximum
                 entropy correction models containing character, word,
                 and contextual features. The proposed method was
                 evaluated using data sets provided by SigHan 7 bakeoff.
                 The experimental results show that the proposed method
                 achieved acceptable performance in terms of recall rate
                 or precision rate in error sentence detection and error
                 location detection, and it outperformed other
                 approaches in error location detection and
                 correction.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 14(4), article 18 (October 2015); special issue on Chinese
%%% spell checking.  Word-lattice decoding spell checker; error samples
%%% mined from the Google Web 1T corpus.
@Article{Hsieh:2015:CCS,
  author =       "Yu-Ming Hsieh and Ming-Hong Bai and Shu-Ling Huang and
                 Keh-Jiann Chen",
  title =        "Correcting {Chinese} Spelling Errors with Word Lattice
                 Decoding",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2791389",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/spell.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  abstract =     "Chinese spell checkers are more difficult to develop
                 because of two language features: (1) there are no word
                 boundaries, and a character may function as a word or a
                 word morpheme; and (2) the Chinese character set
                 contains more than ten thousand characters. The former
                 makes it difficult for a spell checker to detect
                 spelling errors, and the latter makes it difficult for
                 a spell checker to construct error models. We develop a
                 word lattice decoding model for a Chinese spell checker
                 that addresses these difficulties. The model performs
                 word segmentation and error correction simultaneously,
                 thereby solving the word boundary problem. The model
                 corrects nonword errors as well as real-word errors. In
                 order to better estimate the error distribution of
                 large character sets for error models, we also propose
                 a methodology to extract spelling error samples
                 automatically from the Google web 1T corpus. Due to the
                 large quantity of data in the Google web 1T corpus,
                 many spelling error samples can be extracted, better
                 reflecting spelling error distributions in the real
                 world. Finally, in order to improve the spell checker
                 for real applications, we produce $n$-best suggestions
                 for spelling error corrections. We test our proposed
                 approach with the Bakeoff 2013 CSC Datasets; the
                 results show that the proposed methods with the error
                 model significantly outperform the performance of
                 Chinese spell checkers that do not use error models.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 14(4), article 19 (October 2015).  Editorial front matter
%%% (state-of-the-journal commentary); no abstract field, as is usual
%%% for editorial material in this file.
@Article{Anonymous:2015:TPE,
  author =       "Anonymous",
  title =        "{TALLIP} Perspectives: Editorial Commentary: The State
                 of the Journal",
  journal =      j-TALLIP,
  volume =       "14",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2823512",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:49 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  note =         "Special issue on Chinese spell checking.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 21 (June 2016).  Sindhi OCR text-image
%%% database paper.
%%% NOTE(review): from this entry onward the fjournal value carries a
%%% trailing ``(TALLIP)'' suffix, unlike the volume-14 entries above ---
%%% confirm which form is the intended file-wide convention.
@Article{Hakro:2016:PTI,
  author =       "Dil Nawaz Hakro and Abdullah Zawawi Talib",
  title =        "Printed Text Image Database for {Sindhi} {OCR}",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "21:1--21:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2846093",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Document Image Understanding (DIU) and Electronic
                 Document Management are active fields of research
                 involving image understanding, interpretation,
                 efficient handling, and routing of documents as well as
                 their retrieval. Research on most of the noncursive
                 scripts (Latin) has matured, whereas research on the
                 cursive (connected) scripts is still moving toward
                 perfection. Many researchers are currently working on
                 the cursive scripts (Arabic and other scripts adopting
                 it) around the world so that the difficulties and
                 challenges in document understanding and handling of
                 these scripts can be overcome. Sindhi script has the
                 largest extension of the original Arabic alphabet among
                 languages adopting the Arabic script; it contains 52
                 characters, compared to 28 characters in the original
                 Arabic alphabet, in order to accommodate more sounds
                 for the language. There are 24 differentiating
                 characters with some possessing four dots. For Sindhi
                 OCR research and development, a database is needed for
                 training and testing of Sindhi text images. We have
                 developed a large database containing over 4 billion
                 words and 15 billion characters in 150 various fonts in
                 four font weights and four styles. The database
                 contents were collected from various sources including
                 websites, books, and theses. A custom-built application
                 was also developed to create a text image from a text
                 document that supports various fonts and sizes. The
                 database considers words, characters, characters with
                 spaces, and lines. The database is freely available as
                 a partial or full database by sending an email to one
                 of the authors.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 22 (June 2016).  Research note comparing word
%%% segmentation approaches for Burmese (Myanmar).
@Article{Ding:2016:WSB,
  author =       "Chenchen Ding and Ye Kyaw Thu and Masao Utiyama and
                 Eiichiro Sumita",
  title =        "Word Segmentation for {Burmese} ({Myanmar})",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "22:1--22:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2846095",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Experiments on various word segmentation approaches
                 for the Burmese language are conducted and discussed in
                 this note. Specifically, dictionary-based, statistical,
                 and machine learning approaches are tested.
                 Experimental results demonstrate that statistical and
                 machine learning approaches perform significantly
                 better than dictionary-based approaches. We believe
                 that this note, based on an annotated corpus of
                 relatively considerable size (containing approximately
                 a half million words), is the first systematic
                 comparison of word segmentation approaches for Burmese.
                 This work aims to discover the properties and proper
                 approaches to Burmese textual processing and to promote
                 further researches on this understudied language.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 23 (June 2016).  End-to-end OCR + MT pipeline
%%% for the endangered Nyushu script.
%%% NOTE(review): the bracketed ``[Chinese characters]'' in the abstract
%%% is a placeholder for glyphs that cannot be represented in this
%%% ASCII file; it is intentional, not a transcription error.
@Article{Zhang:2016:ITP,
  author =       "Tongtao Zhang and Aritra Chowdhury and Nimit Dhulekar
                 and Jinjing Xia and Kevin Knight and Heng Ji and
                 B{\"u}lent Yener and Liming Zhao",
  title =        "From Image to Translation: Processing the Endangered
                 {Nyushu} Script",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "23:1--23:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857052",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The lack of computational support has significantly
                 slowed down automatic understanding of endangered
                 languages. In this paper, we take Nyushu (simplified
                 Chinese: [Chinese characters]; literally: ``women's
                 writing'') as a case study to present the first
                 computational approach that combines Computer Vision
                 and Natural Language Processing techniques to deeply
                 understand an endangered language. We developed an
                 end-to-end system to read a scanned hand-written Nyushu
                 article, segment it into characters, link them to
                 standard characters, and then translate the article
                 into Mandarin Chinese. We propose several novel methods
                 to address the new challenges introduced by noisy input
                 and low resources, including Nyushu-specific feature
                 selection for character segmentation and linking, and
                 character linking lattice based Machine Translation.
                 The end-to-end system performance indicates that the
                 system is a promising approach and can serve as a
                 standard benchmark.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 24 (June 2016).
%%% Fix: accent typo in the fifth author's given name --- the file had
%%% {\"O}zg{\"U}r (capital U-umlaut mid-word); the correct Turkish name
%%% is {\"O}zg{\"u}r (Ozgur Ulusoy).
@Article{Sarigil:2016:SPW,
  author =       "Erdem Sarigil and Oguz Yilmaz and Ismail Sengor
                 Altingovde and Rifat Ozcan and {\"O}zg{\"u}r Ulusoy",
  title =        "A ``Suggested'' Picture of {Web} Search in {Turkish}",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "24:1--24:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2891105",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Although query log analysis provides crucial insights
                 about Web users' search interests, conducting such
                 analyses is almost impossible for some languages, as
                 large-scale and public query logs are quite scarce. In
                 this study, we first survey the existing query
                 collections in Turkish and discuss their limitations.
                 Next, we adopt a novel strategy to obtain a set of
                 Turkish queries using the query autocompletion services
                 from the four major search engines and provide the
                 first large-scale analysis of Web queries and their
                 results in Turkish.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 25 (June 2016).
%%% Fix: brace the proper noun {Gujarati} in the title so that
%%% sentence-casing bibliography styles preserve its capitalization,
%%% consistent with {Sindhi}, {Urdu}, {Burmese}, {Gurmukhi}, and
%%% {Bangla} in neighboring entries.
@Article{Goswami:2016:CPG,
  author =       "Mukesh M. Goswami and Suman K. Mitra",
  title =        "Classification of Printed {Gujarati} Characters Using
                 Low-Level Stroke Features",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "25:1--25:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2856105",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article presents an elegant technique for
                 extracting the low-level stroke features, such as
                 endpoints, junction points, line elements, and curve
                 elements, from offline printed text using a template
                 matching approach. The proposed features are used to
                 classify a subset of characters from Gujarati script.
                 The database consists of approximately 16,782 samples
                 of 42 middle-zone symbols from the Gujarati character
                 set collected from three different sources: machine
                 printed books, newspapers, and laser printed documents.
                 The purpose of this division is to add variety in terms
                 of size, font type, style, ink variation, and boundary
                 deformation. The experiments are performed on the
                 database using a k-nearest neighbor (kNN) classifier
                 and results are compared with other widely used
                 structural features, namely Chain Codes (CC),
                 Directional Element Features (DEF), and Histogram of
                 Oriented Gradients (HoG). The results show that the
                 features are quite robust against the variations and
                 give comparable performance with other existing
                 works.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 26 (June 2016).  Introduces CALAM, a
%%% four-tier annotated Urdu handwritten text-image dataset.
@Article{Choudhary:2016:FTA,
  author =       "Prakash Choudhary and Neeta Nain",
  title =        "A Four-Tier Annotated {Urdu} Handwritten Text Image
                 Dataset for Multidisciplinary Research on {Urdu}
                 Script",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "26:1--26:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2857053",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article introduces a large handwritten text
                 document image corpus dataset for Urdu script named
                 CALAM (Cursive And Language Adaptive Methodologies).
                 The database contains unconstrained handwritten
                 sentences along with their structural annotations for
                 the offline handwritten text images with their XML
                 representation. Urdu is the fourth most frequently used
                 language in the world, but due to its complex cursive
                 writing script and low resources, it is still a thrust
                 area for document image analysis. Here, a unified
                 approach is applied in the development of an Urdu
                 corpus by collecting printed texts, handwritten texts,
                 and demographic information of writers on a single
                 form. CALAM contains 1,200 handwritten text images,
                 3,043 lines, 46,664 words, and 101,181 ligatures. For
                 capturing maximum variance among the words and
                 handwritten styles, data collection is distributed
                 among six categories and 14 subcategories. Handwritten
                 forms were filled out by 725 different writers
                 belonging to different geographical regions, ages, and
                 genders with diverse educational backgrounds. A
                 structure has been designed to annotate handwritten
                 Urdu script images at line, word, and ligature levels
                 with an XML standard to provide a ground truth of each
                 image at different levels of annotation. This corpus
                 would be very useful for linguistic research in
                 benchmarking and providing a testbed for evaluation of
                 handwritten text recognition techniques for Urdu
                 script, signature verification, writer identification,
                 digital forensics, classification of printed and
                 handwritten text, categorization of texts as per use,
                 and so on. The experimental results of some recently
                 developed handwritten text line segmentation techniques
                 experimented on the proposed dataset are also presented
                 in the article for asserting its viability and
                 usability.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 27 (June 2016).
%%% Fix: repaired the extraction artifact ``n -gram'' in the abstract
%%% to ``$n$-gram'', matching the ``$n$-best'' math markup already used
%%% elsewhere in this file.
@Article{Norimatsu:2016:FCL,
  author =       "Jun-Ya Norimatsu and Makoto Yasuhara and Toru Tanaka
                 and Mikio Yamamoto",
  title =        "A Fast and Compact Language Model Implementation Using
                 Double-Array Structures",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "27:1--27:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2873068",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The language model is a widely used component in
                 fields such as natural language processing, automatic
                 speech recognition, and optical character recognition.
                 In particular, statistical machine translation uses
                 language models, and the translation speed and the
                 amount of memory required are greatly affected by the
                 performance of the language model implementation. We
                 propose a fast and compact implementation of $n$-gram
                 language models that increases query speed and reduces
                 memory usage by using a double-array structure, which
                 is known to be a fast and compact trie data structure.
                 We propose two types of implementation: one for
                 backward suffix trees and the other for reverse tries.
                 The data structure is optimized for space efficiency by
                 embedding model parameters into otherwise unused spaces
                 in the double-array structure. We show that the reverse
                 trie version of our method is among the smallest
                 state-of-the-art implementations in terms of model size
                 with almost the same speed as the implementation that
                 performs fastest on perplexity calculation tasks.
                 Similarly, we achieve faster decoding while keeping
                 compact model sizes, and we confirm that our method can
                 utilize the efficiency of the double-array structure to
                 achieve a balance between speed and size on translation
                 tasks.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 28 (June 2016).
%%% Fix: repaired the extraction artifact ``K -means'' in the abstract
%%% to ``$K$-means'' (same math markup convention as ``$n$-best''
%%% elsewhere in this file).
@Article{Yang:2016:LGF,
  author =       "Haitong Yang and Chengqing Zong",
  title =        "Learning Generalized Features for Semantic Role
                 Labeling",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "28:1--28:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890496",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article makes an effort to improve Semantic Role
                 Labeling (SRL) through learning generalized features.
                 The SRL task is usually treated as a supervised
                 problem. Therefore, a huge set of features are crucial
                 to the performance of SRL systems. But these features
                 often lack generalization powers when predicting an
                 unseen argument. This article proposes a simple
                 approach to relieve the issue. A strong intuition is
                 that arguments occurring in similar syntactic positions
                 are likely to bear the same semantic role, and,
                 analogously, arguments that are lexically similar are
                 likely to represent the same semantic role. Therefore,
                 it will be informative to SRL if syntactic or lexical
                 similar arguments can activate the same feature.
                 Inspired by this, we embed the information of
                 lexicalization and syntax into a feature vector for
                 each argument and then use $K$-means to make clustering
                 for all feature vectors of training set. For an unseen
                 argument to be predicted, it will belong to the same
                 cluster as its similar arguments of training set.
                 Therefore, the clusters can be thought of as a kind of
                 generalized feature. We evaluate our method on several
                 benchmarks. The experimental results show that our
                 approach can significantly improve the SRL
                 performance.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 15(4), article 29 (June 2016).
%%% Fix: capitalized ``A'' after the colon in the title (titles are
%%% stored in Title Case; BibTeX's change.case$ preserves a capital
%%% letter that follows ``: '', so no protective braces are needed).
@Article{Bhowmik:2016:BHC,
  author =       "Tapan Kumar Bhowmik and Swapan Kumar Parui and Utpal
                 Roy and Lambert Schomaker",
  title =        "{Bangla} Handwritten Character Segmentation Using
                 Structural Features: A Supervised and Bootstrapping
                 Approach",
  journal =      j-TALLIP,
  volume =       "15",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890497",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we propose a new framework for
                 segmentation of Bangla handwritten word images into
                 meaningful individual symbols or pseudo-characters.
                 Existing segmentation algorithms are not usually
                 treated as a classification problem. However, in the
                 present study, the segmentation algorithm is looked
                 upon as a two-class supervised classification problem.
                 The method employs an SVM classifier to select the
                 segmentation points on the word image on the basis of
                 various structural features. For training of the SVM
                 classifier, an unannotated training set is prepared
                 first using candidate segmenting points. The training
                 set is then clustered, and each cluster is labeled
                 manually with minimal manual intervention. A
                 semi-automatic bootstrapping technique is also employed
                 to enlarge the training set from new samples. The
                 overall architecture describes a basic step toward
                 building an annotation system for the segmentation
                 problem, which has not so far been investigated. The
                 experimental results show that our segmentation method
                 is quite efficient in segmenting not only word images
                 but also handwritten texts. As a part of this work, a
                 database of Bangla handwritten word images has also
                 been developed. Considering our data collection method
                 and a statistical analysis of our lexicon set, we claim
                 that the relevant characteristics of an ideal lexicon
                 set are present in our handwritten word image
                 database.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 16(1), article 1 (December 2016).  Online handwritten
%%% Gurmukhi strokes dataset; first entry of volume 16 in this chunk.
@Article{Singh:2016:OHG,
  author =       "Sukhdeep Singh and Anuj Sharma and Indu Chhabra",
  title =        "Online Handwritten {Gurmukhi} Strokes Dataset Based on
                 Minimal Set of Words",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2896318",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The online handwriting data are an integral part of
                 data analysis and classification research, as collected
                 handwritten data offers many challenges to group
                 handwritten stroke classes. The present work has been
                 done for grouping handwritten strokes from the Indic
                 script Gurmukhi. Gurmukhi is the script of the popular
                 and widely spoken language Punjabi. The present work
                 includes development of the dataset of Gurmukhi words
                 in the context of online handwriting recognition for
                 real-life use applications, such as maps navigation. We
                 have collected the data of 100 writers from the largest
                 cities in the Punjab region. The writers' variations,
                 such as writing skill level (beginner, moderate, and
                 expert), gender, right or left handedness, and their
                 adaptability to digital handwriting, have been
                 considered in dataset development. We have introduced a
                 novel technique to form handwritten stroke classes
                 based on a limited set of words. The presence of all
                 alphabets including vowels of Gurmukhi script has been
                 considered before selection of a word. The developed
                 dataset includes 39,411 strokes from handwritten words
                 and forms 72 classes of strokes after using a k-means
                 clustering technique and manual verification through
                 expert and moderate writers. We have achieved
                 recognition results using the Hidden Markov Model as
                 87.10\%, 85.43\%, and 84.33\% for middle zone strokes
                 when using training data as 66\%, 50\%, and 80\% of the
                 developed dataset. The present work is a step in a
                 direction to find groups for unknown handwriting
                 strokes with reasonably higher levels of accuracy.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{El-Fiqi:2016:PCC,
  author =       "Heba El-Fiqi and Eleni Petraki and Hussein A. Abbass",
  title =        "Pairwise Comparative Classification for Translator
                 Stylometric Analysis",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898997",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we present a new type of
                 classification problem, which we call Comparative
                 Classification Problem (CCP), where we use the term
                 data record to refer to a block of instances. Given a
                 single data record with n instances for n classes, the
                 CCP problem is to map each instance to a unique class.
                 This problem occurs in a wide range of applications
                 where the independent and identically distributed
                 assumption is broken down. The primary difference
                 between CCP and classical classification is that in the
                 latter, the assignment of a translator to one record is
                 independent of the assignment of a translator to a
                 different record. In CCP, however, the assignment of a
                 translator to one record within a block excludes this
                 translator from further assignments to any other record
                 in that block. The interdependency in the data poses
                 challenges for techniques relying on the independent
                 and identically distributed (iid) assumption. In the
                 Pairwise CCP (PWCCP), a pair of records is grouped
                 together. The key difference between PWCCP and
                 classical binary classification problems is that hidden
                 patterns can only be unmasked by comparing the
                 instances as pairs. In this article, we introduce a new
                 algorithm, PWC4.5, which is based on C4.5, to manage
                  PWCCP. We first show that a simple transformation---that
                  we call Gradient-Based Transformation (GBT)---can fix the
                 problem of iid in C4.5. We then evaluate PWC4.5 using
                 two real-world corpora to distinguish between
                 translators on Arabic-English and French-English
                 translations. While the traditional C4.5 failed to
                 distinguish between different translators, GBT
                 demonstrated better performance. Meanwhile, PWC4.5
                 consistently provided the best results over C4.5 and
                 GBT.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Qiao:2016:IUD,
  author =       "Xiuming Qiao and Hailong Cao and Tiejun Zhao",
  title =        "Improving Unsupervised Dependency Parsing with
                 Knowledge from Query Logs",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903720",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Unsupervised dependency parsing becomes more and more
                 popular in recent years because it does not need
                 expensive annotations, such as treebanks, which are
                 required for supervised and semi-supervised dependency
                 parsing. However, its accuracy is still far below that
                 of supervised dependency parsers, partly due to the
                 fact that their parsing model is insufficient to
                 capture linguistic phenomena underlying texts. The
                 performance for unsupervised dependency parsing can be
                 improved by mining knowledge from the texts and by
                 incorporating it into the model. In this article,
                 syntactic knowledge is acquired from query logs to help
                 estimate better probabilities in dependency models with
                 valence. The proposed method is language independent
                 and obtains an improvement of 4.1\% unlabeled accuracy
                 on the Penn Chinese Treebank by utilizing additional
                 dependency relations from the Sogou query logs and
                  Baidu query logs. Moreover, experiments show that the
                 proposed model achieves improvements of 8.07\% on CoNLL
                 2007 English using the AOL query logs. We believe query
                 logs are useful sources of syntactic knowledge for many
                 natural language processing (NLP) tasks.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Passban:2016:BNP,
  author =       "Peyman Passban and Qun Liu and Andy Way",
  title =        "Boosting Neural {POS} Tagger for {Farsi} Using
                 Morphological Information",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934676",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Farsi (Persian) is a low-resource language that
                 suffers from the data sparsity problem and a lack of
                 efficient processing tools. Due to their broad
                 application in natural language processing tasks,
                 part-of-speech (POS) taggers are one of those important
                 tools that should be considered in this respect.
                 Despite recent work on Farsi tagging, there is still
                 room for improvement. The best reported accuracy so far
                 is 96\%, which in special cases can rise to 96.9\%. The
                 main problem with existing taggers is their
                 inefficiency in coping with out-of-vocabulary (OOV)
                 words. Addressing both problems of accuracy and OOV
                 words, we developed a neural network-based POS tagger
                 (NPT) that performs efficiently on Farsi. Despite using
                 less data, NPT provides better results in comparison to
                 state-of-the-art systems. Our proposed tagger performs
                 with an accuracy of 97.4\%, with performance highly
                 influenced by morphological features. We carry out a
                 shallow morphological analysis and show considerable
                 improvement over the baseline configuration.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2016:SBM,
  author =       "Liangliang Liu and Cungen Cao",
  title =        "A Seed-Based Method for Generating {Chinese} Confusion
                 Sets",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2933396",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In natural language, people often misuse a word
                 (called a ``confused word'') in place of other words
                 (called ``confusing words''). In misspelling
                 corrections, many approaches to finding and correcting
                 misspelling errors are based on a simple notion called
                 a ``confusion set.'' The confusion set of a confused
                 word consists of confusing words. In this article, we
                 propose a new method of building Chinese character
                 confusion sets. Our method is composed of two major
                 phases. In the first phase, we build a list of seed
                 confusion sets for each Chinese character, which is
                 based on measuring similarity in character pinyin or
                 similarity in character shape. In this phase, all
                 confusion sets are constructed manually, and the
                 confusion sets are organized into a graph, called a
                 ``seed confusion graph'' (SCG), in which vertices
                 denote characters and edges are pairs of characters in
                 the form (confused character, confusing character). In
                 the second phase, we extend the SCG by acquiring more
                 pairs of (confused character, confusing character) from
                 a large Chinese corpus. For this, we use several word
                 patterns (or patterns) to generate new confusion pairs
                 and then verify the pairs before adding them into a
                 SCG. Comprehensive experiments show that our method of
                 extending confusion sets is effective. Also, we shall
                 use the confusion sets in Chinese misspelling
                 corrections to show the utility of our method.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2016:ISP,
  author =       "Junhui Li and Muhua Zhu and Wei Lu and Guodong Zhou",
  title =        "Improving Semantic Parsing with Enriched Synchronous
                 Context-Free Grammars in Statistical Machine
                 Translation",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2963099",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Semantic parsing maps a sentence in natural language
                 into a structured meaning representation. Previous
                 studies show that semantic parsing with synchronous
                 context-free grammars (SCFGs) achieves favorable
                 performance over most other alternatives. Motivated by
                 the observation that the performance of semantic
                 parsing with SCFGs is closely tied to the translation
                 rules, this article explores to extend translation
                 rules with high quality and increased coverage in three
                 ways. First, we examine the difference between word
                 alignments for semantic parsing and statistical machine
                 translation (SMT) to better adapt word alignment in SMT
                 to semantic parsing. Second, we introduce both
                 structure and syntax informed nonterminals, better
                 guiding the parsing in favor of well-formed structure,
                  instead of using an uninformed nonterminal in SCFGs.
                 Third, we address the unknown word translation issue
                 via synthetic translation rules. Last but not least, we
                 use a filtering approach to improve performance via
                 predicting answer type. Evaluation on the standard
                 GeoQuery benchmark dataset shows that our approach
                 greatly outperforms the state of the art across various
                 languages, including English, Chinese, Thai, German,
                 and Greek.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Krishnamurthi:2016:UDS,
  author =       "Karthik Krishnamurthi and Vijayapal Reddy Panuganti
                 and Vishnu Vardhan Bulusu",
  title =        "Understanding Document Semantics from Summaries: a
                 Case Study on {Hindi} Texts",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "1",
  pages =        "7:1--7:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2956236",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:51 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Summary of a document contains words that actually
                 contribute to the semantics of the document. Latent
                 Semantic Analysis (LSA) is a mathematical model that is
                 used to understand document semantics by deriving a
                 semantic structure based on patterns of word
                 correlations in the document. When using LSA to capture
                 semantics from summaries, it is observed that LSA
                 performs quite well despite being completely
                 independent of any external sources of semantics.
                 However, LSA can be remodeled to enhance its capability
                 to analyze correlations within texts. By taking
                 advantage of the model being language independent, this
                 article presents two stages of LSA remodeling to
                 understand document semantics in the Indian context,
                 specifically from Hindi text summaries. One stage of
                 remodeling is done by providing supplementary
                 information, such as document category and domain
                 information. The second stage of remodeling is done by
                 using a supervised term weighting measure in the
                 process. The remodeled LSA's performance is empirically
                 evaluated in a document classification application by
                 comparing the accuracies of classification to plain
                 LSA. An improvement in the performance of LSA in the
                 range of 4.7\% to 6.2\% is achieved from the remodel
                 when compared to the plain model. The results suggest
                 that summaries of documents efficiently capture the
                  semantic structure of documents and are an alternative
                 to full-length documents for understanding document
                 semantics.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Tursun:2016:STT,
  author =       "Eziz Tursun and Debasis Ganguly and Turghun Osman and
                 Ya-Ting Yang and Ghalip Abdukerim and Jun-Lin Zhou and
                 Qun Liu",
  title =        "A Semisupervised Tag-Transition-Based {Markovian}
                 Model for {Uyghur} Morphology Analysis",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "8:1--8:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2968410",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Morphological analysis, which includes analysis of
                 part-of-speech (POS) tagging, stemming, and morpheme
                 segmentation, is one of the key components in natural
                 language processing (NLP), particularly for
                 agglutinative languages. In this article, we
                 investigate the morphological analysis of the Uyghur
                 language, which is the native language of the people in
                 the Xinjiang Uyghur autonomous region of western China.
                 Morphological analysis of Uyghur is challenging
                 primarily because of factors such as (1) ambiguities
                 arising due to the likelihood of association of a
                 multiple number of POS tags with a word stem or a
                 multiple number of functional tags with a word suffix,
                 (2) ambiguous morpheme boundaries, and (3) complex
                  morphophonology of the language. Further, the
                 unavailability of a manually annotated training set in
                 the Uyghur language for the purpose of word
                 segmentation makes Uyghur morphological analysis more
                 difficult. In our proposed work, we address these
                 challenges by undertaking a semisupervised approach of
                 learning a Markov model with the help of a manually
                 constructed dictionary of ``suffix to tag'' mappings in
                 order to predict the most likely tag transitions in the
                 Uyghur morpheme sequence. Due to the linguistic
                 characteristics of Uyghur, we incorporate a prior
                 belief in our model for favoring word segmentations
                 with a lower number of morpheme units. Empirical
                 evaluation of our proposed model shows an accuracy of
                 about 82\%. We further improve the effectiveness of the
                 tag transition model with an active learning paradigm.
                 In particular, we manually investigated a subset of
                 words for which the model prediction ambiguity was
                 within the top 20\%. Manually incorporating rules to
                 handle these erroneous cases resulted in an overall
                 accuracy of 93.81\%.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Nguyen:2016:ACN,
  author =       "Long H. B. Nguyen and Dien Dinh and Phuoc Tran",
  title =        "An Approach to Construct a Named Entity Annotated
                 {English--Vietnamese} Bilingual Corpus",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990191",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Manually constructing an annotated Named Entity (NE)
                 in a bilingual corpus is a time-consuming,
                  labor-intensive, and expensive process, but this is
                 necessary for natural language processing (NLP) tasks
                 such as cross-lingual information retrieval,
                 cross-lingual information extraction, machine
                 translation, etc. In this article, we present an
                 automatic approach to construct an annotated NE in
                 English-Vietnamese bilingual corpus from a bilingual
                 parallel corpus by proposing an aligned NE method.
                 Basing this corpus on a bilingual corpus in which the
                 initial NEs are extracted from its own language
                 separately, the approach tries to correct unrecognized
                 NEs or incorrectly recognized NEs before aligning the
                 NEs by using a variety of bilingual constraints. The
                 generated corpus not only improves the NE recognition
                 results but also creates alignments between English NEs
                 and Vietnamese NEs, which are necessary for training NE
                 translation models. The experimental results show that
                 the approach outperforms the baseline methods
                 effectively. In the English-Vietnamese NE alignment
                 task, the F-measure increases from 68.58\% to 79.77\%.
                 Thanks to the improvement of the NE recognition
                 quality, the proposed method also increases
                 significantly: the F-measure goes from 84.85\% to
                 88.66\% for the English side and from 75.71\% to
                 85.55\% for the Vietnamese side. By providing the
                 additional semantic information for the machine
                 translation systems, the BLEU score increases from
                 33.04\% to 45.11\%.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Chou:2016:BWN,
  author =       "Chien-Lung Chou and Chia-Hui Chang and Ya-Yun Huang",
  title =        "Boosted {Web} Named Entity Recognition via
                 Tri-Training",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2963100",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Named entity extraction is a fundamental task for many
                 natural language processing applications on the web.
                 Existing studies rely on annotated training data, which
                 is quite expensive to obtain large datasets, limiting
                 the effectiveness of recognition. In this research, we
                 propose a semisupervised learning approach for web
                 named entity recognition (NER) model construction via
                 automatic labeling and tri-training. The former
                 utilizes structured resources containing known named
                 entities for automatic labeling, while the latter makes
                 use of unlabeled examples to improve the extraction
                 performance. Since this automatically labeled training
                 data may contain noise, a self-testing procedure is
                 used as a follow-up to remove low-confidence annotation
                 and prepare higher-quality training data. Furthermore,
                 we modify tri-training for sequence labeling and derive
                 a proper initialization for large dataset training to
                 improve entity recognition. Finally, we apply this
                 semisupervised learning framework for person name
                 recognition, business organization name recognition,
                 and location name extraction. In the task of Chinese
                 NER, an F-measure of 0.911, 0.849, and 0.845 can be
                 achieved, for person, business organization, and
                 location NER, respectively. The same framework is also
                 applied for English and Japanese business organization
                 name recognition and obtains models with performance of
                 a 0.832 and 0.803 F-measure.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Sadek:2016:DBA,
  author =       "Jawad Sadek and Farid Meziane",
  title =        "A Discourse-Based Approach for {Arabic} Question
                 Answering",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988238",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The treatment of complex questions with explanatory
                 answers involves searching for arguments in texts.
                 Because of the prominent role that discourse relations
                 play in reflecting text producers' intentions,
                 capturing the underlying structure of text constitutes
                 a good instructor in this issue. From our extensive
                 review, a system for automatic discourse analysis that
                 creates full rhetorical structures in large-scale
                 Arabic texts is currently unavailable. This is due to
                 the high computational complexity involved in
                 processing a large number of hypothesized relations
                 associated with large texts. Therefore, more practical
                 approaches should be investigated. This article
                 presents a new Arabic Text Parser oriented for
                 question-answering systems dealing with [Arabic
                 characters] ``why'' and [Arabic characters] ``how to''
                 questions. The Text Parser presented here considers the
                 sentence as the basic unit of text and incorporates a
                 set of heuristics to avoid computational explosion.
                 With this approach, the developed question-answering
                 system reached a significant improvement over the
                 baseline with a Recall of 68\% and MRR of 0.62.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Tran:2016:WRS,
  author =       "Phuoc Tran and Dien Dinh and Long H. B. Nguyen",
  title =        "Word Re-Segmentation in {Chinese--Vietnamese} Machine
                 Translation",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988237",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In isolated languages, such as Chinese and Vietnamese,
                 words are not separated by spaces, and a word may be
                 formed by one or more syllables. Therefore, word
                 segmentation (WS) is usually the first process that is
                 implemented in the machine translation process. WS in
                 the source and target languages is based on different
                 training corpora, and WS approaches may not be the
                 same. Therefore, the WS that results in these two
                 languages are not often homologous, and thus word
                 alignment results in many 1-n and n-1 alignment pairs
                 in statistical machine translation, which degrades the
                 performance of machine translation. In this article, we
                 will adjust the WS for both Chinese and Vietnamese in
                 particular and for isolated language pairs in general
                 and make the word boundary of the two languages more
                 symmetric in order to strengthen 1-1 alignments and
                 enhance machine translation performance. We have tested
                 this method on the Computational Linguistics Center's
                 corpus, which consists of 35,623 sentence pairs. The
                 experimental results show that our method has
                 significantly improved the performance of machine
                 translation compared to the baseline translation
                 system, WS translation system, and anchor
                 language-based WS translation systems.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2016:MSC,
  author =       "Peifeng Li and Guodong Zhou and Qiaoming Zhu",
  title =        "Minimally Supervised {Chinese} Event Extraction from
                 Multiple Views",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2994600",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Although several semi-supervised learning models have
                 been proposed for English event extraction, there are
                 few successful stories in Chinese due to its special
                 characteristics. In this article, we propose a novel
                 minimally supervised model for Chinese event extraction
                 from multiple views. Besides the traditional pattern
                 similarity view (PSV), a semantic relationship view
                 (SRV) is introduced to capture the relevant event
                 mentions from relevant documents. Moreover, a
                 morphological structure view (MSV) is incorporated to
                 both infer more positive patterns and help filter
                 negative patterns via morphological structure
                 similarity. An evaluation of the ACE 2005 Chinese
                 corpus shows that our minimally supervised model
                 significantly outperforms several strong baselines.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Atreya:2016:QER,
  author =       "Arjun {Atreya V} and Ashish Kankaria and Pushpak
                 Bhattacharyya and Ganesh Ramakrishnan",
  title =        "Query Expansion in Resource-Scarce Languages: a
                 Multilingual Framework Utilizing Document Structure",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "2",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997643",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Retrievals in response to queries to search engines in
                 resource-scarce languages often produce no results,
                 which annoys the user. In such cases, at least
                 partially relevant documents must be retrieved. We
                 propose a novel multilingual framework, MultiStructPRF,
                 which expands the query with related terms by (i) using
                 a resource-rich assisting language and (ii) giving
                 varied importance to the expansion terms depending on
                 their position of occurrence in the document. Our
                 system uses the help of an assisting language to expand
                 the query in order to improve system recall. We propose
                 a systematic expansion model for weighting the
                 expansion terms coming from different parts of the
                 document. To combine the expansion terms from query
                 language and assisting language, we propose a
                 heuristics-based fusion model. Our experimental results
                 show an improvement over other PRF techniques in both
                 precision and recall for multiple resource-scarce
                 languages like Marathi, Bengali, Odia, Finnish, and the
                 like. We study the effect of different assisting
                 languages on precision and recall for multiple query
                 languages. Our experiments reveal an interesting fact:
                 Precision is positively correlated with the typological
                 closeness of query language and assisting language,
                 whereas recall is positively correlated with the
                 resource richness of the assisting language.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Finch:2017:IBL,
  author =       "Andrew Finch and Taisuke Harada and Kumiko
                 Tanaka-Ishii and Eiichiro Sumita",
  title =        "Inducing a Bilingual Lexicon from Short Parallel
                 Multiword Sequences",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3003726",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article proposes a technique for mining bilingual
                 lexicons from pairs of parallel short word sequences.
                 The technique builds a generative model from a corpus
                 of training data consisting of such pairs. The model is
                 a hierarchical nonparametric Bayesian model that
                 directly induces a bilingual lexicon while training.
                 The model learns in an unsupervised manner and is
                 designed to exploit characteristics of the language
                 pairs being mined. The proposed model is capable of
                 utilizing commonly used word-pair frequency information
                 and additionally can employ the internal character
                 alignments within the words themselves. It is thereby
                 capable of mining transliterations and can use reliably
                 aligned transliteration pairs to support the mining of
                 other words in their context. The model is also capable
                 of performing word reordering and word deletion during
                 the alignment process, and it is furthermore capable of
                 operating in the absence of full segmentation
                 information. In this work, we study two mining tasks
                 based on English--Japanese and English--Chinese language
                 pairs, and compare the proposed approach to baselines
                 based on simpler models that use only word-pair
                 frequency information. Our results show that the
                 proposed method is able to mine bilingual word pairs at
                 higher levels of precision and recall than the
                 baselines.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2017:CSC,
  author =       "Shaonan Wang and Chengqing Zong",
  title =        "Comparison Study on Critical Components in Composition
                 Model for Phrase Representation",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3010088",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Phrase representation, an important step in many NLP
                 tasks, involves representing phrases as
                 continuous-valued vectors. This article presents
                 detailed comparisons concerning the effects of word
                 vectors, training data, and the composition and
                 objective function used in a composition model for
                 phrase representation. Specifically, we first discuss
                 how the augmented word representations affect the
                 performance of the composition model. Then, we
                 investigate whether different types of training data
                 influence the performance of the composition model and,
                 if so, how they influence it. Finally, we evaluate
                 combinations of different composition and objective
                 functions and discuss the factors related to
                 composition model performance. All evaluations were
                 conducted in both English and Chinese. Our main
                 findings are as follows: (1) The Additive model with
                 semantic enhanced word vectors performs comparably to
                 the state-of-the-art model; (2) The Additive model
                 which updates augmented word vectors and the Matrix
                 model with semantic enhanced word vectors
                 systematically outperforms the state-of-the-art model
                 in bigram and multi-word phrase similarity task,
                 respectively; (3) Representing the high frequency
                 phrases by estimating their surrounding contexts is a
                 good training objective for bigram phrase similarity
                 tasks; and (4) The performance gain of composition
                 model with semantic enhanced word vectors is due to the
                 composition function and the greater weight attached to
                 important words. Previous works focus on the
                 composition function; however, our findings indicate
                 that other components in the composition model
                 (especially word representation) make a critical
                 difference in phrase representation.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Bhat:2017:ITB,
  author =       "Riyaz Ahmad Bhat and Irshad Ahmad Bhat and Dipti Misra
                 Sharma",
  title =        "Improving Transition-Based Dependency Parsing of
                 {Hindi} and {Urdu} by Modeling Syntactically Relevant
                 Phenomena",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3005447",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In recent years, transition-based parsers have shown
                 promise in terms of efficiency and accuracy. Though
                 these parsers have been extensively explored for
                 multiple Indian languages, there is still considerable
                 scope for improvement by properly incorporating
                 syntactically relevant information. In this article, we
                 enhance transition-based parsing of Hindi and Urdu by
                 redefining the features and feature extraction
                 procedures that have been previously proposed in the
                 parsing literature of Indian languages. We propose and
                 empirically show that properly incorporating
                 syntactically relevant information like case marking,
                 complex predication and grammatical agreement in an
                 arc-eager parsing model can significantly improve
                 parsing accuracy. Our experiments show an absolute
                 improvement of $\approx 2$\% LAS for parsing of both
                 Hindi and Urdu over a competitive baseline which uses
                 rich features like part-of-speech (POS) tags, chunk
                 tags, cluster ids and lemmas. We also propose some
                 heuristics to identify ezafe constructions in Urdu
                 texts which show promising results in parsing these
                 constructions.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Das:2017:NER,
  author =       "Arjun Das and Debasis Ganguly and Utpal Garain",
  title =        "Named Entity Recognition with Word Embeddings and
                 {Wikipedia} Categories for a Low-Resource Language",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3015467",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we propose a word embedding--based
                 named entity recognition (NER) approach. NER is
                 commonly approached as a sequence labeling task with
                 the application of methods such as conditional random
                 field (CRF). However, for low-resource languages
                 without the presence of sufficiently large training
                 data, methods such as CRF do not perform well. In our
                 work, we make use of the proximity of the vector
                 embeddings of words to approach the NER problem. The
                 hypothesis is that word vectors belonging to the same
                 name category, such as a person's name, occur in close
                 vicinity in the abstract vector space of the embedded
                 words. Assuming that this clustering hypothesis is
                 true, we apply a standard classification approach on
                 the vectors of words to learn a decision boundary
                 between the NER classes. Our NER experiments are
                 conducted on a morphologically rich and low-resource
                 language, namely Bengali. Our approach significantly
                 outperforms standard baseline CRF approaches that use
                 cluster labels of word embeddings and gazetteers
                 constructed from Wikipedia. Further, we propose an
                 unsupervised approach (that uses an automatically
                 created named entity (NE) gazetteer from Wikipedia in
                 the absence of training data). For a low-resource
                 language, the word vectors obtained from Wikipedia are
                 not sufficient to train a classifier. As a result, we
                 propose to make use of the distance measure between the
                 vector embeddings of words to expand the set of
                 Wikipedia training examples with additional NEs
                 extracted from a monolingual corpus that yield
                 significant improvement in the unsupervised NER
                 performance. In fact, our expansion method performs
                 better than the traditional CRF-based (supervised)
                 approach (i.e., F-score of 65.4\% vs. 64.2\%). Finally,
                 we compare our proposed approach to the official
                 submission for the IJCNLP-2008 Bengali NER shared task
                 and achieve an overall improvement of F-score 11.26\%
                 with respect to the best official system.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Li:2017:IDR,
  author =       "Haoran Li and Jiajun Zhang and Chengqing Zong",
  title =        "Implicit Discourse Relation Recognition for {English}
                 and {Chinese} with Multiview Modeling and Effective
                 Representation Learning",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3028772",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Discourse relations between two text segments play an
                 important role in many Natural Language Processing
                 (NLP) tasks. The connectives strongly indicate the
                 sense of discourse relations, while in fact, there are
                 no connectives in a large proportion of discourse
                 relations, that is, implicit discourse relations.
                 Compared with explicit relations, implicit relations
                 are much harder to detect and have drawn significant
                 attention. Until now, there have been many studies
                 focusing on English implicit discourse relations, and
                 few studies address implicit relation recognition in
                 Chinese even though the implicit discourse relations in
                 Chinese are more common than those in English. In our
                 work, both the English and Chinese languages are our
                 focus. The key to implicit relation prediction is to
                 properly model the semantics of the two discourse
                 arguments, as well as the contextual interaction
                 between them. To achieve this goal, we propose a neural
                 network based framework that consists of two
                 hierarchies. The first one is the model hierarchy, in
                 which we propose a max-margin learning method to
                 explore the implicit discourse relation from multiple
                 views. The second one is the feature hierarchy, in
                 which we learn multilevel distributed representations
                 from words, arguments, and syntactic structures to
                 sentences. We have conducted experiments on the
                 standard benchmarks of English and Chinese, and the
                 results show that compared with several methods our
                 proposed method can achieve the best performance in
                 most cases.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Tholpadi:2017:CBT,
  author =       "Goutham Tholpadi and Chiranjib Bhattacharyya and
                 Shirish Shevade",
  title =        "Corpus-Based Translation Induction in {Indian}
                 Languages Using Auxiliary Language Corpora from
                 {Wikipedia}",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3038295",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Identifying translations from comparable corpora is a
                 well-known problem with several applications. Existing
                 methods rely on linguistic tools or high-quality
                 corpora. Absence of such resources, especially in
                 Indian languages, makes this problem hard; for example,
                 state-of-the-art techniques achieve a mean reciprocal
                 rank of 0.66 for English--Italian, and a mere 0.187 for
                 Telugu--Kannada. In this work, we address the problem of
                 comparable corpora-based translation correspondence
                 induction (CC-TCI) when the only resources available
                 are small noisy comparable corpora extracted from
                 Wikipedia. We observe that translations in the source
                 and target languages have many topically related words
                 in common in other ``auxiliary'' languages. To model
                 this, we define the notion of a translingual theme, a
                 set of topically related words from auxiliary language
                 corpora, and present a probabilistic framework for
                 CC-TCI. Extensive experiments on 35 comparable corpora
                 showed dramatic improvements in performance. We extend
                 these ideas to propose a method for measuring
                 cross-lingual semantic relatedness (CLSR) between
                 words. To stimulate further research in this area, we
                 make publicly available two new high-quality
                 human-annotated datasets for CLSR. Experiments on the
                 CLSR datasets show more than 200\% improvement in
                 correlation on the CLSR task. We apply the method to
                 the real-world problem of cross-lingual Wikipedia title
                 suggestion and build the WikiTSu system. A user study
                 on WikiTSu shows a 20\% improvement in the quality of
                 titles suggested.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Zhao:2017:HMC,
  author =       "Hai Zhao and Deng Cai and Yang Xin and Yuzhu Wang and
                 Zhongye Jia",
  title =        "A Hybrid Model for {Chinese} Spelling Check",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "3",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3047405",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Mon Apr 3 08:15:52 MDT 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Spelling check for Chinese has more challenging
                 difficulties than that for other languages. A hybrid
                 model for Chinese spelling check is presented in this
                 article. The hybrid model consists of three components:
                 one graph-based model for generic errors and two
                 independently trained models for specific errors. In
                 the graph model, a directed acyclic graph is generated
                 for each sentence, and the single-source shortest-path
                 algorithm is performed on the graph to detect and
                 correct general spelling errors at the same time. Prior
                 to that, two types of errors over functional words
                 (characters) are first solved by conditional random
                 fields: the confusion of ``[Chinese characters]'' (at)
                 (pinyin is zai in Chinese), ``[Chinese characters]''
                 (again, more, then) (pinyin: zai) and ``[Chinese
                 characters]'' (of) (pinyin: de), ``[Chinese
                 characters]'' (- ly, adverb-forming particle) (pinyin:
                 de), and ``[Chinese characters]'' (so that, have to)
                 (pinyin: de). Finally, a rule-based model is exploited
                 to distinguish pronoun usage confusion: ``[Chinese
                 characters]'' (she) (pinyin: ta), ``[Chinese
                 characters]'' (he) (pinyin: ta), and some other common
                 collocation errors. The proposed model is evaluated on
                 the standard datasets released by the SIGHAN Bake-off
                 shared tasks, giving state-of-the-art results.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wali:2017:ECL,
  author =       "Wafa Wali and Bilel Gargouri and Abdelmajid Ben
                 Hamadou",
  title =        "Evaluating the Content of {LMF} Standardized
                 Dictionaries: a Practical Experiment on {Arabic}
                 Language",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "22:1--22:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3047406",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Since the age of paper versions, dictionaries are
                 often published with anomalies in their content
                 resulting from lexicographer's mistakes or from the
                 lack of efficiency of automatic enrichment systems.
                 Many of these anomalies are expensive to manually
                 detect and difficult to automatically control, notably
                 with lightly structured models of dictionaries. In this
                 article, we take advantage of the fine structure
                 proposed by the Lexical Markup Framework (LMF) norm to
                 investigate the detection of anomalies in the content
                 of LMF normalized dictionaries. First, we give a
                 theoretical study on the plausible anomalies, such as
                 inconsistency, incoherence, redundancy, and
                 incompleteness. Second, we detail the approach that we
                 propose for the automatic detection of such anomalies.
                 Finally, we report on an experiment carried out on an
                 available normalized dictionary of the Arabic language.
                 The experiment has shown that the proposed approach
                 gives reasonable results in terms of precision and
                 recall.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Baly:2017:STM,
  author =       "Ramy Baly and Hazem Hajj and Nizar Habash and Khaled
                 Bashir Shaban and Wassim El-Hajj",
  title =        "A Sentiment {Treebank} and Morphologically Enriched
                 Recursive Deep Models for Effective Sentiment Analysis
                 in {Arabic}",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "23:1--23:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3086576",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Accurate sentiment analysis models encode the
                 sentiment of words and their combinations to predict
                 the overall sentiment of a sentence. This task becomes
                 challenging when applied to morphologically rich
                 languages (MRL). In this article, we evaluate the use
                 of deep learning advances, namely the Recursive Neural
                 Tensor Networks (RNTN), for sentiment analysis in
                 Arabic as a case study of MRLs. While Arabic may not be
                 considered the only representative of all MRLs, the
                 challenges faced and proposed solutions in Arabic are
                 common to many other MRLs. We identify, illustrate, and
                 address MRL-related challenges and show how RNTN is
                 affected by the morphological richness and orthographic
                 ambiguity of the Arabic language. To address the
                 challenges with sentiment extraction from text in MRL,
                 we propose to explore different orthographic features
                 as well as different morphological features at multiple
                 levels of abstraction ranging from raw words to roots.
                 A key requirement for RNTN is the availability of a
                 sentiment treebank; a collection of syntactic parse
                 trees annotated for sentiment at all levels of
                 constituency and that currently only exists in English.
                 Therefore, our contribution also includes the creation
                 of the first Arabic Sentiment Treebank (ArSenTB) that
                 is morphologically and orthographically enriched.
                 Experimental results show that, compared to the basic
                 RNTN proposed for English, our solution achieves
                 significant improvements up to 8\% absolute at the
                 phrase level and 10.8\% absolute at the sentence level,
                 measured by average F1 score. It also outperforms
                 well-known classifiers including Support Vector
                 Machines, Recursive Auto Encoders, and Long Short-Term
                 Memory by 7.6\%, 3.2\%, and 1.6\% absolute
                 respectively, all models being trained with similar
                 morphological considerations.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Punchimudiyanse:2017:AFW,
  author =       "Malinda Punchimudiyanse and Ravinda Gayan Narendra
                 Meegama",
  title =        "Animation of Fingerspelled Words and Number Signs of
                 the {Sinhala} Sign Language",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "24:1--24:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092743",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Sign language is the primary communication medium of
                 the aurally handicapped community. Often, a sign
                 gesture is mapped to a word or a phrase in a spoken
                 language and named as a conversational sign. A
                 fingerspelling sign is a special sign derived to show a
                 single character that matches a character in the
                 alphabet of a given language. This enables the deaf
                 community to express words that do not have a
                 conversational sign, such as a name, using a
                 letter-by-letter technique. Sinhala Sign Language (SSL)
                 uses a phonetic pronunciation mechanism to decode such
                 words due to the presence of one or more modifiers
                 after a consonant. Expressing numbers also has a
                 similar notation, and it is broken down into parts
                 before interpretation in sign gestures. This article
                 presents the variations implemented to make the 3D
                 avatar-based interpreter system look similar to an
                 actual fingerspelled SSL by a human interpreter. To
                 accomplish the task, a phonetic English-based 3D avatar
                 animation system is developed with Blender animation
                 software. The conversion of Sinhala Unicode text to
                 phonetic English and numbers written in digits to sign
                 gestures is done with a Visual Basic.NET (VB.NET)
                 application. The presented application has 61 SSL
                 fingerspelling signs and 40 SSL number signs. It is
                 capable of interpreting any word written using the
                 modern Sinhala alphabet without conversational signs
                 and interprets the numbers that go up to the billions.
                 This is a helpful tool in teaching SSL fingerspelling
                 and number signs of SSL to deaf children.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v16(4), article 25 (Sep 2017); end page unrecorded (``25:??'').
@Article{Al-Sallab:2017:ARD,
  author =       "Ahmad Al-Sallab and Ramy Baly and Hazem Hajj and
                  Khaled Bashir Shaban and Wassim El-Hajj and Gilbert
                  Badaro",
  title =        "{AROMA}: a Recursive Deep Learning Model for Opinion
                  Mining in {Arabic} as a Low Resource Language",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "25:1--25:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3086575",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "While research on English opinion mining has already
                  achieved significant progress and success, work on
                  Arabic opinion mining is still lagging. This is mainly
                  due to the relative recency of research efforts in
                  developing natural language processing (NLP) methods
                  for Arabic, handling its morphological complexity, and
                  the lack of large-scale opinion resources for Arabic.
                  To close this gap, we examine the class of models used
                  for English and that do not require extensive use of
                  NLP or opinion resources. In particular, we consider
                  the Recursive Auto Encoder (RAE). However, RAE models
                  are not as successful in Arabic as they are in English,
                  due to their limitations in handling the morphological
                  complexity of Arabic, providing a more complete and
                  comprehensive input features for the auto encoder, and
                  performing semantic composition following the natural
                  way constituents are combined to express the overall
                  meaning. In this article, we propose A Recursive Deep
                  Learning Model for Opinion Mining in Arabic (AROMA)
                  that addresses these limitations. AROMA was evaluated
                  on three Arabic corpora representing different genres
                  and writing styles. Results show that AROMA achieved
                  significant performance improvements compared to the
                  baseline RAE. It also outperformed several well-known
                  approaches in the literature.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v16(4), article 26 (Sep 2017); end page unrecorded (``26:??'').
%%% NOTE(review): the ``F1 score of 20\%'' figure is quoted verbatim from
%%% the abstract; confirm against the published version before "fixing".
@Article{Kong:2017:CSE,
  author =       "Fang Kong and Guodong Zhou",
  title =        "A {CDT}-Styled End-to-End {Chinese} Discourse Parser",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "26:1--26:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3099557",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Discourse parsing is a challenging task and plays a
                  critical role in discourse analysis. Since the release
                  of the Rhetorical Structure Theory Discourse Treebank
                  and the Penn Discourse Treebank, the research on
                  English discourse parsing has attracted increasing
                  attention and achieved considerable success in recent
                  years. At the same time, some preliminary research on
                  certain subtasks about discourse parsing for other
                  languages, such as Chinese, has been conducted. In this
                  article, we present an end-to-end Chinese discourse
                  parser with the Connective-Driven Dependency Tree
                  scheme, which consists of multiple components in a
                  pipeline architecture, such as the elementary discourse
                  unit (EDU) detector, discourse relation recognizer,
                  discourse parse tree generator, and attribution
                  labeler. In particular, the attribution labeler
                  determines two attributions (i.e., sense and centering)
                  for every nonterminal node (i.e., discourse relation)
                  in the discourse parse trees. Systematically, our
                  parser detects all EDUs in a free text, generates the
                  discourse parse tree in a bottom-up way, and determines
                  the sense and centering attributions for all
                  nonterminal nodes by traversing the discourse parse
                  tree. Comprehensive evaluation on the Connective-Driven
                  Dependency Treebank corpus from both component-wise and
                  error-cascading perspectives is conducted to illustrate
                  how each component performs in isolation, and how the
                  pipeline performs with error propagation. Finally, it
                  shows that our end-to-end Chinese discourse parser
                  achieves an overall F1 score of 20\% with full
                  automation.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v16(4), article 27 (Sep 2017); end page unrecorded (``27:??'').
@Article{Liu:2017:PAL,
  author =       "Shih-Hung Liu and Kuan-Yu Chen and Yu-Lun Hsieh and
                  Berlin Chen and Hsin-Min Wang and Hsu-Chun Yen and
                  Wen-Lian Hsu",
  title =        "A Position-Aware Language Modeling Framework for
                  Extractive Broadcast News Speech Summarization",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "27:1--27:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3099472",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Extractive summarization, a process that automatically
                  picks exemplary sentences from a text (or spoken)
                  document with the goal of concisely conveying key
                  information therein, has seen a surge of attention from
                  scholars and practitioners recently. Using a language
                  modeling (LM) approach for sentence selection has been
                  proven effective for performing unsupervised extractive
                  summarization. However, one of the major difficulties
                  facing the LM approach is to model sentences and
                  estimate their parameters more accurately for each text
                  (or spoken) document. We extend this line of research
                  and make the following contributions in this work.
                  First, we propose a position-aware language modeling
                  framework using various granularities of
                  position-specific information to better estimate the
                  sentence models involved in the summarization process.
                  Second, we explore disparate ways to integrate the
                  positional cues into relevance models through a
                  pseudo-relevance feedback procedure. Third, we
                  extensively evaluate various models originated from our
                  proposed framework and several well-established
                  unsupervised methods. Empirical evaluation conducted on
                  a broadcast news summarization task further
                  demonstrates performance merits of the proposed
                  summarization methods.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v16(4), article 28 (Sep 2017); end page unrecorded (``28:??'').
@Article{Phani:2017:SLA,
  author =       "Shanta Phani and Shibamouli Lahiri and Arindam
                  Biswas",
  title =        "A Supervised Learning Approach for Authorship
                  Attribution of {Bengali} Literary Texts",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "28:1--28:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3099473",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Authorship Attribution is a long-standing problem in
                  Natural Language Processing. Several statistical and
                  computational methods have been used to find a solution
                  to this problem. In this article, we have proposed
                  methods to deal with the authorship attribution problem
                  in Bengali. More specifically, we proposed a supervised
                  framework consisting of lexical and shallow features
                  and investigated the possibility of using
                  topic-modeling-inspired features, to classify documents
                  according to their authors. We have created a corpus
                  from nearly all the literary works of three eminent
                  Bengali authors, consisting of 3,000 disjoint samples.
                  Our models showed better performance than the
                  state-of-the-art, with more than 98\% test accuracy for
                  the shallow features and 100\% test accuracy for the
                  topic-based features. Further experiments with GloVe
                  vectors [Pennington et al. 2014] showed comparable
                  results, but flexible patterns based on content words
                  and high-frequency words [Schwartz et al. 2013] failed
                  to perform as well as expected.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v16(4), article 29 (Sep 2017); end page unrecorded (``29:??'').
%%% NOTE(review): the abstract's -{$>$} deliberately typesets ``->'' with
%%% the > in math mode (a bare > is unsafe in OT1 text); do not "repair" it.
@Article{Passban:2017:TLR,
  author =       "Peyman Passban and Qun Liu and Andy Way",
  title =        "Translating Low-Resource Languages by Vocabulary
                  Adaptation from Close Counterparts",
  journal =      j-TALLIP,
  volume =       "16",
  number =       "4",
  pages =        "29:1--29:??",
  month =        sep,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3099556",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Some natural languages belong to the same family or
                  share similar syntactic and/or semantic regularities.
                  This property persuades researchers to share
                  computational models across languages and benefit from
                  high-quality models to boost existing low-performance
                  counterparts. In this article, we follow a similar
                  idea, whereby we develop statistical and neural machine
                  translation (MT) engines that are trained on one
                  language pair but are used to translate another
                  language. First we train a reliable model for a
                  high-resource language, and then we exploit
                  cross-lingual similarities and adapt the model to work
                  for a close language with almost zero resources. We
                  chose Turkish (Tr) and Azeri or Azerbaijani (Az) as the
                  proposed pair in our experiments. Azeri suffers from
                  lack of resources as there is almost no bilingual
                  corpus for this language. Via our techniques, we are
                  able to train an engine for the Az -{$>$} English (En)
                  direction, which is able to outperform all other
                  existing models.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v17(1), article 1 (Nov 2017); end page unrecorded (``1:??'').
%%% NOTE(review): the author name ``Sreelekha S.'' parses in BibTeX
%%% First-von-Last form with Last = ``S.'' --- hence the citation key
%%% S:2017:RMI. This appears intentional (Indian initial-as-surname usage).
@Article{S:2017:RMI,
  author =       "Sreelekha S. and Pushpak Bhattacharyya",
  title =        "Role of Morphology Injection in {SMT}: a Case Study
                  from {Indian} Language Perspective",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "1",
  pages =        "1:1--1:??",
  month =        nov,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3129208",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Phrase-based Statistical Machine Translation (PBSMT)
                  is commonly used for automatic translation. However,
                  PBSMT runs into difficulty when either or both of the
                  source and target languages are morphologically rich.
                  Factored models are found to be useful for such cases,
                  as they consider word as a vector of factors. These
                  factors can contain any information about the surface
                  word and use it while translating. The objective of the
                  current work is to handle morphological inflections in
                  Hindi, Marathi, and Malayalam using Factored
                  translation models when translating from English.
                  Statistical MT approaches face the problem of data
                  sparsity when translating to a morphologically rich
                  language. It is very unlikely for a parallel corpus to
                  contain all morphological forms of words. We propose a
                  solution to generate these unseen morphological forms
                  and inject them into the original training corpus. We
                  propose a simple and effective solution based on
                  enriching the input with various morphological forms of
                  words. We observe that morphology injection improves
                  the quality of translation in terms of both adequacy
                  and fluency. We verify this with experiments on three
                  morphologically rich languages when translating from
                  English. From the detailed evaluations, we observed an
                  order of magnitude improvement in translation
                  quality.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v17(1), article 2 (Nov 2017); end page unrecorded (``2:??'').
%%% NOTE(review): token counts ``652852'' and ``44480'' lack thousands
%%% separators; quoted as-is pending a check against the published abstract.
@Article{Malik:2017:UNE,
  author =       "Muhammad Kamran Malik",
  title =        "{Urdu} Named Entity Recognition and Classification
                  System Using Artificial Neural Network",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "1",
  pages =        "2:1--2:??",
  month =        nov,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3129290",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Named Entity Recognition and Classification (NERC) is
                  a process of identifying words and classifying them
                  into person names, location names, organization names,
                  and so on. In this article, we discuss the development
                  of an Urdu Named Entity (NE) corpus, called the
                  Kamran-PU-NE (KPU-NE) corpus, for three entity types,
                  that is, Person, Organization, and Location, and
                  marking the remaining tokens as Others (O). We use two
                  supervised learning algorithms, Hidden Markov Model
                  (HMM) and Artificial Neural Network (ANN), for the
                  development of the Urdu NERC system. We annotate the
                  652852-token corpus taken from 15 different genres with
                  a total of 44480 NEs. The inter-annotator agreement
                  between the two annotators in terms of Kappa k
                  statistic is 73.41\%. With HMM, the highest recorded
                  precision, recall, and f-measure values are 55.98\%,
                  83.11\%, and 66.90\%, respectively, and with ANN, they
                  are 81.05\%, 87.54\%, and 84.17\%, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v17(1), article 3 (Nov 2017); end page unrecorded (``3:??'').
%%% NOTE(review): restored em-dashes (as TeX ``---'') that were flattened
%%% to bare hyphens in transcription (``task-called ... -that'' and
%%% ``model-a''), and fixed the transposed phrase ``transfer word a
%%% prediction task'' to ``transfer a word prediction task''. Verify
%%% against the publisher's abstract (DOI 10.1145/3109480).
@Article{Kim:2017:PEN,
  author =       "Hyun Kim and Hun-Young Jung and Hongseok Kwon and
                  Jong-Hyeok Lee and Seung-Hoon Na",
  title =        "Predictor--Estimator: Neural Quality Estimation Based
                  on Target Word Prediction for Machine Translation",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "1",
  pages =        "3:1--3:??",
  month =        nov,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3109480",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Recently, quality estimation has been attracting
                  increasing interest from machine translation
                  researchers, aiming at finding a good estimator for the
                  ``quality'' of machine translation output. The common
                  approach for quality estimation is to treat the problem
                  as a supervised regression/classification task using a
                  quality-annotated noisy parallel corpus, called quality
                  estimation data, as training data. However, the
                  available size of quality estimation data remains
                  small, due to the too-expensive cost of creating such
                  data. In addition, most conventional quality estimation
                  approaches rely on manually designed features to model
                  nonlinear relationships between feature vectors and
                  corresponding quality labels. To overcome these
                  problems, this article proposes a novel neural network
                  architecture for quality estimation task --- called the
                  predictor-estimator --- that considers word prediction
                  as an additional pre-task. The major component of the
                  proposed neural architecture is a word prediction model
                  based on a modified neural machine translation model
                  --- a probabilistic model for predicting a target word
                  conditioned on all the other source and target
                  contexts. The underlying assumption is that the word
                  prediction model is highly related to quality
                  estimation models and is therefore able to transfer
                  useful knowledge to quality estimation tasks. Our
                  proposed quality estimation method sequentially trains
                  the following two types of neural models: (1)
                  Predictor: a neural word prediction model trained from
                  parallel corpora and (2) Estimator: a neural quality
                  estimation model trained from quality estimation data.
                  To transfer a word prediction task to a quality
                  estimation task, we generate quality estimation feature
                  vectors from the word prediction model and feed them
                  into the quality estimation model. The experimental
                  results on WMT15 and 16 quality estimation datasets
                  show that our proposed method has great potential in
                  the various sub-challenges.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v17(1), article 4 (Nov 2017); end page unrecorded (``4:??'').
@Article{Almeman:2017:ABV,
  author =       "Khalid Almeman",
  title =        "Automatically Building {VoIP} Speech Parallel Corpora
                  for {Arabic} Dialects",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "1",
  pages =        "4:1--4:??",
  month =        nov,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132708",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article discusses the process of automatically
                  building Arabic multi-dialect speech corpora using
                  Voice over Internet Protocol (VoIP). The Asterisk
                  framework was adopted to act as the main connection
                  between the parties, for which two virtual machines
                  were created: a sender and a receiver. The sender makes
                  a VoIP call to the receiver using the Asterisk
                  framework, while the receiver records the call
                  automatically, a process that is repeated for all the
                  audio files involved in the corpora. In this work, more
                  than 67,000 automatic calls were made between the
                  sender and receiver machines, generating VoIP Arabic
                  corpora for four Arabic dialects. The resulting corpora
                  can be considered the first Arabic VoIP parallel speech
                  corpora and will be made freely available to
                  researchers in Arabic NLP and speech recognition
                  research.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v17(1), article 5 (Nov 2017); end page unrecorded (``5:??'').
@Article{Tran:2017:LRB,
  author =       "Phuoc Tran and Dien Dinh and Tan Le and Long H. B.
                  Nguyen",
  title =        "Linguistic-Relationships-Based Approach for Improving
                  Word Alignment",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "1",
  pages =        "5:1--5:??",
  month =        nov,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3133323",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The unsupervised word alignments (such as GIZA++) are
                  widely used in the phrase-based statistical machine
                  translation. The quality of the model is proportional
                  to the size and the quality of the bilingual corpus.
                  However, for low-resource language pairs such as
                  Chinese and Vietnamese, a result of unsupervised word
                  alignment sometimes is of low quality due to the sparse
                  data. In addition, this model does not take advantage
                  of the linguistic relationships to improve performance
                  of word alignment. Chinese and Vietnamese have the same
                  language type and have close linguistic relationships.
                  In this article, we integrate the characteristics of
                  linguistic relationships into the word alignment model
                  to enhance the quality of Chinese-Vietnamese word
                  alignment. These linguistic relationships are
                  Sino-Vietnamese and content word. The experimental
                  results showed that our method improved the performance
                  of word alignment as well as the quality of machine
                  translation.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v17(1), article 6 (Nov 2017); end page unrecorded (``6:??'').
@Article{Cheng:2017:ECC,
  author =       "Xiyao Cheng and Ying Chen and Bixiao Cheng and
                  Shoushan Li and Guodong Zhou",
  title =        "An Emotion Cause Corpus for {Chinese} Microblogs with
                  Multiple-User Structures",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "1",
  pages =        "6:1--6:??",
  month =        nov,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132684",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "A notably challenging problem in emotion analysis is
                  recognizing the cause of an emotion. Although there
                  have been a few studies on emotion cause detection,
                  most of them work on news reports or a few of them
                  focus on microblogs using a single-user structure
                  (i.e., all texts in a microblog are written by the same
                  user). In this article, we focus on emotion cause
                  detection for Chinese microblogs using a multiple-user
                  structure (i.e., texts in a microblog are successively
                  written by several users). First, based on the fact
                  that the causes of an emotion of a focused user may be
                  provided by other users in a microblog with the
                  multiple-user structure, we design an emotion cause
                  annotation scheme which can deal with such a
                  complicated case, and then provide an emotion cause
                  corpus using the annotation scheme. Second, based on
                  the analysis of the emotion cause corpus, we formalize
                  two emotion cause detection tasks for microblogs
                  (current-subtweet-based emotion cause detection and
                  original-subtweet-based emotion cause detection).
                  Furthermore, in order to examine the difficulty of the
                  two emotion cause detection tasks and the contributions
                  of texts written by different users in a microblog with
                  the multiple-user structure, we choose two popular
                  classification methods (SVM and LSTM) to do emotion
                  cause detection. Our experiments show that the
                  current-subtweet-based emotion cause detection is much
                  more difficult than the original-subtweet-based emotion
                  cause detection, and texts written by different users
                  are very helpful for both emotion cause detection
                  tasks. This study presents a pilot study of emotion
                  cause detection which deals with Chinese microblogs
                  using a complicated structure.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP v17(1), article 7 (Nov 2017); end page unrecorded (``7:??'').
@Article{Sarma:2017:DAS,
  author =       "Himangshu Sarma and Navanath Saharia and Utpal
                  Sharma",
  title =        "Development and Analysis of Speech Recognition Systems
                  for {Assamese} Language Using {HTK}",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "1",
  pages =        "7:1--7:??",
  month =        nov,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3137055",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Sat Dec 23 10:06:06 MST 2017",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Language analysis is very important for the native
                  speaker to connect with the digital world. Assamese is
                  a relatively unexplored language. In this report, we
                  analyze different aspects of speech-to-text processing,
                  starting from building a speech corpus, defining
                  syllable rules, and finally developing a speech search
                  engine of Assamese. We have collected about 20 hours of
                  speech in three (viz., read, extempore, and
                  conversation) modes and transcribed it. We also discuss
                  some issues and challenges faced during development of
                  the corpus. We have developed an automatic
                  syllabification model with 11 rules for the Assamese
                  language and found an accuracy of more than 95\% in our
                  result. We found 12 different syllable patterns where 5
                  are found most frequent. The maximum length of a
                  syllable found is four letters. With the help of Hidden
                  Markov Model Toolkit (HTK) 3.5, we used deep learning
                  based neural network for our speech recognition model,
                  where we obtained 78.05\% accuracy for automatic
                  transcription of Assamese speech.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                  Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Bhattacharya:2017:COB,
  author =       {Nilanjana Bhattacharya and Umapada Pal and Partha
                 Pratim Roy},
  title =        {Cleaning of Online {Bangla} Free-form Handwritten
                 Text},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {1},
  pages =        {8:1--8:??},
  month =        nov,
  year =         {2017},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3145538},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Sat Dec 23 10:06:06 MST 2017},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {In the normal free-form handwritten text, repetition
                 (repeated writing of the same stroke several times in
                 the same place), over-writing, and crossing out are
                 very common. In this article, we call the presence of
                 these three types of writing as ``noise.'' Cleaning to
                 extract useful text from such types of noisy text is an
                 important task for robust recognition. To the best of
                 our knowledge, no work has been reported on cleaning of
                 such noise from online text in any scripts and hence,
                 in this article, we propose an automatic text-cleaning
                 approach for online handwriting recognition. Here, at
                 first, crossing out noise with straight strike-through
                 lines is detected using the straightness criteria of
                 online strokes. Next, regions containing repetition,
                 over-writing, and other types of crossing out are
                 located using the positional information of the
                 overlapping strokes. Stroke density, self-intersections
                 of strokes etc. are computed from the strokes of
                 located regions to predict the type of noise and this
                 type of information is used as follows for their
                 cleaning. For cleaning of crossing outs, all strokes of
                 the crossing-out region are removed. For cleaning
                 repetition and over-writing, strokes written earlier
                 are removed, keeping the latest strokes. Finally,
                 delayed strokes are properly arranged and word is
                 passed to online recognizer. Though recognition of
                 free-form handwriting is quite difficult, in this
                 attempt, we obtained up to 70.71\% improvement in
                 word-recognition accuracy after noise cleaning.},
  acknowledgement = ack-nhfb,
  articleno =    {8},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Nasution:2018:GCA,
  author =       {Arbi Haza Nasution and Yohei Murakami and Toru
                 Ishida},
  title =        {A Generalized Constraint Approach to Bilingual
                 Dictionary Induction for Low-Resource Language
                 Families},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {2},
  pages =        {9:1--9:??},
  month =        feb,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3138815},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {The lack or absence of parallel and comparable corpora
                 makes bilingual lexicon extraction a difficult task for
                 low-resource languages. The pivot language and cognate
                 recognition approaches have been proven useful for
                 inducing bilingual lexicons for such languages. We
                 propose constraint-based bilingual lexicon induction
                 for closely related languages by extending constraints
                 from the recent pivot-based induction technique and
                 further enabling multiple symmetry assumption cycle to
                 reach many more cognates in the transgraph. We further
                 identify cognate synonyms to obtain many-to-many
                 translation pairs. This article utilizes four datasets:
                 one Austronesian low-resource language and three
                 Indo-European high-resource languages. We use three
                 constraint-based methods from our previous work, the
                 Inverse Consultation method and translation pairs
                 generated from Cartesian product of input dictionaries
                 as baselines. We evaluate our result using the metrics
                 of precision, recall, and F-score. Our customizable
                 approach allows the user to conduct cross validation to
                 predict the optimal hyperparameters (cognate threshold
                 and cognate synonym threshold) with various combination
                 of heuristics and number of symmetry assumption cycles
                 to gain the highest F-score. Our proposed methods have
                 statistically significant improvement of precision and
                 F-score compared to our previous constraint-based
                 methods. The results show that our method demonstrates
                 the potential to complement other bilingual dictionary
                 creation methods like word alignment models using
                 parallel corpora for high-resource languages while well
                 handling low-resource languages.},
  acknowledgement = ack-nhfb,
  articleno =    {9},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Onyenwe:2018:BLR,
  author =       {Ikechukwu E. Onyenwe and Mark Hepple and Uchechukwu
                 Chinedu and Ignatius Ezeani},
  title =        {A Basic Language Resource Kit Implementation for the
                 {Igbo} {NLP} Project},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {2},
  pages =        {10:1--10:??},
  month =        feb,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3146387},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {Igbo, an African language with around 32 million
                 speakers worldwide, is one of the many languages having
                 few or none of the language processing resources needed
                 for advanced language technology applications. In this
                 article, we describe the approach taken to creating an
                 initial set of resources for Igbo, including an
                 electronic text corpus, a part-of-speech (POS) tagset,
                 and a POS-tagged subcorpus. We discuss the approach
                 taken in gathering texts, the preprocessing of these
                 texts, and the development of the POS tagged corpus. We
                 also discuss some of the problems encountered during
                 corpus and tagset development and the solutions arrived
                 at for these problems.},
  acknowledgement = ack-nhfb,
  articleno =    {10},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Jia:2018:IDP,
  author =       {Yanyan Jia and Yansong Feng and Yuan Ye and Chao Lv
                 and Chongde Shi and Dongyan Zhao},
  title =        {Improved Discourse Parsing with Two-Step Neural
                 Transition-Based Model},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {2},
  pages =        {11:1--11:??},
  month =        feb,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3152537},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {Discourse parsing aims to identify structures and
                 relationships between different discourse units. Most
                 existing approaches analyze a whole discourse at once,
                 which often fails in distinguishing long-span relations
                 and properly representing discourse units. In this
                 article, we propose a novel parsing model to analyze
                 discourse in a two-step fashion with different feature
                 representations to characterize intra sentence and
                 inter sentence discourse structures, respectively. Our
                 model works in a transition-based framework and
                 benefits from a stack long short-term memory neural
                 network model. Experiments on benchmark tree banks show
                 that our method outperforms traditional 1-step parsing
                 methods in both English and Chinese.},
  acknowledgement = ack-nhfb,
  articleno =    {11},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Naili:2018:CSS,
  author =       {Marwa Naili and Anja Habacha Chaibi and Henda {Hajjami
                 Ben Ghezala}},
  title =        {The Contribution of Stemming and Semantics in {Arabic}
                 Topic Segmentation},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {2},
  pages =        {12:1--12:??},
  month =        feb,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3152464},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {Topic Segmentation is one of the pillars of Natural
                 Language Processing. Yet there is a remarkable research
                 gap in this field, as far as the Arabic language is
                 concerned. The purpose of this article is to improve
                 Arabic Topic Segmentation (ATS) by inquiring into two
                 segmenters: ArabC99 and ArabTextTiling. This study is
                 carried out on two independent levels: the
                 pre-processing level and the segmentation level. These
                 levels represent the basic steps of topic segmentation.
                 On the pre-processing level, we examine the effect of
                 using different Arabic stemming algorithms on ATS. We
                 find out that Light10 is more appropriate for the
                 pre-processing step. Based on this conclusion, we
                 proceed to the second level by proposing two Arabic
                 segmenters called ArabC99-LS-LSA and
                 ArabTextTiling-LS-LSA. These latter use external
                 semantic knowledge related to the Latent Semantic
                 Analysis (LSA). Based on the evaluation results, we
                 notice that LSA provides improvements in this field.
                 Hence, the main outcome of this article emphasizes the
                 multilevel improvement of ATS based on Light10 and
                 LSA.},
  acknowledgement = ack-nhfb,
  articleno =    {12},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Fujita:2018:EPL,
  author =       {Atsushi Fujita and Pierre Isabelle},
  title =        {Expanding Paraphrase Lexicons by Exploiting
                 Generalities},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {2},
  pages =        {13:1--13:??},
  month =        feb,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3160488},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {Techniques for generating and recognizing paraphrases,
                 i.e., semantically equivalent expressions, play an
                 important role in a wide range of natural language
                 processing tasks. In the last decade, the task of
                 automatic acquisition of subsentential paraphrases,
                 i.e., words and phrases with (approximately) the same
                 meaning, has been drawing much attention in the
                 research community. The core problem is to obtain
                 paraphrases of high quality in large quantity. This
                 article presents a method for tackling this issue by
                 systematically expanding an initial seed lexicon made
                 up of high-quality paraphrases. This involves
                 automatically capturing morpho-semantic and syntactic
                 generalizations within the lexicon and using them to
                 leverage the power of large-scale monolingual data.
                 Given an input set of paraphrases, our method starts by
                 inducing paraphrase patterns that constitute
                 generalizations over corresponding pairs of lexical
                 variants, such as ``amending'' and ``amendment,'' in a
                 fully empirical way. It then searches large-scale
                 monolingual data for new paraphrases matching those
                 patterns. The results of our experiments on English,
                 French, and Japanese demonstrate that our method
                 manages to expand seed lexicons by a large multiple.
                 Human evaluation based on paraphrase substitution tests
                 reveals that the automatically acquired paraphrases are
                 also of high quality.},
  acknowledgement = ack-nhfb,
  articleno =    {13},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Wang:2018:EEW,
  author =       {Shaonan Wang and Jiajun Zhang and Chengqing Zong},
  title =        {Empirical Exploring Word-Character Relationship for
                 {Chinese} Sentence Representation},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {3},
  pages =        {14:1--14:??},
  month =        may,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3156778},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {This article addresses the problem of learning
                 compositional Chinese sentence representations, which
                 represent the meaning of a sentence by composing the
                 meanings of its constituent words. In contrast to
                 English, a Chinese word is composed of characters,
                 which contain rich semantic information. However, this
                 information has not been fully exploited by existing
                 methods. In this work, we introduce a novel, mixed
                 character-word architecture to improve the Chinese
                 sentence representations by utilizing rich semantic
                 information of inner-word characters. We propose two
                 novel strategies to reach this purpose. The first one
                 is to use a mask gate on characters, learning the
                 relation among characters in a word. The second one is
                 to use a max-pooling operation on words to adaptively
                 find the optimal mixture of the atomic and
                 compositional word representations. Finally, the
                 proposed architecture is applied to various sentence
                 composition models, which achieves substantial
                 performance gains over baseline models on sentence
                 similarity task. To further verify the generalization
                 ability of our model, we employ the learned sentence
                 representations as features in sentence classification
                 task, question classification task, and sentence
                 entailment task. Results have shown that the proposed
                 mixed character-word sentence representation models
                 outperform both the character-based and word-based
                 models.},
  acknowledgement = ack-nhfb,
  articleno =    {14},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Jia:2018:COR,
  author =       {Shengbin Jia and Shijia E. and Maozhen Li and Yang
                 Xiang},
  title =        {{Chinese} Open Relation Extraction and Knowledge Base
                 Establishment},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {3},
  pages =        {15:1--15:??},
  month =        may,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3162077},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {Named entity relation extraction is an important
                 subject in the field of information extraction.
                 Although many English extractors have achieved
                 reasonable performance, an effective system for Chinese
                 relation extraction remains undeveloped due to the lack
                 of Chinese annotation corpora and the specificity of
                 Chinese linguistics. Here, we summarize three kinds of
                 unique but common phenomena in Chinese linguistics. In
                 this article, we investigate unsupervised
                 linguistics-based Chinese open relation extraction
                 (ORE), which can automatically discover arbitrary
                 relations without any manually labeled datasets, and
                 research the establishment of a large-scale corpus. By
                 mapping the entity relations into dependency-trees and
                 considering the unique Chinese linguistic
                 characteristics, we propose a novel unsupervised
                 Chinese ORE model based on Dependency Semantic Normal
                 Forms (DSNFs). This model imposes no restrictions on
                 the relative positions among entities and relationships
                 and achieves a high yield by extracting relations
                 mediated by verbs or nouns and processing the parallel
                 clauses. Empirical results from our model demonstrate
                 the effectiveness of this method, which obtains stable
                 performance on four heterogeneous datasets and achieves
                 better precision and recall in comparison with several
                 Chinese ORE systems. Furthermore, a large-scale
                 knowledge base of entity and relation, called COER, is
                 established and published by applying our method to web
                 text, which conquers the trouble of lack of Chinese
                 corpora.},
  acknowledgement = ack-nhfb,
  articleno =    {15},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Marie:2018:PTI,
  author =       {Benjamin Marie and Atsushi Fujita},
  title =        {Phrase Table Induction Using Monolingual Data for
                 Low-Resource Statistical Machine Translation},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {3},
  pages =        {16:1--16:??},
  month =        may,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3168054},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {We propose a new method for inducing a phrase-based
                 translation model from a pair of unrelated monolingual
                 corpora. Our method is able to deal with phrases of
                 arbitrary length and to find phrase pairs that are
                 useful for statistical machine translation, without
                 requiring large parallel or comparable corpora. First,
                 our method generates phrase pairs through coupling
                 source and target phrases separately collected from
                 respective monolingual data. Then, for each phrase
                 pair, we compute features using the monolingual data
                 and a small quantity of parallel sentences. Finally,
                 incorrect phrase pairs are pruned, and a phrase table
                 is made using the remaining phrase pairs. In our
                 experiments on French--Japanese and Spanish--Japanese
                 translation tasks under low-resource conditions, we
                 observe that incorporating a phrase table induced by
                 our method to the machine translation system leads to
                 large improvements in translation quality. Furthermore,
                 we show that a phrase table induced by our method can
                 also be useful in a wide range of configurations,
                 including configurations where we have already access
                 to large parallel corpora and configurations where only
                 small monolingual corpora are available.},
  acknowledgement = ack-nhfb,
  articleno =    {16},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Salami:2018:ISS,
  author =       {Shahram Salami and Mehrnoush Shamsfard},
  title =        {Integrating Shallow Syntactic Labels in the
                 Phrase-Boundary Translation Model},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {3},
  pages =        {17:1--17:??},
  month =        may,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3178460},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {Using a novel rule labeling method, this article
                 proposes a hierarchical model for statistical machine
                 translation. The proposed model labels translation
                 rules by matching the boundaries of target side phrases
                 with the shallow syntactic labels including POS tags
                 and chunk labels on the target side of the training
                 corpus. The boundary labels are concatenated if there
                 is no label for the whole target span. Labeling with
                 the classes of boundary words on the target side
                 phrases has been previously proposed as a
                 phrase-boundary model which can be considered as the
                 base form of our model. In the extended model, the
                 labeler uses a POS tag if there is no chunk label in
                 one boundary. Using chunks as phrase labels, the
                 proposed model generalizes the rules to decrease the
                 model sparseness. The sparseness is a more important
                 issue in the language pairs with a lot of differences
                 in the word order because they have less number of
                 aligned phrase pairs for extraction of rules. The
                 extended phrase-boundary model is also applicable for
                 low-resource languages having no syntactic parser. Some
                 experiments are performed with the proposed model, the
                 base phrase-boundary model, and variants of Syntax
                 Augmented Machine Translation (SAMT) in translation
                 from Persian and German to English as source and target
                 languages with different word orders. According to the
                 results, the proposed model improves the translation
                 performance in the quality and decoding time aspects.
                 Using BLEU as our metric, the proposed model has
                 achieved a statistically significant improvement of
                 about 0.5 point over the base phrase-boundary model.},
  acknowledgement = ack-nhfb,
  articleno =    {17},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Sherkawi:2018:ASA,
  author =       {Lina Sherkawi and Nada Ghneim and Oumayma {Al
                 Dakkak}},
  title =        {{Arabic} Speech Act Recognition Techniques},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {3},
  pages =        {18:1--18:??},
  month =        may,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3170576},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {This article presents rule-based and statistical-based
                 techniques for Arabic speech act recognition. The
                 proposed techniques classify an utterance into Arabic
                 speech act categories based on three criteria: surface
                 features, cue words, and contextual information. A
                 rule-based expert system has been developed in a
                 bootstrapping manner based on the fact that Arabic
                 language syntax is inherently rule-based. Various
                 machine-learning algorithms have been used to detect
                 Arabic speech act categories: Decision Tree, Na{\"\i}ve
                 Bayes, Neural Network, and SVM. We compare the
                 experimental results for both techniques
                 (machine-learning and rule-based expert systems). Using
                 a corpus of 1,500 sentences, the rule-based expert
                 system achieved an accuracy rate of 98.92\%, while the
                 Decision Tree, Na{\"\i}ve Bayes, Neural Network, and
                 SVM achieved an accuracy rate of 97.09\%, 96.48\%,
                 93.50\%, and 93.70\%, respectively.},
  acknowledgement = ack-nhfb,
  articleno =    {18},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Jung:2018:EEK,
  author =       {Sangkeun Jung and Changki Lee and Hyunsun Hwang},
  title =        {End-to-End {Korean} Part-of-Speech Tagging Using
                 Copying Mechanism},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {3},
  pages =        {19:1--19:??},
  month =        may,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3178458},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {In this article, we introduce a novel neural
                 architecture for the end-to-end Korean Part-of-Speech
                 (POS) tagging problem. To address the problem, we
                 extend the present recurrent neural network-based
                 sequence-to-sequence models to deal with the key
                 challenges in this task: rare word generation and POS
                 tagging. To overcome these issues, Input-Feeding and
                 Copying mechanism are adopted. Although our approach
                 does not require any manual features or preprocessed
                 pattern matching dictionaries, our best single model
                 achieves an F-score of 97.08. This is competitive with
                 the current state-of-the-art model (F-score 98.03),
                 which requires extensive manual feature processing.},
  acknowledgement = ack-nhfb,
  articleno =    {19},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Sen:2018:AST,
  author =       {Shibaprasad Sen and Ankan Bhattacharyya and Pawan
                 Kumar Singh and Ram Sarkar and Kaushik Roy and David
                 Doermann},
  title =        {Application of Structural and Topological Features to
                 Recognize Online Handwritten {Bangla} Characters},
  journal =      j-TALLIP,
  volume =       {17},
  number =       {3},
  pages =        {20:1--20:??},
  month =        may,
  year =         {2018},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3178457},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:31 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  abstract =     {This article presents a set of novel features for
                 robust online Bangla handwritten character recognition.
                 Two feature extraction methods are presented here. The
                 first describes the transition from background to
                 foreground pixels and vice versa. The second uses a
                 combination of topological features and
                 centre-of-gravity- (CG) based circular features where
                 global information, local information, and Circular
                 Quadrant Mass Distribution information have been
                 extracted. The impact of each along with their
                 combination have also been analyzed. A total of 15,000
                 isolated online Bangla character samples have been
                 collected and used for the evaluation. A Support Vector
                 Machine classifier records the best recognition rate
                 when the transition count feature, CG-based circular
                 features, and topological features are combined.},
  acknowledgement = ack-nhfb,
  articleno =    {20},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

%%% TALLIP 17(3), article 21: mutual-learning neural model using
%%% hierarchical deep semantics to classify implicit discourse
%%% relations in English.
@Article{She:2018:LHD,
  author =       "Xiaohan She and Ping Jian and Pengcheng Zhang and
                 Heyan Huang",
  title =        "Leveraging Hierarchical Deep Semantics to Classify
                 Implicit Discourse Relations via a Mutual Learning
                 Method",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "3",
  pages =        "21:1--21:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3178456",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "This article presents a mutual learning method using
                 hierarchical deep semantics for the classification of
                 implicit discourse relations in English. With the
                 absence of explicit discourse markers, traditional
                 discourse techniques mainly concentrate on discrete
                 linguistic features in this task, which always leads to
                 a data sparseness problem. To relieve this problem, we
                 propose a mutual learning neural model that makes use
                 of multilevel semantic information together, including
                 the distribution of implicit discourse relations, the
                 semantics of arguments, and the co-occurrence of
                 phrases and words. During the training process, the
                 predicting targets of the model, which are the
                 probability of the discourse relation type and the
                 distributed representation of semantic components, are
                 learned jointly and optimized mutually. The
                 experimental results show that this method outperforms
                 the previous works, especially in multiclass
                 identification attributed to the hierarchical semantic
                 representations and the mutual learning strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(3), article 22: morphological segmenter (96.8% word
%%% accuracy) and POS tagger (96.22%) for Classical Arabic, built on
%%% 60,000 newly annotated words.
@Article{Mohamed:2018:MSP,
  author =       "Emad Mohamed",
  title =        "Morphological Segmentation and Part-of-Speech Tagging
                 for the {Arabic} Heritage",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "3",
  pages =        "22:1--22:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3178459",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "We annotate 60,000 words of Classical Arabic (CA) with
                 topics in philosophy, religion, literature, and law
                 with fine-grain segment-based morphological
                 descriptions. We use these annotations for building a
                 morphological segmenter and part-of-speech (POS) tagger
                 for CA. With character-level classification and
                 features from the word and its lexical context, the
                 segmenter achieves a word accuracy of 96.8\% with the
                 main issue being a high rate of out-of-vocabulary
                 words. A token-based POS tagger achieves an accuracy of
                 96.22\% with 97.72\% on known tokens despite the small
                 size of the corpus. An error analysis shows that most
                 of the tagging errors are results of segmentation and
                 that quality improves with more data being added. The
                 morphological segmenter and tagger have a wide range of
                 potential applications in processing CA, a low-resource
                 variety of the language.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(3), article 23: three-stage framework combining lexicon,
%%% statistical, and embedding-based similarities (with prior
%%% knowledge) for Chinese word similarity; evaluated on PKU-500,
%%% MC-30, and SemEval-2012.
@Article{Huang:2018:IPK,
  author =       "Degen Huang and Jiahuan Pei and Cong Zhang and Kaiyu
                 Huang and Jianjun Ma",
  title =        "Incorporating Prior Knowledge into Word Embedding for
                 {Chinese} Word Similarity Measurement",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "3",
  pages =        "23:1--23:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3182622",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Word embedding-based methods have received increasing
                 attention for their flexibility and effectiveness in
                 many natural language-processing (NLP) tasks, including
                 Word Similarity (WS). However, these approaches rely on
                 high-quality corpus and neglect prior knowledge.
                 Lexicon-based methods concentrate on human's
                 intelligence contained in semantic resources, e.g.,
                 Tongyici Cilin, HowNet, and Chinese WordNet, but they
                 have the drawback of being unable to deal with unknown
                 words. This article proposes a three-stage framework
                 for measuring the Chinese word similarity by
                 incorporating prior knowledge obtained from lexicons
                 and statistics into word embedding: in the first stage,
                 we utilize retrieval techniques to crawl the contexts
                 of word pairs from web resources to extend context
                 corpus. In the next stage, we investigate three types
                 of single similarity measurements, including lexicon
                 similarities, statistical similarities, and
                 embedding-based similarities. Finally, we exploit
                 simple combination strategies with math operations and
                 the counter-fitting combination strategy using
                 optimization method. To demonstrate our system's
                 efficiency, comparable experiments are conducted on the
                 PKU-500 dataset. Our final results are 0.561/0.516 of
                 Spearman/Pearson rank correlation coefficient, which
                 outperform the state-of-the-art performance to the best
                 of our knowledge. Experiment results on Chinese MC-30
                 and SemEval-2012 datasets show that our system also
                 performs well on other Chinese datasets, which proves
                 its transferability. Besides, our system is not
                 language-specific and can be applied to other
                 languages, e.g., English.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(3), article 24: construction of a Turkish WordNet from
%%% dictionary mining with doubly-annotated synonym pairs and
%%% Wikipedia-mined hypernym relations.
@Article{Ehsani:2018:CWT,
  author =       "Razieh Ehsani and Ercan Solak and Olcay Taner Yildiz",
  title =        "Constructing a {WordNet} for {Turkish} Using Manual
                 and Automatic Annotation",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "3",
  pages =        "24:1--24:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3185664",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "In this article, we summarize the methodology and the
                 results of our 2-year-long efforts to construct a
                 comprehensive WordNet for Turkish. In our approach, we
                 mine a dictionary for synonym candidate pairs and
                 manually mark the senses in which the candidates are
                 synonymous. We marked every pair twice by different
                 human annotators. We derive the synsets by finding the
                 connected components of the graph whose edges are
                 synonym senses. We also mined Turkish Wikipedia for
                 hypernym relations among the senses. We analyzed the
                 resulting WordNet to highlight the difficulties brought
                 about by the dictionary construction methods of
                 lexicographers. After splitting the unusually large
                 synsets, we used random walk-based clustering that
                 resulted in a Zipfian distribution of synset sizes. We
                 compared our results to BalkaNet and automatic
                 thesaurus construction methods using variation of
                 information metric. Our Turkish WordNet is available
                 online.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(3), article 25: learning-to-recommend framework with
%%% serendipity-oriented feature sets for entity recommendation in a
%%% commercial Web search engine.
%%% NOTE(review): title capitalizes ``With'' unlike sibling entries
%%% (``with''); presumably verbatim from the publisher -- confirm
%%% against the ACM DL before normalizing.
@Article{Huang:2018:LRR,
  author =       "Jizhou Huang and Shiqiang Ding and Haifeng Wang and
                 Ting Liu",
  title =        "Learning to Recommend Related Entities With
                 Serendipity for {Web} Search Users",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "3",
  pages =        "25:1--25:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3185663",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Entity recommendation, providing entity suggestions to
                 assist users in discovering interesting information,
                 has become an indispensable feature of today's Web
                 search engine. However, the majority of existing entity
                 recommendation methods are not designed to boost the
                 performance in terms of serendipity, which also plays
                 an important role in the appreciation of users for a
                 recommendation system. To keep users engaged, it is
                 important to take into account serendipity when
                 building an entity recommendation system. In this
                 article, we propose a learning to recommend framework
                 that consists of two components: related entity finding
                 and candidate entity ranking. To boost serendipity
                 performance, three different sets of features that
                 correlate with the three aspects of serendipity are
                 employed in the proposed framework. Extensive
                 experiments are conducted on large-scale, real-world
                 datasets collected from a widely used commercial Web
                 search engine. The experiments show that our method
                 significantly outperforms several strong baseline
                 methods. An analysis on the impact of features reveals
                 that the set of interestingness features is the most
                 powerful feature set, and the set of unexpectedness
                 features can significantly contribute to recommendation
                 effectiveness. In addition, online controlled
                 experiments conducted on a commercial Web search engine
                 demonstrate that our method can significantly improve
                 user engagement against multiple baseline methods. This
                 further confirms the effectiveness of the proposed
                 framework.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 26: Persian sentiment-analysis resources
%%% (PerLex lexicon, PerView dataset of ~16,000 rated documents) and a
%%% hybrid ML/lexicon-based method.
@Article{Basiri:2018:WII,
  author =       "Mohammad Ehsan Basiri and Arman Kabiri",
  title =        "Words Are Important: Improving Sentiment Analysis in
                 the {Persian} Language by Lexicon Refining",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "26:1--26:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3195633",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Lexicon-based sentiment analysis (SA) aims to address
                 the problem of extracting people's opinions from their
                 comments on the Web using a predefined lexicon of
                 opinionated words. In contrast to the machine learning
                 (ML) approach, lexicon-based methods are
                 domain-independent methods that do not need a large
                 annotated training corpus and hence are faster. This
                 makes the lexicon-based approach prevalent in the SA
                 community. However, the story is different for the
                 Persian language. In contrast to English, using the
                 lexicon-based method in Persian is a new discipline.
                 There are rather limited resources available for SA in
                 Persian, making the accuracy of the existing
                 lexicon-based methods lower than other languages. In
                 the current study, first an exhaustive investigation of
                 the lexicon-based method is performed. Then two new
                 resources are introduced to address the problem of
                 resource scarcity for SA in Persian: a carefully
                 labeled lexicon of sentiment words, PerLex, and a new
                 handmade dataset of about 16,000 rated documents,
                 PerView. Moreover, a new hybrid method using both ML
                 and the lexicon-based approach is presented in which
                 PerLex words are used to train the ML algorithm.
                 Experiments are carried out on our new PerView dataset.
                 Results indicate that the accuracy of PerLex is higher
                 than the existing CNRC, Adjectives, SentiStrength,
                 PerSent, and LexiPers lexicons. In addition, the
                 results show that using PerLex significantly decreases
                 the execution time of the proposed system in comparison
                 to the above-mentioned lexicons. Moreover, the results
                 demonstrate the excellence of using opinionated lexicon
                 terms followed by bigrams as the features employed in
                 the ML method.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 27: iterative rule-based Sundanese stemmer
%%% removing derivational affixes before inflexional ones; evaluated on
%%% 4,453 unique affixed words.
%%% Fix: the abstract previously ended ``... respectively'' with no
%%% sentence-terminating period; the period is restored for
%%% consistency with every other abstract in this file.
@Article{Suryani:2018:RBS,
  author =       "Arie Ardiyanti Suryani and Dwi Hendratmo Widyantoro
                 and Ayu Purwarianti and Yayat Sudaryat",
  title =        "The Rule-Based Sundanese Stemmer",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "27:1--27:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3195634",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Our research proposed an iterative Sundanese stemmer
                 by removing the derivational affixes prior to the
                 inflexional. This scheme was chosen because, in the
                 Sundanese affixation, a confix (one of derivational
                 affix) is applied in the last phase of a morphological
                 process. Moreover, most of Sundanese affixes are
                 derivational, so removing the derivational affix as the
                 first step is reasonable. To handle ambiguity, the last
                 recognized affix was returned as the result. As the
                 baseline, a Confix-Stripping Approach that applies
                 Porter Stemmer for the Indonesian language was used.
                 This stemmer shares similarities in terms of affix
                 type, but uses a different stemming order. To observe
                 whether the baseline stems the Sundanese affixed word
                 properly, some features that were not covered by the
                 baseline, such as the infix and allomorph removal, were
                 added. The evaluation was done using 4,453 unique
                 affixed words collected from Sundanese online
                 magazines. The experiment shows that, as a whole, our
                 stemmer outperforms the modified baseline in terms of
                 recognized affixed type accuracy and properly stemmed
                 affixed words. Our stemmer recognized 68.87\% of the
                 Sundanese affixed types and produced 96.79\% of the
                 correctly affixed words; the modified baseline resulted
                 in 21.70\% and 71.59\%, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 28: graph-based dependency parser for
%%% spontaneous spoken Chinese, with the new D-ESCSC corpus and
%%% punctuation-related atom feature and templates.
@Article{He:2018:DPS,
  author =       "Ruifang He and Yaru Wang and Dawei Song and Peng Zhang
                 and Yuan Jia and Aijun Li",
  title =        "A Dependency Parser for Spontaneous {Chinese} Spoken
                 Language",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "28:1--28:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3196278",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Dependency analysis is vital for spoken language
                 understanding in spoken dialogue systems. However,
                 existing research has mainly focused on western spoken
                 languages, Japanese, and so on. Little research has
                 been done for spoken Chinese in terms of dependency
                 parsing. Therefore, the new spoken corpus, D-ESCSC
                 (Dependency-Expressive Speech Corpus of Standard
                 Chinese) is built by adding new dependency relations
                 special to spoken Chinese based on a written Chinese
                 annotation scheme. Since spoken Chinese contains
                 typical ill-grammatical phenomena, e.g., translocation,
                 repetition, duplication, and omission, the new atom
                 feature related to punctuation and three feature
                 templates are proposed to improve a graph-based
                 dependency parser. Experimental results on spoken
                 Chinese corpus show that the atom feature and three
                 templates really work and the new parser outperforms
                 the baseline parser. To our best knowledge, it is the
                 first work to report dependency parsing results of
                 spoken Chinese.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 29: kernel canonical correlation analysis for
%%% non-linear cross-lingual word-embedding mappings, evaluated on
%%% three tasks across five language pairs.
%%% NOTE(review): title capitalizes ``Via'' unlike the lowercase
%%% prepositions in sibling entries; presumably verbatim from the
%%% publisher -- confirm against the ACM DL before normalizing.
@Article{Bai:2018:IVS,
  author =       "Xuefeng Bai and Hailong Cao and Tiejun Zhao",
  title =        "Improving Vector Space Word Representations Via Kernel
                 Canonical Correlation Analysis",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "29:1--29:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3197566",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Cross-lingual word embeddings are representations for
                 vocabularies of two or more languages in one common
                 continuous vector space and are widely used in various
                 natural language processing tasks. A state-of-the-art
                 way to generate cross-lingual word embeddings is to
                 learn a linear mapping, with an assumption that the
                 vector representations of similar words in different
                 languages are related by a linear relationship.
                 However, this assumption does not always hold true,
                 especially for substantially different languages. We
                 therefore propose to use kernel canonical correlation
                 analysis to capture a non-linear relationship between
                 word embeddings of two languages. By extensively
                 evaluating the learned word embeddings on three tasks
                 (word similarity, cross-lingual dictionary induction,
                 and cross-lingual document classification) across five
                 language pairs, we demonstrate that our proposed
                 approach achieves essentially better performances than
                 previous linear methods on all of the three tasks,
                 especially for language pairs with substantial
                 typological difference.",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 30: character identification in Korean
%%% novels (``Novel'' = work of fiction, not ``new'') using semantic
%%% relations with known animate nouns; evaluated on 80 novels.
@Article{Park:2018:NCI,
  author =       "Taekeun Park and Seung-Hoon Kim",
  title =        "Novel Character Identification Utilizing Semantic
                 Relation with Animate Nouns in {Korean}",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "30:1--30:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3197657",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "For identifying speakers of quoted speech or
                 extracting social networks from literature, it is
                 indispensable to extract character names and nominals.
                 However, detecting proper nouns in the novels
                 translated into or written in Korean is harder than in
                 English because Korean does not have a capitalization
                 feature. In addition, it is almost impossible for any
                 proper noun dictionary to include all kinds of
                 character names that have been created or will be
                 created by authors. Fortunately, a previous study shows
                 that utilizing postpositions for animate nouns is a
                 simple and effective tool for character identification
                 in Korean novels without a proper noun dictionary and a
                 training corpus. In this article, we propose a
                 character identification method utilizing the semantic
                 relation with known animate nouns. For 80 novels in
                 Korean, the proposed method increases the micro- and
                 macro-average recall by 13.68\% and 11.86\%,
                 respectively, while decreasing the micro-average
                 precision by 0.28\% and increasing the macro-average
                 precision by 0.07\% compared to the previous study. If
                 we focus on characters that are responsible for more
                 than 1\% of the character name mentions in each novel,
                 the micro- and macro-average F-measure of the proposed
                 method are 96.98\% and 97.32\%, respectively.",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 31: Graph-Based Bilingual Word Embedding
%%% (GBWE) projecting bilingual word senses via co-occurrence-graph
%%% cliques; applied to lexical and phrase-based SMT tasks.
@Article{Wang:2018:GBB,
  author =       "Rui Wang and Hai Zhao and Sabine Ploux and Bao-Liang
                 Lu and Masao Utiyama and Eiichiro Sumita",
  title =        "Graph-Based Bilingual Word Embedding for Statistical
                 Machine Translation",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "31:1--31:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3203078",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Bilingual word embedding has been shown to be helpful
                 for Statistical Machine Translation (SMT). However,
                 most existing methods suffer from two obvious
                 drawbacks. First, they only focus on simple contexts
                 such as an entire document or a fixed-sized sliding
                 window to build word embedding and ignore latent useful
                 information from the selected context. Second, the word
                 sense but not the word should be the minimal semantic
                 unit; however, most existing methods still use word
                 representation. To overcome these drawbacks, this
                 article presents a novel Graph-Based Bilingual Word
                 Embedding (GBWE) method that projects bilingual word
                 senses into a multidimensional semantic space. First, a
                 bilingual word co-occurrence graph is constructed using
                 the co-occurrence and pointwise mutual information
                 between the words. Then, maximum complete subgraphs
                 (cliques), which play the role of a minimal unit for
                 bilingual sense representation, are dynamically
                 extracted according to the contextual information.
                 Consequently, correspondence analysis, principal
                 component analyses, and neural networks are used to
                 summarize the clique-word matrix into lower dimensions
                 to build the embedding model. Without contextual
                 information, the proposed GBWE can be applied to
                 lexical translation. In addition, given contextual
                 information, GBWE is able to give a dynamic solution
                 for bilingual word representations, which can be
                 applied to phrase translation and generation. Empirical
                 results show that GBWE can enhance the performance of
                 lexical translation, as well as
                 Chinese/French-to-English and Chinese-to-Japanese
                 phrase-based SMT tasks (IWSLT, NTCIR, NIST, and WAT).",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 32: CLASENTI, a class-specific Arabic
%%% sentiment-analysis framework with multi-faceted corpus/lexicon
%%% annotation and a hybrid corpus/lexicon classification model.
%%% NOTE(review): the abstract writes ``15274 reviews'' without the
%%% thousands separator used elsewhere in this file (``15,000'',
%%% ``4,453''); presumably verbatim from the publisher's abstract --
%%% verify before normalizing.
@Article{Hamdi:2018:CCS,
  author =       "Ali Hamdi and Khaled Shaban and Anazida Zainal",
  title =        "{CLASENTI}: a Class-Specific Sentiment Analysis
                 Framework",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "32:1--32:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209885",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Arabic text sentiment analysis suffers from low
                 accuracy due to Arabic-specific challenges (e.g.,
                 limited resources, morphological complexity, and
                 dialects) and general linguistic issues (e.g.,
                 fuzziness, implicit sentiment, sarcasm, and spam). The
                 limited resources problem requires efforts to build new
                 and improved Arabic corpora and lexica. We propose a
                 class-specific sentiment analysis (CLASENTI) framework.
                 The framework includes a new annotation approach to
                 build multi-faceted Arabic corpus and lexicon allowing
                 for simultaneous annotation of different facets,
                 including domains, dialects, linguistic issues, and
                 polarity strengths. Each of these facets has multiple
                 classes (e.g., the nine classes representing dialects
                 found in the Arab world). The new corpus and lexicon
                 annotations facilitate the development of new
                 class-specific classification models and polarity
                 strength calculation. For the new sentiment
                 classification models, we propose a hybrid model
                 combining corpus-based and lexicon-based models. The
                 corpus-based model has two interrelated phases to
                 build; (1) full-corpus classification models for all
                 facets; and (2) class-specific models trained on
                 filtered subsets of the corpus according to the
                 performances of the full-corpus models. To calculate
                 polarity strengths, the lexicon-based model filters the
                 annotated lexicon based on the specific classes of the
                 domain and dialect. As a case study, we collect and
                 annotate 15274 reviews from various sources, including
                 surveys, Facebook comments, and Twitter posts,
                 pertaining to governmental services. In addition, we
                 develop a new web-based application to apply the
                 proposed framework on the case study. CLASENTI
                 framework reaches up to 95\% accuracy and 93\% F1-Score
                 surpassing the best-known sentiment classifiers
                 implemented in Scikit-learn library that achieve 82\%
                 accuracy and 81\% F1-Score for Arabic when tested on
                 the same dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 33, August 2018; DOI 10.1145/3213544.
@Article{Wang:2018:DSN,
  author =       "Limin Wang and Shoushan Li and Qian Yan and Guodong
                 Zhou",
  title =        "Domain-specific Named Entity Recognition with
                 Document-Level Optimization",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "33:1--33:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3213544",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Previous studies normally formulate named entity
                 recognition (NER) as a sequence labeling task and
                 optimize the solution in the sentence level. In this
                 article, we propose a document-level optimization
                 approach to NER and apply it in a domain-specific
                 document-level NER task. As a baseline, we apply a
                 state-of-the-art approach, i.e., long-short-term memory
                 (LSTM), to perform word classification. On this basis,
                 we define a global objective function with the obtained
                 word classification results and achieve global
                 optimization via Integer Linear Programming (ILP).
                 Specifically, in the ILP-based approach, we propose
                 four kinds of constraints, i.e., label transition,
                 entity length, label consistency, and domain-specific
                 regulation constraints, to incorporate various entity
                 recognition knowledge in the document level. Empirical
                 studies demonstrate the effectiveness of the proposed
                 approach to domain-specific document-level NER.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 34, August 2018; DOI 10.1145/3218820.
@Article{Komiya:2018:CMA,
  author =       "Kanako Komiya and Masaya Suzuki and Tomoya Iwakura and
                 Minoru Sasaki and Hiroyuki Shinnou",
  title =        "Comparison of Methods to Annotate Named Entity
                 Corpora",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "34:1--34:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3218820",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "The authors compared two methods for annotating a
                 corpus for the named entity (NE) recognition task using
                 non-expert annotators: (i) revising the results of an
                 existing NE recognizer and (ii) manually annotating the
                 NEs completely. The annotation time, degree of
                 agreement, and performance were evaluated based on the
                 gold standard. Because there were two annotators for
                 one text for each method, two performances were
                 evaluated: the average performance of both annotators
                 and the performance when at least one annotator is
                 correct. The experiments reveal that semi-automatic
                 annotation is faster, achieves better agreement, and
                 performs better on average. However, they also indicate
                 that sometimes, fully manual annotation should be used
                 for some texts whose document types are substantially
                 different from the training data document types. In
                 addition, the machine learning experiments using
                 semi-automatic and fully manually annotated corpora as
                 training data indicate that the F-measures could be
                 better for some texts when manual instead of
                 semi-automatic annotation was used. Finally,
                 experiments using the annotated corpora for training as
                 additional corpora show that (i) the NE recognition
                 performance does not always correspond to the
                 performance of the NE tag annotation and (ii) the
                 system trained with the manually annotated corpus
                 outperforms the system trained with the
                 semi-automatically annotated corpus with respect to
                 newswires, even though the existing NE recognizer was
                 mainly trained with newswires.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 17(4), article 35, August 2018; DOI 10.1145/3214707.
%%% Abstract repaired: stray space in ``L -bit'' removed, and the
%%% coding-matrix value set braces escaped as \{1, -1\} so they render
%%% literally (unescaped braces are TeX grouping and disappear).
@Article{Zhou:2018:WSP,
  author =       "Deyu Zhou and Zhikai Zhang and Min-Ling Zhang and
                 Yulan He",
  title =        "Weakly Supervised {POS} Tagging without
                 Disambiguation",
  journal =      j-TALLIP,
  volume =       "17",
  number =       "4",
  pages =        "35:1--35:??",
  month =        aug,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3214707",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:31 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  abstract =     "Weakly supervised part-of-speech (POS) tagging is to
                 learn to predict the POS tag for a given word in
                 context by making use of partial annotated data instead
                 of the fully tagged corpora. Weakly supervised POS
                 tagging would benefit various natural language
                 processing applications in such languages where tagged
                 corpora are mostly unavailable. In this article, we
                 propose a novel framework for weakly supervised POS
                 tagging based on a dictionary of words with their
                 possible POS tags. In the constrained error-correcting
                 output codes (ECOC)-based approach, a unique L-bit
                 vector is assigned to each POS tag. The set of
                 bitvectors is referred to as a coding matrix with value
                 \{1, -1\}. Each column of the coding matrix specifies a
                 dichotomy over the tag space to learn a binary
                 classifier. For each binary classifier, its training
                 data is generated in the following way: each pair of
                 words and its possible POS tags are considered as a
                 positive training example only if the whole set of its
                 possible tags falls into the positive dichotomy
                 specified by the column coding and similarly for
                 negative training examples. Given a word in context,
                 its POS tag is predicted by concatenating the
                 predictive outputs of the L binary classifiers and
                 choosing the tag with the closest distance according to
                 some measure. By incorporating the ECOC strategy, the
                 set of all possible tags for each word is treated as an
                 entirety without the need of performing disambiguation.
                 Moreover, instead of manual feature engineering
                 employed in most previous POS tagging approaches,
                 features for training and testing in the proposed
                 framework are automatically generated using neural
                 language modeling. The proposed framework has been
                 evaluated on three corpora for English, Italian, and
                 Malagasy POS tagging, achieving accuracies of 93.21\%,
                 90.9\%, and 84.5\% individually, which shows a
                 significant improvement compared to the
                 state-of-the-art approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 1, January 2019; DOI 10.1145/3208358.
@Article{Bhattacharya:2019:UCW,
  author =       "Paheli Bhattacharya and Pawan Goyal and Sudeshna
                 Sarkar",
  title =        "Using Communities of Words Derived from Multilingual
                 Word Vectors for Cross-Language Information Retrieval
                 in {Indian} Languages",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3208358",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3208358",
  abstract =     "We investigate the use of word embeddings for query
                 translation to improve precision in cross-language
                 information retrieval (CLIR). Word vectors represent
                 words in a distributional space such that syntactically
                 or semantically similar words are close to each other
                 in this space. Multilingual word embeddings are
                 constructed in such a way that similar words across
                 languages have similar vector representations. We
                 explore the effective use of bilingual and multilingual
                 word embeddings learned from comparable corpora of
                 Indic languages to the task of CLIR. We propose a
                 clustering method based on the multilingual word
                 vectors to group similar words across languages. For
                 this we construct a graph with words from multiple
                 languages as nodes and with edges connecting words with
                 similar vectors. We use the Louvain method for
                 community detection to find communities in this graph.
                 We show that choosing target language words as query
                 translations from the clusters or communities
                 containing the query terms helps in improving CLIR. We
                 also find that better-quality query translations are
                 obtained when words from more languages are used to do
                 the clustering even when the additional languages are
                 neither the source nor the target languages. This is
                 probably because having more similar words across
                 multiple languages helps define well-defined dense
                 subclusters that help us obtain precise query
                 translations. In this article, we demonstrate the use
                 of multilingual word embeddings and word clusters for
                 CLIR involving Indic languages. We also make available
                 a tool for obtaining related words and the
                 visualizations of the multilingual word vectors for
                 English, Hindi, Bengali, Marathi, Gujarati, and
                 Tamil.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 2, January 2019; DOI 10.1145/3226045.
@Article{Li:2019:OAE,
  author =       "Maoxi Li and Mingwen Wang",
  title =        "Optimizing Automatic Evaluation of Machine Translation
                 with the {ListMLE} Approach",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3226045",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3226045",
  abstract =     "Automatic evaluation of machine translation is
                 critical for the evaluation and development of machine
                 translation systems. In this study, we propose a new
                 model for automatic evaluation of machine translation.
                 The proposed model combines standard n-gram precision
                 features and sentence semantic mapping features with
                 neural features, including neural language model
                 probabilities and the embedding distances between
                 translation outputs and their reference translations.
                 We optimize the model with a representative list-wise
                 learning to rank approach, ListMLE, in terms of human
                 ranking assessments. The experimental results on
                 WMT'2015 Metrics task indicated that the proposed
                 approach yields significantly better correlations with
                 human assessments than several state-of-the-art
                 baseline approaches. In particular, the results
                 confirmed that the proposed list-wise learning to rank
                 approach is useful and powerful for optimizing
                 automatic evaluation metrics in terms of human ranking
                 assessments. Deep analysis also demonstrated that
                 optimizing automatic metrics with the ListMLE approach
                 is a reasonable method and adding the neural features
                 can gain considerable improvements compared with the
                 traditional features.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 3, January 2019; DOI 10.1145/3229184.
@Article{Su:2019:RSA,
  author =       "Ming-Hsiang Su and Chung-Hsien Wu and Kun-Yi Huang and
                 Wu-Hsuan Lin",
  title =        "Response Selection and Automatic Message-Response
                 Expansion in Retrieval-Based {QA} Systems using
                 Semantic Dependency Pair Model",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3229184",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3229184",
  abstract =     "This article presents an approach to response
                 selection and message-response (MR) database expansion
                 from the unstructured data on the psychological
                 consultation websites for a retrieval-based question
                 answering (QA) system in a constrained domain for
                 emotional support and comforting. First, we manually
                 construct an initial MR database based on the articles
                 collected from the psychological consultation websites.
                 The Chinese Knowledge and Information Processing
                 probabilistic context-free grammar is adopted to obtain
                 the semantic dependency graphs (SDGs) of all the
                 messages and responses in the initial MR database. For
                 each sentence in the MR database, all the semantic
                 dependencies, each composed of two words and their
                 semantic relation, are extracted from the SDG of the
                 sentence to form a semantic dependency set. Finally, a
                 matrix with the element representing the correlation
                 between the semantic dependencies of the messages and
                 their corresponding responses is constructed as a
                 semantic dependency pair model (SDPM) for response
                 selection. Moreover, as the number of MR pairs in the
                 psychological consultation websites is increasing day
                 by day, the MR database in the QA system should be
                 expanded to meet the needs of the users. For MR
                 database expansion, the unstructured data from the
                 message board are automatically collected. For the
                 collected data, the supervised latent Dirichlet
                 allocation is adopted for event detection and then the
                 event-based delta Bayesian Information Criterion is
                 used for message and response article segmentation.
                 Each extracted message segment is then fed to the
                 constructed retrieval-based QA system to find the best
                 matched response segment and the matching score is also
                 estimated to verify if the new MR pair is suitable to
                 be included in the expanded MR database. Fivefold cross
                 validation was employed to evaluate the performance of
                 the proposed retrieval-based QA system over the
                 expanded MR database based on SDPM. Compared to the
                 vector space model-based method, the Okapi BM25 model,
                 and the deep learning-based sequence-to-sequence with
                 attention model, the proposed approach achieved a more
                 favorable performance according to a statistical
                 significance test. The retrieval accuracy based on MR
                 expansion was also evaluated and a satisfactory result
                 was obtained confirming the effectiveness of the
                 expanded MR database. In addition, the user's
                 satisfaction score of the proposed system was evaluated
                 using the Cronbach's alpha value and the satisfaction
                 score of the proposed SDPM was higher than those of the
                 methods for comparison.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 4, January 2019; DOI 10.1145/3230638.
@Article{Huang:2019:IMH,
  author =       "Guoping Huang and Jiajun Zhang and Yu Zhou and
                 Chengqing Zong",
  title =        "Input Method for Human Translators: a Novel Approach
                 to Integrate Machine Translation Effectively and
                 Imperceptibly",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3230638",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3230638",
  abstract =     "Computer-aided translation (CAT) systems are the most
                 popular tool for helping human translators efficiently
                 perform language translation. To further improve the
                 translation efficiency, there is an increasing interest
                 in applying machine translation (MT) technology to
                 upgrade CAT. To thoroughly integrate MT into CAT
                 systems, in this article, we propose a novel approach:
                 a new input method that makes full use of the knowledge
                 adopted by MT systems, such as translation rules,
                 decoding hypotheses, and n-best translation lists. The
                 proposed input method contains two parts: a phrase
                 generation model, allowing human translators to type
                 target sentences quickly, and an n-gram prediction
                 model, helping users choose perfect MT fragments
                 smoothly. In addition, to tune the underlying MT system
                 to generate the input method preferable results, we
                 design a new evaluation metric for the MT system. The
                 proposed input method integrates MT effectively and
                 imperceptibly, and it is particularly suitable for many
                 target languages with complex characters, such as
                 Chinese and Japanese. The extensive experiments
                 demonstrate that our method saves more than 23\% in
                 time and over 42\% in keystrokes, and it also improves
                 the translation quality by more than 5 absolute BLEU
                 scores compared with the strong baseline, i.e.,
                 post-editing using Google Pinyin.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 5, January 2019; DOI 10.1145/3236391.
%%% Abstract repaired: extraction artifacts ``n -grams'' (stray space
%%% before the hyphen) normalized to ``n-grams''.
@Article{Altakrori:2019:AAA,
  author =       "Malik H. Altakrori and Farkhund Iqbal and Benjamin C.
                 M. Fung and Steven H. H. Ding and Abdallah Tubaishat",
  title =        "{Arabic} Authorship Attribution: an Extensive Study on
                 {Twitter} Posts",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3236391",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3236391",
  abstract =     "Law enforcement faces problems in tracing the true
                 identity of offenders in cybercrime investigations.
                 Most offenders mask their true identity, impersonate
                 people of high authority, or use identity deception and
                 obfuscation tactics to avoid detection and
                 traceability. To address the problem of anonymity,
                 authorship analysis is used to identify individuals by
                 their writing styles without knowing their actual
                 identities. Most authorship studies are dedicated to
                 English due to its widespread use over the Internet,
                 but recent cyber-attacks such as the distribution of
                 Stuxnet indicate that Internet crimes are not limited
                 to a certain community, language, culture, ideology, or
                 ethnicity. To effectively investigate cybercrime and to
                 address the problem of anonymity in online
                 communication, there is a pressing need to study
                 authorship analysis of languages such as Arabic,
                 Chinese, Turkish, and so on. Arabic, the focus of this
                 study, is the fourth most widely used language on the
                 Internet. This study investigates authorship of Arabic
                 discourse/text, especially tiny text, Twitter posts. We
                 benchmark the performance of a profile-based approach
                 that uses n-grams as features and compare it with
                 state-of-the-art instance-based classification
                 techniques. Then we adapt an event-visualization tool
                 that is developed for English to accommodate both
                 Arabic and English languages and visualize the result
                 of the attribution evidence. In addition, we
                 investigate the relative effect of the training set,
                 the length of tweets, and the number of authors on
                 authorship classification accuracy. Finally, we show
                 that diacritics have an insignificant effect on the
                 attribution process and part-of-speech tags are less
                 effective than character-level and word-level
                 n-grams.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 6, January 2019; DOI 10.1145/3232537.
%%% NOTE(review): abstract reads ``Burmese is an isolated language'';
%%% the intended linguistic term is probably ``isolating language'',
%%% but the text is kept as published by ACM -- confirm against the
%%% published abstract before changing.
@Article{Zhang:2019:WSB,
  author =       "Shaoning Zhang and Cunli Mao and Zhengtao Yu and
                 Hongbin Wang and Zhongwei Li and Jiafu Zhang",
  title =        "Word Segmentation for {Burmese} Based on Dual-Layer
                 {CRFs}",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3232537",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3232537",
  abstract =     "Burmese is an isolated language, in which the syllable
                 is the smallest unit. Syllable segmentation methods
                 based on matching lead to performance subject to the
                 syllable segmentation effect. This article proposes a
                 word segmentation method with fusion conditions of
                 double syllable features. It combines word segmentation
                 and segmentation of syllables into one process, thus
                 reducing the impact of errors on the syllable
                 segmentation of Burmese. In the first layer of the
                 conditional random fields (CRF) model, Burmese
                 characters as atomic features are integrated into the
                 Burma section of the Barkis Speech Paradigm (Backus
                 normal form) features to realize the Burma syllable
                 sequence tags. In the second layer of the CRFs model,
                 with the syllable marked as input, it realizes the
                 sequence markers through building a feature template
                 with syllables as atomic features. The experimental
                 results show that the proposed method has a better
                 effect compared with the method based on the matching
                 of syllables.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 7, January 2019; DOI 10.1145/3234512.
@Article{Li:2019:IML,
  author =       "Junjie Li and Haoran Li and Xiaomian Kang and Haitong
                 Yang and Chengqing Zong",
  title =        "Incorporating Multi-Level User Preference into
                 Document-Level Sentiment Classification",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3234512",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3234512",
  abstract =     "Document-level sentiment classification aims to
                 predict a user's sentiment polarity in a document about
                 a product. Most existing methods only focus on review
                 contents and ignore users who post reviews. In fact,
                 when reviewing a product, different users have
                 different word-using habits to express opinions (i.e.,
                 word-level user preference), care about different
                 attributes of the product (i.e., aspect-level user
                 preference), and have different characteristics to
                 score the review (i.e., polarity-level user
                 preference). These preferences have great influence on
                 interpreting the sentiment of text. To address this
                 issue, we propose a model called Hierarchical User
                 Attention Network (HUAN), which incorporates
                 multi-level user preference into a hierarchical neural
                 network to perform document-level sentiment
                 classification. Specifically, HUAN encodes different
                 kinds of information (word, sentence, aspect, and
                 document) in a hierarchical structure and imports user
                 embedding and user attention mechanism to model these
                 preferences. Empirical results on two real-world
                 datasets show that HUAN achieves state-of-the-art
                 performance. Furthermore, HUAN can also mine important
                 attributes of products for different users.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

%%% TALLIP 18(1), article 8, January 2019; DOI 10.1145/3264620.
@Article{Jain:2019:UES,
  author =       "Amita Jain and Minni Jain and Goonjan Jain and
                 Devendra K. Tayal",
  title =        "{``UTTAM''}: an Efficient Spelling Correction System
                 for {Hindi} Language Based on Supervised Learning",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3264620",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3264620",
  abstract =     "In this article, we propose a system called ``UTTAM,''
                 for correcting spelling errors in Hindi language text
                 using supervised learning. Unlike other languages,
                 Hindi contains a large set of characters, words with
                 inflections and complex characters, phonetically
                 similar sets of characters, and so on. The complexity
                 increases the possibility of confusion and occasionally
                 leads to entering a wrong character in a word. The
                 existence of spelling errors in text significantly
                 decreases the accuracy of the available resources, like
                 search engine, text editor, and so on. The proposed
                 work is the first approach to correct non-word (Out of
                 Vocabulary) errors as well as real-word errors
                 simultaneously in a sentence of Hindi language. The
                 proposed method investigates the human behavior, i.e.,
                 the type and frequency of spelling errors done by
                 humans in Hindi text. Based on the type and frequency
                 of spelling errors, the heterogeneous data is collected
                 in matrices. This data in matrices is used to generate
                 the suitable candidate words for an input word. After
                 generating candidate words, the Viterbi algorithm is
                 applied to perform the word correction. The Viterbi
                 algorithm finds the best sequence of candidate words to
                 correct the input sentence. For Hindi, this work is the
                 first attempt for real-word error correction. For
                 non-word errors, the experiments show that ``UTTAM''
                 performs better than the existing systems SpellGuru and
                 Saksham.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Murthy:2019:INT,
  author =       "Rudra Murthy and Mitesh M. Khapra and Pushpak
                 Bhattacharyya",
  title =        "Improving {NER} Tagging Performance in Low-Resource
                 Languages via Multilingual Learning",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3238797",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3238797",
  abstract =     "Existing supervised solutions for Named Entity
                 Recognition (NER) typically rely on a large annotated
                 corpus. Collecting large amounts of NER annotated
                 corpus is time-consuming and requires considerable
                 human effort. However, collecting small amounts of
                 annotated corpus for any language is feasible, but the
                 performance degrades due to data sparsity. We address
                 the data sparsity by borrowing features from the data
                 of a closely related language. We use hierarchical
                 neural networks to train a supervised NER system. The
                 feature borrowing from a closely related language
                 happens via the shared layers of the network. The
                 neural network is trained on the combined dataset of
                 the low-resource language and a closely related
                 language, also termed Multilingual Learning. Unlike
                 existing systems, we share all layers of the network
                 between the two languages. We apply multilingual
                 learning for NER in Indian languages and empirically
                 show the benefits over a monolingual deep learning
                 system and a traditional machine-learning system with
                 some feature engineering. Using multilingual learning,
                 we show that the low-resource language NER performance
                 increases mainly due to (1) increased named entity
                 vocabulary, (2) cross-lingual subword features, and (3)
                 multilingual learning playing the role of
                 regularization.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Jarrar:2019:DBM,
  author =       "Mustafa Jarrar and Fadi Zaraket and Rami Asia and
                 Hamzeh Amayreh",
  title =        "Diacritic-Based Matching of {Arabic} Words",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242177",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3242177",
  abstract =     "Words in Arabic consist of letters and short vowel
                 symbols called diacritics inscribed atop regular
                 letters. Changing diacritics may change the syntax and
                 semantics of a word; turning it into another. This
                 results in difficulties when comparing words based
                 solely on string matching. Typically, Arabic NLP
                 applications resort to morphological analysis to battle
                 ambiguity originating from this and other challenges.
                 In this article, we introduce three alternative
                 algorithms to compare two words with possibly different
                 diacritics. We propose the Subsume knowledge-based
                 algorithm, the Imply rule-based algorithm, and the
                 Alike machine-learning-based algorithm. We evaluated
                 the soundness, completeness, and accuracy of the
                 algorithms against a large dataset of 86,886 word
                  pairs. Our evaluation shows the accuracy of Subsume
                  (100\%), Imply (99.32\%), and Alike (99.53\%).
                 Although accurate, Subsume was able to judge only 75\%
                 of the data. Both Subsume and Imply are sound, while
                 Alike is not. We demonstrate the utility of the
                 algorithms using a real-life use case --- in lemma
                 disambiguation and in linking hundreds of Arabic
                 dictionaries.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Bhattacharya:2019:SSW,
  author =       "Nilanjana Bhattacharya and Partha Pratim Roy and
                 Umapada Pal",
  title =        "Sub-Stroke-Wise Relative Feature for Online {Indic}
                 Handwriting Recognition",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3264735",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3264735",
  abstract =     "The main problem of Bangla (Bengali) and Devanagari
                 handwriting recognition is the shape similarity of
                 characters. There are only a few pieces of work on
                 writer-independent cursive online Indian text
                 recognition, and the shape similarity problem needs
                 more attention from the researchers. To handle the
                 shape similarity problem of cursive characters of
                 Bangla and Devanagari scripts, in this article, we
                  propose a new category of features called
                  ``sub-stroke-wise relative feature'' (SRF) which are
                 based on relative information of the constituent parts
                 of the handwritten strokes. Relative information among
                 some of the parts within a character can be a
                 distinctive feature as it scales up small
                 dissimilarities and enhances discrimination among
                 similar-looking shapes. Also, contextual anticipatory
                 phenomena are automatically modeled by this type of
                 feature, as it takes into account the influence of
                 previous and forthcoming strokes. We have tested
                 popular state-of-the-art feature sets as well as
                 proposed SRF using various (up to 20,000-word) lexicons
                 and noticed that SRF significantly outperforms the
                 state-of-the-art feature sets for online Bangla and
                 Devanagari cursive word recognition.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Mrinalini:2019:PBP,
  author =       "K. Mrinalini and T. Nagarajan and P. Vijayalakshmi",
  title =        "Pause-Based Phrase Extraction and Effective {OOV}
                 Handling for Low-Resource Machine Translation Systems",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3265751",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3265751",
  abstract =     "Machine translation is the core problem for several
                  natural language processing research efforts across
                  the globe.
                 However, building a translation system involving
                 low-resource languages remains a challenge with respect
                 to statistical machine translation (SMT). This work
                 proposes and studies the effect of a phrase-induced
                 hybrid machine translation system for translation from
                 English to Tamil, under a low-resource setting. Unlike
                 conventional hybrid MT systems, the free-word ordering
                 feature of the target language Tamil is exploited to
                 form a re-ordered target language model and to extend
                 the parallel text corpus for training the SMT. In the
                 current work, a novel rule-based phrase-extraction
                 method, implemented using parts-of-speech (POS) and
                 place-of-pause in both languages is proposed, which is
                 used to pre-process the training corpus for developing
                 the back-off phrase-induced SMT. Further,
                 out-of-vocabulary (OOV) words are handled using
                 speech-based transliteration and two-level thesaurus
                 intersection techniques based on the POS tag of the OOV
                 word. To ensure that the input with OOV words does not
                 skip phrase-level translation in the hierarchical
                 model, a phrase-level example-based machine translation
                 approach is adopted to find the closest matching phrase
                 and perform translation followed by OOV replacement.
                 The proposed system results in a bilingual evaluation
                 understudy score of 84.78 and a translation edit rate
                 of 19.12. The performance of the system is compared in
                 terms of adequacy and fluency, with existing
                 translation systems for this specific language pair,
                 and it is observed that the proposed system outperforms
                 its counterparts.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Le:2019:LRM,
  author =       "Ngoc Tan Le and Fatiha Sadat and Lucie Menard and Dien
                 Dinh",
  title =        "Low-Resource Machine Transliteration Using Recurrent
                 Neural Networks",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3265752",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3265752",
  abstract =     "Grapheme-to-phoneme models are key components in
                 automatic speech recognition and text-to-speech
                 systems. With low-resource language pairs that do not
                 have available and well-developed pronunciation
                 lexicons, grapheme-to-phoneme models are particularly
                 useful. These models are based on initial alignments
                 between grapheme source and phoneme target sequences.
                 Inspired by sequence-to-sequence recurrent neural
                 network--based translation methods, the current
                 research presents an approach that applies an alignment
                 representation for input sequences and pretrained
                 source and target embeddings to overcome the
                  transliteration problem for a low-resource language
                 pair. Evaluation and experiments involving French and
                 Vietnamese showed that with only a small bilingual
                 pronunciation dictionary available for training the
                 transliteration models, promising results were obtained
                 with a large increase in BLEU scores and a reduction in
                 Translation Error Rate (TER) and Phoneme Error Rate
                 (PER). Moreover, we compared our proposed neural
                 network--based transliteration approach with a
                 statistical one.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Na:2019:TBK,
  author =       "Seung-hoon Na and Jianri Li and Jong-hoon Shin and
                 Kangil Kim",
  title =        "Transition-Based {Korean} Dependency Parsing Using
                 Hybrid Word Representations of Syllables and Morphemes
                 with {LSTMs}",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "14:1--14:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3241745",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3241745",
  abstract =     "Recently, neural approaches for transition-based
                  dependency parsing have become one of the
                  state-of-the-art methods for performing dependency
                  parsing tasks in
                 many languages. In neural transition-based parsing, a
                 parser state representation is first computed from the
                 configuration of a stack and a buffer, which is then
                 fed into a feed-forward neural network model that
                 predicts the next transition action. Given that words
                 are basic elements of a stack and buffer, a parser
                 state representation is considerably affected by how a
                 word representation is defined. In particular, word
                 representation issues become more critical in
                 morphologically rich languages such as Korean, as the
                  set of potential words is not bounded but introduces the
                 second-order vocabulary complexity, called the phrase
                 vocabulary complexity due to the agglutinative
                 characteristics of the language. In this article, we
                 propose a hybrid word representation that combines two
                 compositional word representations, each of which is
                 derived from representations of syllables and
                 morphemes, respectively. Our underlying assumption for
                 this hybrid word representation is that, because both
                 syllables and morphemes are two common ways of
                 decomposing Korean words, it is expected that their
                 effects in inducing word representation are
                 complementary to one another. Experimental results
                 carried on Sejong and SPMRL 2014 datasets show that our
                 proposed hybrid word representation leads to the
                 state-of-the-art performance.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Akhtar:2019:IWE,
  author =       "Md Shad Akhtar and Palaash Sawant and Sukanta Sen and
                 Asif Ekbal and Pushpak Bhattacharyya",
  title =        "Improving Word Embedding Coverage in Less-Resourced
                 Languages Through Multi-Linguality and
                 Cross-Linguality: a Case Study with Aspect-Based
                 Sentiment Analysis",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "15:1--15:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3273931",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3273931",
  abstract =     "In the era of deep learning-based systems, efficient
                 input representation is one of the primary requisites
                 in solving various problems related to Natural Language
                 Processing (NLP), data mining, text mining, and the
                 like. Absence of adequate representation for an input
                 introduces the problem of data sparsity, and it poses a
                 great challenge to solve the underlying problem. The
                 problem is more intensified with resource-poor
                 languages due to the absence of a sufficiently large
                 corpus required to train a word embedding model. In
                 this work, we propose an effective method to improve
                 the word embedding coverage in less-resourced languages
                 by leveraging bilingual word embeddings learned from
                 different corpora. We train and evaluate deep Long
                 Short Term Memory (LSTM)-based architecture and show
                 the effectiveness of the proposed approach for two
                 aspect-level sentiment analysis tasks (i.e., aspect
                 term extraction and sentiment classification). The
                 neural network architecture is further assisted by
                 hand-crafted features for prediction. We apply the
                 proposed model in two experimental setups:
                 multi-lingual and cross-lingual. Experimental results
                 show the effectiveness of the proposed approach against
                 the state-of-the-art methods.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Nakamura:2019:WBR,
  author =       "Tatsuya Nakamura and Masumi Shirakawa and Takahiro
                 Hara and Shojiro Nishio",
  title =        "{Wikipedia}-Based Relatedness Measurements for
                 Multilingual Short Text Clustering",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "16:1--16:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3276473",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3276473",
  abstract =     "Throughout the world, people can post information
                 about their local area in their own languages using
                 social networking services. Multilingual short text
                 clustering is an important task to organize such
                 information, and it can be applied to various
                 applications, such as event detection and
                 summarization. However, measuring the relatedness
                 between short texts written in various languages is a
                 challenging problem. In addition to handling multiple
                 languages, the semantic gaps among all languages must
                 be considered. In this article, we propose two
                 Wikipedia-based semantic relatedness measurement
                 methods for multilingual short text clustering. The
                 proposed methods solve the semantic gap problem by
                 incorporating the inter-language links of Wikipedia
                 into Extended Naive Bayes (ENB), a probabilistic method
                 that can be applied to measure semantic relatedness
                 among monolingual short texts. The proposed methods
                 represent a multilingual short text as a vector of the
                 English version of Wikipedia articles (entities). By
                 transferring texts to a unified vector space, the
                 relatedness between texts in different languages with
                 similar meanings can be increased. We also propose an
                 approach that can improve clustering performance and
                 reduce the processing time by eliminating
                 language-specific entities in the unified vector space.
                 Experimental results on multilingual Twitter message
                 clustering revealed that the proposed methods
                 outperformed cross-lingual explicit semantic analysis,
                 a previously proposed method to measure relatedness
                 between texts in different languages. Moreover, the
                 proposed methods were comparable to ENB applied to
                 texts translated into English using a proprietary
                 translation service. The proposed methods enabled
                 relatedness measurements for multilingual short text
                 clustering without requiring machine translation
                 processes.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ding:2019:NFF,
  author =       "Chenchen Ding and Masao Utiyama and Eiichiro Sumita",
  title =        "{NOVA}: a Feasible and Flexible Annotation System for
                 Joint Tokenization and Part-of-Speech Tagging",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "17:1--17:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3276773",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3276773",
  abstract =     "A feasible and flexible annotation system is designed
                 for joint tokenization and part-of-speech (POS) tagging
                 to annotate those languages without natural definitions
                 of words. This design was motivated by the fact that
                 word separators are not used in many highly analytic
                 East and Southeast Asian languages. Although several of
                 the languages are well-studied, e.g., Chinese and
                 Japanese, many are understudied with low resources,
                 e.g., Burmese (Myanmar) and Khmer. In the first part of
                 the article, the proposed annotation system, named
                 nova, is introduced. nova contains only four basic tags
                 (n, v, a, and o); these tags can be further modified
                  and combined to adapt to complex linguistic phenomena in
                 tokenization and POS tagging. In the second part of the
                  article, the feasibility and flexibility of nova are
                 illustrated from the annotation practice on Burmese and
                 Khmer. The relation between nova and two universal POS
                 tagsets is discussed in the final part of the
                 article.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ahmadi:2019:RBK,
  author =       "Sina Ahmadi",
  title =        "A Rule-Based {Kurdish} Text Transliteration System",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "18:1--18:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3278623",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3278623",
  abstract =     "In this article, we present a rule-based approach for
                 transliterating two of the most used orthographies in
                 Sorani Kurdish. Our work consists of detecting a
                 character in a word by removing the possible
                 ambiguities and mapping it into the target orthography.
                 We describe different challenges in Kurdish text mining
                 and propose novel ideas concerning the transliteration
                 task for Sorani Kurdish. Our transliteration system,
                 named Wergor, achieves 82.79\% overall precision and
                 more than 99\% in detecting the double-usage
                 characters. We also present a manually transliterated
                 corpus for Kurdish.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kamila:2019:THL,
  author =       "Sabyasachi Kamila and Mohammad Hasanuzzaman and Asif
                 Ekbal and Pushpak Bhattacharyya",
  title =        "{Tempo-HindiWordNet}: a Lexical Knowledge-base for
                 Temporal Information Processing",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "2",
  pages =        "19:1--19:??",
  month =        feb,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3277504",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3277504",
  abstract =     "Temporality has significantly contributed to various
                 Natural Language Processing and Information Retrieval
                 applications. In this article, we first create a
                 lexical knowledge-base in Hindi by identifying the
                 temporal orientation of word senses based on their
                 definition and then use this resource to detect
                 underlying temporal orientation of the sentences. To
                 create the resource, we propose a semi-supervised
                 learning framework, where each synset of the Hindi
                 WordNet is classified into one of the five categories,
                 namely, past, present, future, neutral, and atemporal.
                 The algorithm initiates learning with a set of seed
                 synsets and then iterates following different expansion
                 strategies, viz. probabilistic expansion based on
                 classifier's confidence and semantic distance based
                 measures. We manifest the usefulness of the resource
                 that we build on an external task, viz. sentence-level
                 temporal classification. The underlying idea is that a
                 temporal knowledge-base can help in classifying the
                 sentences according to their inherent temporal
                 properties. Experiments on two different domains, viz.
                 general and Twitter, show interesting results.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Alnawas:2019:SAI,
  author =       "Anwar Alnawas and Nursal Arici",
  title =        "Sentiment Analysis of {Iraqi Arabic} Dialect on
                 {Facebook} Based on Distributed Representations of
                 Documents",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "3",
  pages =        "20:1--20:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3278605",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3278605",
  abstract =     "Nowadays, social media is used by many people to
                 express their opinions about a variety of topics.
                 Opinion Mining or Sentiment Analysis techniques extract
                 opinions from user generated contents. Over the years,
                 a multitude of Sentiment Analysis studies has been done
                 about the English language with deficiencies of
                 research in all other languages. Unfortunately, Arabic
                 is one of the languages that seems to lack substantial
                 research, despite the rapid growth of its use on social
                 media outlets. Furthermore, specific Arabic dialects
                 should be studied, not just Modern Standard Arabic. In
                  this paper, we experiment with sentiment analysis of
                  Iraqi Arabic dialect using word embedding. First, we
                  made a large corpus from previous works to learn word
                 representations. Second, we generated word embedding
                 model by training corpus using Doc2Vec representations
                 based on Paragraph and Distributed Memory Model of
                 Paragraph Vectors (DM-PV) architecture. Lastly, the
                  represented feature was used for training four binary
                 classifiers (Logistic Regression, Decision Tree,
                 Support Vector Machine and Naive Bayes) to detect
                 sentiment. We also experimented different values of
                 parameters (window size, dimension and negative
                 samples). In the light of the experiments, it can be
                 concluded that our approach achieves a better
                 performance for Logistic Regression and Support Vector
                 Machine than the other classifiers.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Singh:2019:OHG,
  author =       {Sukhdeep Singh and Anuj Sharma},
  title =        {Online Handwritten {Gurmukhi} Words Recognition: an
                 Inclusive Study},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {21:1--21:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3282441},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3282441},
  abstract =     {Identification of offline and online handwritten words
                 is a challenging and complex task. In comparison to
                 Latin and Oriental scripts, the research and study of
                 handwriting recognition at word level in Indic scripts
                 is at its initial phases. The two main methods of
                 handwriting recognition are global and analytical. The
                 present work introduces a novel analytical approach for
                 online handwritten Gurmukhi word recognition based on a
                 minimal set of words and recognizes an input Gurmukhi
                 word as a sequence of characters. We employed a
                 sequential step-by-step approach to recognize online
                 handwritten Gurmukhi words. Considering the massive
                 variability in online Gurmukhi handwriting, the present
                 work employs the completely linked non-homogeneous
                 hidden Markov model. In the present study, we
                 considered the dependent, major-dependent, and
                 super-dependent nature of strokes to form Gurmukhi
                 characters in words. On test sets of online handwritten
                 Gurmukhi datasets, the word-level accuracy rates are
                 85.98\%, 84.80\%, 82.40\%, and 82.20\% in four
                 different modes. Besides the online Gurmukhi word
                 recognition, the present work also provides Gurmukhi
                 handwriting analysis study for varying writing styles
                 and proposes novel techniques for zone detection and
                 rearrangement of strokes. Our proposed algorithms have
                 been successfully employed to online handwritten
                 Gurmukhi word recognition in dependent and independent
                 modes of handwriting.},
  acknowledgement = ack-nhfb,
  articleno =    {21},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Yucesoy:2019:COW,
  author =       {Veysel Y{\"u}cesoy and Aykut Ko{\c{c}}},
  title =        {Co-occurrence Weight Selection in Generation of Word
                 Embeddings for Low Resource Languages},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {22:1--22:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3282443},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3282443},
  abstract =     {This study aims to increase the performance of word
                 embeddings by proposing a new weighting scheme for
                 co-occurrence counting. The idea behind this new family
                 of weights is to overcome the disadvantage of distant
                 appearing word pairs, which are indeed semantically
                 close, while representing them in the co-occurrence
                 counting. For high-resource languages, this
                 disadvantage might not be effective due to the high
                 frequency of co-occurrence. However, when there are not
                 enough available resources, such pairs suffer from
                 being distant. To favour such pairs, a weighting scheme
                 based on a polynomial fitting procedure is proposed to
                 shift the weights up for distant words while the
                 weights of nearby words are left almost unchanged. The
                 parameter optimization for new weights and the effects
                 of the weighting scheme are analysed for the English,
                 Italian, and Turkish languages. A small portion of
                 English resources and a quarter of Italian resources
                 are utilized for demonstration purposes, as if these
                 languages are low-resource languages. Performance
                 increase is observed in analogy tests when the proposed
                 weighting scheme is applied to relatively small corpora
                 (i.e., mimicking low-resource languages) of both
                 English and Italian. To show the effectiveness of the
                 proposed scheme in small corpora, it is also shown for
                 a large English corpus that the performance of the
                 proposed weighting scheme cannot outperform the
                 original weights. Since Turkish is relatively a
                 low-resource language, it is demonstrated that the
                 proposed weighting scheme can increase the performance
                 of both analogy and similarity tests when all Turkish
                 Wikipedia pages are utilized as a corpus. The positive
                 effect of the proposed scheme has also been
                 demonstrated in a standard sentiment analysis task for
                 the Turkish language.},
  acknowledgement = ack-nhfb,
  articleno =    {22},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Bounhas:2019:UCA,
  author =       "Ibrahim Bounhas",
  title =        "On the Usage of a Classical {Arabic} Corpus as a
                 Language Resource: Related Research and Key
                 Challenges",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "3",
  pages =        "23:1--23:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3277591",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3277591",
  abstract =     "This article presents a literature review of
                 computer-science-related research applied on hadith, a
                 kind of Arabic narration which appeared in the 7th
                 century. We study and compare existent works in several
                 fields of Natural Language Processing (NLP),
                 Information Retrieval (IR), and Knowledge Extraction
                 (KE). Thus, we elicit their main drawbacks and
                 identify some perspectives, which may be considered by
                 the research community. We also study the
                 characteristics of these types of documents, by
                 enumerating the advantages/limits of using hadith as a
                 language resource. Moreover, our study shows that
                 previous studies used different collections of hadiths,
                 thus making it hard to compare their results
                 objectively. Besides, many preprocessing steps are
                 recurrent through these applications, thus wasting a
                 lot of time. Consequently, the key issues for building
                 generic language resources from hadiths are discussed,
                 taking into account the relevance of related literature
                 and the wide community of researchers that are
                 interested in these narrations. The ultimate goal is to
                 structure hadith books for multiple usages, thus
                 building common collections which may be exploited in
                 future applications.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Jung:2019:MPN,
  author =       {Sangkeun Jung and Cheon-Eum Park and Changki Lee},
  title =        {Multitask Pointer Network for {Korean} Dependency
                 Parsing},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {24:1--24:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3282442},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3282442},
  abstract =     {Dependency parsing is a fundamental problem in natural
                 language processing. We introduce a novel
                 dependency-parsing framework called
                 head-pointing--based dependency parsing. In this
                 framework, we cast the Korean dependency parsing
                 problem as a statistical head-pointing and arc-labeling
                 problem. To address this problem, a novel neural
                 network called the multitask pointer network is devised
                 for a neural sequential head-pointing and type-labeling
                 architecture. Our approach does not require any
                 handcrafted features or language-specific rules to
                 parse dependency. Furthermore, it achieves
                 state-of-the-art performance for Korean dependency
                 parsing.},
  acknowledgement = ack-nhfb,
  articleno =    {24},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Bolucu:2019:UJP,
  author =       {Necva B{\"o}l{\"u}c{\"u} and Burcu Can},
  title =        {Unsupervised Joint {PoS} Tagging and Stemming for
                 Agglutinative Languages},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {25:1--25:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3292398},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3292398},
  abstract =     {The number of possible word forms is theoretically
                 infinite in agglutinative languages. This brings up the
                 out-of-vocabulary (OOV) issue for part-of-speech (PoS)
                 tagging in agglutinative languages. Since inflectional
                 morphology does not change the PoS tag of a word, we
                 propose to learn stems along with PoS tags
                 simultaneously. Therefore, we aim to overcome the
                 sparsity problem by reducing word forms into their
                 stems. We adopt a Bayesian model that is fully
                 unsupervised. We build a Hidden Markov Model for PoS
                 tagging where the stems are emitted through hidden
                 states. Several versions of the model are introduced in
                 order to observe the effects of different dependencies
                 throughout the corpus, such as the dependency between
                 stems and PoS tags or between PoS tags and affixes.
                 Additionally, we use neural word embeddings to estimate
                 the semantic similarity between the word form and stem.
                 We use the semantic similarity as prior information to
                 discover the actual stem of a word since inflection
                 does not change the meaning of a word. We compare our
                 models with other unsupervised stemming and PoS tagging
                 models on Turkish, Hungarian, Finnish, Basque, and
                 English. The results show that a joint model for PoS
                 tagging and stemming improves on an independent PoS
                 tagger and stemmer in agglutinative languages.},
  acknowledgement = ack-nhfb,
  articleno =    {25},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Kang:2019:SDR,
  author =       {Xiaomian Kang and Chengqing Zong and Nianwen Xue},
  title =        {A Survey of Discourse Representations for {Chinese}
                 Discourse Annotation},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {26:1--26:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3293442},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3293442},
  abstract =     {A key element in computational discourse analysis is
                 the design of a formal representation for the discourse
                 structure of a text. With machine learning being the
                 dominant method, it is important to identify a
                 discourse representation that can be used to perform
                 large-scale annotation. This survey provides a
                 systematic analysis of existing discourse
                 representation theories to evaluate whether they are
                 suitable for annotation of Chinese text. Specifically,
                 the two properties, expressiveness and practicality,
                 are introduced to compare the representations of
                 theories based on rhetorical relations and the
                 representations of theories based on entity relations.
                 The comparison systematically reveals linguistic and
                 computational characteristics of the theories. After
                 that, we conclude that none of the existing theories
                 are quite suitable for scalable Chinese discourse
                 annotation because they are not both expressive and
                 practical. Therefore, a new discourse representation
                 needs to be proposed, which should balance the
                 expressiveness and practicality, and cover rhetorical
                 relations and entity relations. Inspired by the
                 conclusions, this survey discusses some preliminary
                 proposals on how to represent the discourse structure
                 that are worth pursuing.},
  acknowledgement = ack-nhfb,
  articleno =    {26},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Badaro:2019:SOM,
  author =       {Gilbert Badaro and Ramy Baly and Hazem Hajj and Wassim
                 El-Hajj and Khaled Bashir Shaban and Nizar Habash and
                 Ahmad Al-Sallab and Ali Hamdi},
  title =        {A Survey of Opinion Mining in {Arabic}: a
                 Comprehensive System Perspective Covering Challenges
                 and Advances in Tools, Resources, Models, Applications,
                 and Visualizations},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {27:1--27:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3295662},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3295662},
  abstract =     {Opinion-mining or sentiment analysis continues to gain
                 interest in industry and academics. While there has
                 been significant progress in developing models for
                 sentiment analysis, the field remains an active area of
                 research for many languages across the world, and in
                 particular for the Arabic language, which is the fifth
                 most-spoken language and has become the fourth
                 most-used language on the Internet. With the flurry of
                 research activity in Arabic opinion mining, several
                 researchers have provided surveys to capture advances
                 in the field. While these surveys capture a wealth of
                 important progress in the field, the fast pace of
                 advances in machine learning and natural language
                 processing (NLP) necessitates a continuous need for a
                 more up-to-date literature survey. The aim of this
                 article is to provide a comprehensive literature survey
                 for state-of-the-art advances in Arabic opinion mining.
                 The survey goes beyond surveying previous works that
                 were primarily focused on classification models.
                 Instead, this article provides a comprehensive system
                 perspective by covering advances in different aspects
                 of an opinion-mining system, including advances in NLP
                 software tools, lexical sentiment and corpora
                 resources, classification models, and applications of
                 opinion mining. It also presents future directions for
                 opinion mining in Arabic. The survey also covers latest
                 advances in the field, including deep learning advances
                 in Arabic Opinion Mining. The article provides
                 state-of-the-art information to help new or established
                 researchers in the field as well as industry developers
                 who aim to deploy an operational complete
                 opinion-mining system. Key insights are captured at the
                 end of each section for particular aspects of the
                 opinion-mining system giving the reader a choice of
                 focusing on particular aspects of interest.},
  acknowledgement = ack-nhfb,
  articleno =    {27},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Masmoudi:2019:ADR,
  author =       "Abir Masmoudi and Salima Mdhaffar and Rahma Sellami
                 and Lamia Hadrich Belguith",
  title =        "Automatic Diacritics Restoration for {Tunisian}
                 Dialect",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "3",
  pages =        "28:1--28:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3297278",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3297278",
  abstract =     "Modern Standard Arabic, as well as Arabic dialect
                 languages, are usually written without diacritics. The
                 absence of these marks constitute a real problem in the
                 automatic processing of these data by NLP tools.
                 Indeed, writing Arabic without diacritics introduces
                 several types of ambiguity. First, a word without
                 diacritics could have many possible meanings depending
                 on their diacritization. Second, undiacritized surface
                 forms of an Arabic word might have as many as 200
                 readings depending on the complexity of its morphology
                 [12]. In fact, the agglutination property of Arabic
                 might produce a problem that can only be resolved using
                 diacritics. Third, without diacritics a word could have
                 many possible parts of speech (POS) instead of one.
                 This is the case with the words that have the same
                 spelling and POS tag but a different lexical sense, or
                 words that have the same spelling but different POS
                 tags and lexical senses [8]. Finally, there is
                 ambiguity at the grammatical level (syntactic
                 ambiguity). In this article, we propose the first work
                 that investigates the automatic diacritization of
                 Tunisian Dialect texts. We first describe our
                 annotation guidelines and procedure. Then, we propose
                 two major models, namely a statistical machine
                 translation (SMT) and a discriminative model as a
                 sequence classification task based on Conditional
                 Random Fields (CRF). In the second approach, we
                 integrate POS features to influence the generation of
                 diacritics. Diacritics restoration was performed at
                 both the word and the character levels. The results
                 showed high scores of automatic diacritization based on
                 the CRF system (Word Error Rate (WER) 21.44\% for CRF
                 and WER 34.6\% for SMT).",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Rudra:2019:IAD,
  author =       {Koustav Rudra and Ashish Sharma and Kalika Bali and
                 Monojit Choudhury and Niloy Ganguly},
  title =        {Identifying and Analyzing Different Aspects of
                 {English--Hindi} Code-Switching in {Twitter}},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {29:1--29:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314935},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314935},
  abstract =     {Code-switching or the juxtaposition of linguistic
                 units from two or more languages in a single utterance,
                 has, in recent times, become very common in text,
                 thanks to social media and other computer mediated
                 forms of communication. In this exploratory study of
                 English-Hindi code-switching on Twitter, we
                 automatically create a large corpus of code-switched
                 tweets and devise techniques to identify the
                 relationship between successive components in a
                 code-switched tweet. More specifically, we identify
                 pragmatic functions such as narrative-evaluative,
                 negative reinforcement, translation or semantically
                 equivalent statements, and so on characterizing the
                 relation between successive components. We analyze the
                 difference/similarity between switching patterns in
                 code-switched and monolingual multi-component tweets.
                 We observe strong dominance of narrative-evaluative
                 (non-opinion to opinion or vice versa) switching in
                 case of both code-switched and monolingual
                 multi-component tweets in around 40\% of cases.
                 Polarity switching appears to be a prevalent switching
                 phenomenon (10\%) specifically in code-switched tweets
                 (three to four times higher than monolingual
                 multi-component tweets) where preference of expressing
                 negative sentiment in Hindi is approximately twice
                 compared to English. Positive reinforcement appears to
                 be an important pragmatic function for English
                 multi-component tweets, whereas negative reinforcement
                 plays a key role for Devanagari multi-component tweets.
                 Our results also indicate that the extent and nature of
                 code-switching also strongly depend on the topic
                 (sports, politics, etc.) of discussion.},
  acknowledgement = ack-nhfb,
  articleno =    {29},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Verma:2019:CAH,
  author =       {Pradeepika Verma and Sukomal Pal and Hari Om},
  title =        {A Comparative Analysis on {Hindi} and {English}
                 Extractive Text Summarization},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {30:1--30:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3308754},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/python.bib;
                 http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3308754},
  abstract =     {Text summarization is the process of transfiguring a
                 large documental information into a clear and concise
                 form. In this article, we present a detailed
                 comparative study of various extractive methods for
                 automatic text summarization on Hindi and English text
                 datasets of news articles. We consider 13 different
                 summarization techniques, namely, TextRank, LexRank,
                 Luhn, LSA, Edmundson, ChunkRank, TGraph, UniRank,
                 NN-ED, NN-SE, FE-SE, SummaRuNNer, and MMR-SE, and we
                 evaluate their performance using various performance
                 metrics, such as precision, recall, F$_1$, cohesion,
                 non-redundancy, readability, and significance. A
                 thorough analysis is done in eight different parts that
                 exhibits the strengths and limitations of these
                 methods, effect of performance over the summary length,
                 impact of language of a document, and other factors as
                 well. A standard summary evaluation tool (ROUGE) and
                 extensive programmatic evaluation using Python 3.5 in
                 Anaconda environment are used to evaluate their
                 outcome.},
  acknowledgement = ack-nhfb,
  articleno =    {30},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Wei:2019:ROD,
  author =       {Bingzhen Wei and Xuancheng Ren and Yi Zhang and
                 Xiaoyan Cai and Qi Su and Xu Sun},
  title =        {Regularizing Output Distribution of Abstractive
                 {Chinese} Social Media Text Summarization for Improved
                 Semantic Consistency},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {31:1--31:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314934},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314934},
  abstract =     {Abstractive text summarization is a highly difficult
                 problem, and the sequence-to-sequence model has shown
                 success in improving the performance on the task.
                 However, the generated summaries are often inconsistent
                 with the source content in semantics. In such cases,
                 when generating summaries, the model selects
                 semantically unrelated words with respect to the source
                 content as the most probable output. The problem can be
                 attributed to heuristically constructed training data,
                 where summaries can be unrelated to the source content,
                 thus containing semantically unrelated words and
                 spurious word correspondence. In this article, we
                 propose a regularization approach for the
                 sequence-to-sequence model and make use of what the
                 model has learned to regularize the learning objective
                 to alleviate the effect of the problem. In addition, we
                 propose a practical human evaluation method to address
                 the problem that the existing automatic evaluation
                 method does not evaluate the semantic consistency with
                 the source content properly. Experimental results
                 demonstrate the effectiveness of the proposed approach,
                 which outperforms almost all the existing models.
                 Especially, the proposed approach improves the semantic
                 consistency by 4\% in terms of human evaluation.},
  acknowledgement = ack-nhfb,
  articleno =    {31},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Trieu:2019:LAR,
  author =       "Hai-Long Trieu and Duc-Vu Tran and Ashwin Ittoo and
                 Le-Minh Nguyen",
  title =        "Leveraging Additional Resources for Improving
                 Statistical Machine Translation on {Asian} Low-Resource
                 Languages",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "3",
  pages =        "32:1--32:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3314936",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:32 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3314936",
  abstract =     "Phrase-based machine translation (MT) systems require
                 large bilingual corpora for training. Nevertheless,
                 such large bilingual corpora are unavailable for most
                 language pairs in the world, causing a bottleneck for
                 the development of MT. For the Asian language
                 pairs---Japanese, Indonesian, Malay paired with
                 Vietnamese---they are also not excluded from the case, in
                 which there are no large bilingual corpora on these
                 low-resource language pairs. Furthermore, although the
                 languages are widely used in the world, there is no
                 prior work on MT, which causes an issue for the
                 development of MT on these languages. In this article,
                 we conducted an empirical study of leveraging
                 additional resources to improve MT for the Asian
                 low-resource language pairs: translation from Japanese,
                 Indonesian, and Malay to Vietnamese. We propose an
                 innovative approach that lies in two strategies of
                 building bilingual corpora from comparable data and
                 phrase pivot translation on existing bilingual corpora
                 of the languages paired with English. Bilingual corpora
                 were built from Wikipedia bilingual titles to enhance
                 bilingual data for the low-resource languages.
                 Additionally, we introduced a combined model of the
                 additional resources to create an effective solution to
                 improve MT on the Asian low-resource languages.
                 Experimental results show the effectiveness of our
                 systems with the improvement of +2 to +7 BLEU points.
                 This work contributes to the development of MT on
                 low-resource languages, especially opening a promising
                 direction for the progress of MT on the Asian language
                 pairs.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Dehghan:2019:CDS,
  author =       {Mohammad Hossein Dehghan and Heshaam Faili},
  title =        {Converting Dependency Structure Into {Persian} Phrase
                 Structure},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {3},
  pages =        {33:1--33:??},
  month =        jul,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314937},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:32 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314937},
  abstract =     {Treebank is one of the important and useful resources
                 in natural language processing represented in two
                 different annotated schemas: phrase and dependency
                 structures. There are many works that convert a phrase
                 structure into a dependency structure and vice versa.
                 Most of them are based that exploit the handcrafted
                 head percolation table and argument table in predefined
                 deterministic ways. In this article, we propose a
                 method to convert a dependency structure into a phrase
                 structure by enriching a trainable model of former
                 hybrid strategy approach. By adding a classifier to the
                 algorithm and using postprocessing modification, the
                 quality of conversion is increased. We evaluate our
                 method in two different languages, English and Persian,
                 and then analyze the errors. The results of our
                 experiments show a 46.01\% reduction of error rate in
                 English and 76.50\% for Persian compared to our
                 baseline. We build a new phrase structure treebank by
                 converting 10,000 sentences of Persian dependency
                 treebank into corresponding phrase structures and
                 correcting them manually.},
  acknowledgement = ack-nhfb,
  articleno =    {33},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Awais:2019:RDI,
  author =       {Muhammad Awais and Muhammad Shoaib},
  title =        {Role of Discourse Information in {Urdu} Sentiment
                 Classification: a Rule-based Method and
                 Machine-learning Technique},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {34:1--34:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3300050},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3300050},
  abstract =     {In computational linguistics, sentiment analysis
                 refers to the classification of opinions in a positive
                 class or a negative class. There exist a lot of
                 different methods for sentiment analysis of the English
                 language, but the literature lacks the availability of
                 methods and techniques for Urdu, which is the largely
                 spoken language in the South Asian sub-continent and
                 the national language of Pakistan. The currently
                 available techniques, such as adjective count method
                 known as Bag of Words (BoW), is not sufficient for
                 classification of complex sentiment written in the Urdu
                 language. Also, the performance of available
                 machine-learning techniques (with legacy features), for
                 classification of Urdu sentiments, are not comparable
                 with the achieved accuracy of other languages. In the
                 case of the English language, the discourse information
                 (sub-sentence-level information) boosts the performance
                 of both the BoW method and machine-learning techniques,
                 but there are very few works available that have tested
                 the context-level information for the sentiment
                 analysis of the Urdu language. This research aims to
                 extract the discourse information from the Urdu
                 sentiments and utilise the discourse information to
                 improve the performance and reduce the error rate of
                 existing techniques for Urdu Sentiment classification.
                 The proposed solution extracts the discourse
                 information, suggests a new set of features for
                 machine-learning techniques, and introduces a set of
                 rules to extend the capabilities of the BoW model. The
                 results show that the task has been enhanced
                 significantly and the performance metrics such as
                 recall, precision, and accuracy are increased by
                 31.25\%, 8.46\%, and 21.6\%, respectively. In future,
                 the proposed technique can be extended to sentiments
                 with more than two sub-opinions, such as for blogs,
                 reviews, and TV talk shows.},
  acknowledgement = ack-nhfb,
  articleno =    {34},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Nongmeikapam:2019:HMM,
  author =       "Kishorjit Nongmeikapam and Kanan Wahengbam and Oinam
                 Nickson Meetei and Themrichon Tuithung",
  title =        "Handwritten {Manipuri Meetei--Mayek} Classification
                 Using Convolutional Neural Network",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "4",
  pages =        "35:1--35:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309497",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309497",
  abstract =     "A new technique for classifying all 56 different
                 characters of the Manipuri Meetei-Mayek (MMM) is
                 proposed herein. The characters are grouped under five
                 categories, which are Eeyek Eepee (original alphabets),
                 Lom Eeyek (additional letters), Cheising Eeyek
                 (digits), Lonsum Eeyek (letters with short endings),
                 and Cheitap Eeyek (vowel signs). Two related works
                 proposed by previous researchers are studied for
                 understanding the benefits claimed by the proposed deep
                 learning approach in handwritten Manipuri Meetei-Mayek.
                 (1) Histogram of Oriented Gradients (HOG) with SVM classifier is
                 implemented for thoroughly understanding how HOG
                 features can influence accuracy. (2) The handwritten
                 samples are trained using simple Convolutional Neural
                 Network (CNN) and compared with the proposed CNN-based
                 architecture. Significant progress has been made in the
                 field of Optical Character Recognition (OCR) for
                 well-known Indian languages as well as globally popular
                 languages. Our work is novel in the sense that there is
                 no record of work available to date that is able to
                 classify all 56 classes of the MMM. It will also serve
                 as a pre-cursor for developing end-to-end OCR software
                 for translating old manuscripts, newspaper archives,
                 books, and so on.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Gao:2019:SBC,
  author =       {Shengxiang Gao and Jihao Huang and Mingya Xue and
                 Zhengtao Yu and Zhuo Wang and Yang Zhang},
  title =        {Syntax-Based {Chinese--Vietnamese} Tree-to-Tree
                 Statistical Machine Translation with Bilingual
                 Features},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {36:1--36:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314938},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314938},
  abstract =     {Because of the scarcity of bilingual corpora, current
                 Chinese--Vietnamese machine translation is far from
                 satisfactory. Considering the differences between
                 Chinese and Vietnamese, we investigate whether
                 linguistic differences can be used to supervise machine
                 translation and propose a method of syntax-based
                 Chinese--Vietnamese tree-to-tree statistical machine
                 translation with bilingual features. Analyzing the
                 syntax differences between Chinese and Vietnamese, we
                 define some linguistic difference-based rules, such as
                 attributive position, time adverbial position, and
                 locative adverbial position, and create rewards for
                 similar rules. These rewards are integrated into the
                 extraction of tree-to-tree translation rules, and we
                 optimize the pruning of the search space during the
                 decoding phase. The experiments on Chinese--Vietnamese
                 bilingual sentence translation show that the proposed
                 method performs better than several compared methods.
                 Further, the results show that syntactic difference
                 features, with search pruning, can improve the accuracy
                 of machine translation without degrading the
                 efficiency.},
  acknowledgement = ack-nhfb,
  articleno =    {36},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Sun:2019:NSP,
  author =       {Ruiyong Sun and Yijia Zhao and Qi Zhang and Keyu Ding
                 and Shijin Wang and Cui Wei},
  title =        {A Neural Semantic Parser for Math Problems
                 Incorporating Multi-Sentence Information},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {37:1--37:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314939},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314939},
  abstract =     {In this article, we study the problem of parsing a
                 math problem into logical forms. It is an essential
                 pre-processing step for automatically solving math
                 problems. Most of the existing studies about semantic
                 parsing mainly focused on the single-sentence level.
                 However, for parsing math problems, we need to take the
                 information of multiple sentences into consideration.
                 To achieve the task, we formulate the task as a machine
                 translation problem and extend the sequence-to-sequence
                 model with a novel two-encoder architecture and a
                 word-level selective mechanism. For training and
                 evaluating the proposed method, we construct a
                 large-scale dataset. Experimental results show that the
                 proposed two-encoder architecture and word-level
                 selective mechanism could bring significant
                 improvement. The proposed method can achieve better
                 performance than the state-of-the-art methods.},
  acknowledgement = ack-nhfb,
  articleno =    {37},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Maimaiti:2019:MRT,
  author =       {Mieradilijiang Maimaiti and Yang Liu and Huanbo Luan
                 and Maosong Sun},
  title =        {Multi-Round Transfer Learning for Low-Resource {NMT}
                 Using Multiple High-Resource Languages},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {38:1--38:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314945},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314945},
  abstract =     {Neural machine translation (NMT) has made remarkable
                 progress in recent years, but the performance of NMT
                 suffers from a data sparsity problem since large-scale
                 parallel corpora are only readily available for
                 high-resource languages (HRLs). In recent days,
                 transfer learning (TL) has been used widely in
                 low-resource languages (LRLs) machine translation,
                 while TL is becoming one of the vital directions for
                 addressing the data sparsity problem in low-resource
                 NMT. As a solution, a transfer learning method in NMT
                 is generally obtained via initializing the low-resource
                 model (child) with the high-resource model (parent).
                 However, leveraging the original TL to low-resource
                 models is neither able to make full use of highly
                 related multiple HRLs nor to receive different
                 parameters from the same parents. In order to exploit
                 multiple HRLs effectively, we present a
                 language-independent and straightforward multi-round
                 transfer learning (MRTL) approach to low-resource NMT.
                 Besides, with the intention of reducing the differences
                 between high-resource and low-resource languages at the
                 character level, we introduce a unified transliteration
                 method for various language families, which are both
                 semantically and syntactically highly analogous with
                 each other. Experiments on low-resource datasets show
                 that our approaches are effective, significantly
                 outperform the state-of-the-art methods, and yield
                 improvements of up to 5.63 BLEU points.},
  acknowledgement = ack-nhfb,
  articleno =    {38},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Ihasz:2019:SFS,
  author =       {Peter Lajos Ihasz and Mate Kovacs and Ian Piumarta and
                 Victor V. Kryssanov},
  title =        {A Supplementary Feature Set for Sentiment Analysis in
                 {Japanese} Dialogues},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {39:1--39:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3310283},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3310283},
  abstract =     {Recently, real-time affect-awareness has been applied
                 in several commercial systems, such as dialogue systems
                 and computer games. Real-time recognition of affective
                 states, however, requires the application of costly
                 feature extraction methods and/or labor-intensive
                 annotation of large datasets, especially in the case of
                 Asian languages where large annotated datasets are
                 seldom available. To improve recognition accuracy, we
                 propose the use of cognitive context in the form of
                 ``emotion-sensitive'' intentions. Intentions are often
                 represented through dialogue acts and, as an
                 emotion-sensitive model of dialogue acts, a tagset of
                 interpersonal-relations-directing interpersonal acts
                 (the IA model) is proposed. The model's adequacy is
                 assessed using a sentiment classification task in
                 comparison with two well-known dialogue act models, the
                 SWBD-DAMSL and the DIT++. For the assessment, five
                 Japanese in-game dialogues were annotated with labels
                 of sentiments and the tags of all three dialogue act
                 models which were used to enhance a baseline sentiment
                 classifier system. The adequacy of the IA tagset is
                 demonstrated by a 9\% improvement to the baseline
                 sentiment classifier's recognition accuracy,
                 outperforming the other two models by more than 5\%.},
  acknowledgement = ack-nhfb,
  articleno =    {39},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Saeed:2019:SAC,
  author =       "Ali Saeed and Rao Muhammad Adeel Nawab and Mark
                 Stevenson and Paul Rayson",
  title =        "A Sense Annotated Corpus for All-Words {Urdu} Word
                 Sense Disambiguation",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "4",
  pages =        "40:1--40:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3314940",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3314940",
  abstract =     "Word Sense Disambiguation (WSD) aims to automatically
                 predict the correct sense of a word used in a given
                 context. All human languages exhibit word sense
                 ambiguity, and resolving this ambiguity can be
                 difficult. Standard benchmark resources are required to
                 develop, compare, and evaluate WSD techniques. These
                 are available for many languages, but not for Urdu,
                 despite this being a language with more than 300
                 million speakers and large volumes of text available
                 digitally. To fill this gap, this study proposes a
                 novel benchmark corpus for the Urdu All-Words WSD task.
                 The corpus contains 5,042 words of Urdu running text in
                 which all ambiguous words (856 instances) are manually
                 tagged with senses from the Urdu Lughat dictionary. A
                 range of baseline WSD models based on n-gram are
                 applied to the corpus, and the best performance
                 (accuracy of 57.71\%) is achieved using word 4-gram.
                 The corpus is freely available to the research
                 community to encourage further WSD research in Urdu.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@article{Dahou:2019:MCE,
  author =       {Abdelghani Dahou and Shengwu Xiong and Junwei Zhou and
                 Mohamed Abd Elaziz},
  title =        {Multi-Channel Embedding Convolutional Neural Network
                 Model for {Arabic} Sentiment Classification},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {41:1--41:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314941},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314941},
  abstract =     {With the advent of social network services, Arabs'
                 opinions on the web have attracted many researchers in
                 recent years toward detecting and classifying
                 sentiments in Arabic tweets and reviews. However, the
                 impact of word embeddings vectors (WEVs) initialization
                 and dataset balance on Arabic sentiment classification
                 using deep learning has not been thoroughly studied. In
                 this article, a multi-channel embedding convolutional
                 neural network (MCE-CNN) is proposed to improve Arabic
                 sentiment classification by learning sentiment features
                 from different text domains, word, and character
                 n-grams levels. MCE-CNN encodes a combination of
                 different pre-trained word embeddings into the
                 embedding block at each embedding channel and trains
                 these channels in parallel. Besides, a separate feature
                 extraction module implemented in a CNN block is used to
                 extract more relevant sentiment features. These
                 channels and blocks help to start training on
                 high-quality WEVs and fine-tuning them. The performance
                 of MCE-CNN is evaluated on several standard balanced
                 and imbalanced datasets to reflect real-world use
                 cases. Experimental results show that MCE-CNN provides
                 a high classification accuracy and benefits from the
                 second embedding channel on both standard Arabic and
                 dialectal Arabic text, which outperforms
                 state-of-the-art methods.},
  acknowledgement = ack-nhfb,
  articleno =    {41},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Onyenwe:2019:TEI,
  author =       {Ikechukwu E. Onyenwe and Mark Hepple and Uchechukwu
                 Chinedu and Ignatius Ezeani},
  title =        {Toward an Effective {Igbo} Part-of-Speech Tagger},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {42:1--42:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3314942},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3314942},
  abstract =     {Part-of-speech (POS) tagging is a well-established
                 technology for most Western European languages and a
                 few other world languages, but it has not been
                 evaluated on Igbo, an agglutinative African language.
                 This article presents POS tagging experiments conducted
                 using an Igbo corpus as a test bed for identifying the
                 POS taggers and the Machine Learning (ML) methods that
                 can achieve a good performance with the small dataset
                 available for the language. Experiments have been
                 conducted using different well-known POS taggers
                 developed for English or European languages, and
                 different training data styles and sizes. Igbo has a
                 number of language-specific characteristics that
                 present a challenge for effective POS tagging. One
                 interesting case is the wide use of verbs (and
                 nominalizations thereof) that have an inherent noun
                 complement, which form ``linked pairs'' in the POS
                 tagging scheme, but which may appear discontinuously.
                 Another issue is Igbo's highly productive agglutinative
                 morphology, which can produce many variant word forms
                 from a given root. This productivity is a key cause of
                 the out-of-vocabulary (OOV) words observed during Igbo
                 tagging. We report results of experiments on a
                 promising direction for improving tagging performance
                 on such morphologically-inflected OOV words.},
  acknowledgement = ack-nhfb,
  articleno =    {42},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@article{Costa-Jussa:2019:CCN,
  author =       {Marta R. Costa-Juss{\`a} and No{\'e} Casas and Carlos
                 Escolano and Jos{\'e} A. R. Fonollosa},
  title =        {{Chinese--Catalan}: a Neural Machine Translation
                 Approach Based on Pivoting and Attention Mechanisms},
  journal =      j-TALLIP,
  volume =       {18},
  number =       {4},
  pages =        {43:1--43:??},
  month =        aug,
  year =         {2019},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1145/3312575},
  ISSN =         {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =       {2375-4699},
  bibdate =      {Wed Oct 2 10:34:33 MDT 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =          {https://dl.acm.org/ft_gateway.cfm?id=3312575},
  abstract =     {This article innovatively addresses machine
                 translation from Chinese to Catalan using neural pivot
                 strategies trained without any direct parallel data.
                 The Catalan language is very similar to Spanish from a
                 linguistic point of view, which motivates the use of
                 Spanish as pivot language. Regarding neural
                 architecture, we are using the latest state-of-the-art,
                 which is the Transformer model, only based on attention
                 mechanisms. Additionally, this work provides new
                 resources to the community, which consists of a
                 human-developed gold standard of 4,000 sentences
                 between Catalan and Chinese and all the others United
                 Nations official languages (Arabic, English, French,
                 Russian, and Spanish). Results show that the standard
                 pseudo-corpus or synthetic pivot approach performs
                 better than cascade.},
  acknowledgement = ack-nhfb,
  articleno =    {43},
  fjournal =     {ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)},
  journal-URL =  {http://portal.acm.org/browse_dl.cfm?idx=J1521},
}

@Article{Yu:2019:MTE,
  author =       "Hui Yu and Weizhi Xu and Shouxun Lin and Qun Liu",
  title =        "Machine Translation Evaluation Metric Based on
                 Dependency Parsing Model",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "4",
  pages =        "44:1--44:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3312573",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3312573",
  abstract =     "Most of the syntax-based metrics obtain the similarity
                 by comparing the sub-structures extracted from the
                 trees of hypothesis and reference. These sub-structures
                 cannot represent all the information in the trees
                 because their lengths are limited. To sufficiently use
                 the reference syntax information, a new automatic
                 evaluation metric is proposed based on the dependency
                 parsing model. First, a dependency parsing model is
                 trained using the reference dependency tree for each
                 sentence. Then, the hypothesis is parsed by this
                 dependency parsing model and the corresponding
                 hypothesis dependency tree is generated. The quality of
                 hypothesis can be judged by the quality of the
                 hypothesis dependency tree. Unigram F-score is included
                 in the new metric so that lexicon similarity is
                 obtained. According to experimental results, the
                 proposed metric can perform better than METEOR and BLEU
                 on system level and get comparable results with METEOR
                 on sentence level. To further improve the performance,
                 we also propose a combined metric which gets the best
                 performance on the sentence level and on the system
                 level.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2019:EBC,
  author =       "Yang Liu and Shaonan Wang and Jiajun Zhang and
                 Chengqing Zong",
  title =        "Experience-based Causality Learning for Intelligent
                 Agents",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "4",
  pages =        "45:1--45:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3314943",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3314943",
  abstract =     "Understanding causality in text is crucial for
                 intelligent agents. In this article, inspired by human
                 causality learning, we propose an experience-based
                 causality learning framework. Comparing to traditional
                 approaches, which attempt to handle the causality
                 problem relying on textual clues and linguistic
                 resources, we are the first to use experience
                 information for causality learning. Specifically, we
                 first construct various scenarios for intelligent
                 agents, thus, the agents can gain experience from
                 interaction in these scenarios. Then, human
                 participants build a number of training instances for
                 agents of causality learning based on these scenarios.
                 Each instance contains two sentences and a label. Each
                 sentence describes an event that an agent experienced
                 in a scenario, and the label indicates whether the
                 sentence (event) pair has a causal relation.
                 Accordingly, we propose a model that can infer the
                 causality in text using experience by accessing the
                 corresponding event information based on the input
                 sentence pair. Experiment results show that our method
                 can achieve impressive performance on the grounded
                 causality corpus and significantly outperform the
                 conventional approaches. Our work suggests that
                 experience is very important for intelligent agents to
                 understand causality.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yin:2019:PTE,
  author =       "Yongjing Yin and Jinsong Su and Huating Wen and Jiali
                 Zeng and Yang Liu and Yidong Chen",
  title =        "{POS} Tag-enhanced Coarse-to-fine Attention for Neural
                 Machine Translation",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "4",
  pages =        "46:1--46:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321124",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321124",
  abstract =     "Although neural machine translation (NMT) has certain
                 capability to implicitly learn semantic information of
                 sentences, we explore and show that Part-of-Speech
                 (POS) tags can be explicitly incorporated into the
                 attention mechanism of NMT effectively to yield further
                 improvements. In this article, we propose an NMT model
                 with tag-enhanced attention mechanism. In our model,
                 NMT and POS tagging are jointly modeled via multi-task
                 learning. Besides following common practice to enrich
                 encoder annotations by introducing predicted source POS
                 tags, we exploit predicted target POS tags to refine
                 attention model in a coarse-to-fine manner.
                 Specifically, we first implement a coarse attention
                 operation solely on source annotations and target
                 hidden state, where the produced context vector is
                 applied to update target hidden state used for target
                 POS tagging. Then, we perform a fine attention
                 operation that extends the coarse one by further
                 exploiting the predicted target POS tags. Finally, we
                 facilitate word prediction by simultaneously utilizing
                 the context vector from fine attention and the
                 predicted target POS tags. Experimental results and
                 further analyses on Chinese--English and
                  Japanese--English translation tasks demonstrate the
                 superiority of our proposed model over the conventional
                 NMT models. We release our code at
                 https://github.com/middlekisser/PEA-NMT.git.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yang:2019:MEA,
  author =       "Jun Yang and Runqi Yang and Hengyang Lu and Chongjun
                 Wang and Junyuan Xie",
  title =        "Multi-Entity Aspect-Based Sentiment Analysis with
                 Context, Entity, Aspect Memory and Dependency
                 Information",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "4",
  pages =        "47:1--47:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321125",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321125",
  abstract =     "Fine-grained sentiment analysis is a useful tool for
                 producers to understand consumers' needs as well as
                 complaints about products and related aspects from
                 online platforms. In this article, we define a novel
                 task named ``Multi-Entity Aspect-Based Sentiment
                 Analysis (ME-ABSA)''. It investigates the sentiment
                 towards entities and their related aspects. It makes
                 the well-studied aspect-based sentiment analysis a
                 special case of this type, where the number of entities
                 is limited to one. We contribute a new dataset for this
                 task, with multi-entity Chinese posts in it. We propose
                 to model context, entity, and aspect memory to address
                 the task and incorporate dependency information for
                 further improvement. Experiments show that our methods
                 perform significantly better than baseline methods on
                 datasets for both ME-ABSA task and ABSA task. The
                 in-depth analysis further validates the effectiveness
                 of our methods and shows that our methods are capable
                 of generalizing to new (entity, aspect) combinations
                 with little loss of accuracy. This observation
                 indicates that data annotation in real applications can
                 be largely simplified.",
  acknowledgement = ack-nhfb,
  articleno =    "47",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kim:2019:MTS,
  author =       "Hyun Kim and Jong-Hyeok Lee and Seung-Hoon Na",
  title =        "Multi-task Stack Propagation for Neural Quality
                 Estimation",
  journal =      j-TALLIP,
  volume =       "18",
  number =       "4",
  pages =        "48:1--48:??",
  month =        aug,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321127",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Wed Oct 2 10:34:33 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321127",
  abstract =     "Quality estimation is an important task in machine
                 translation that has attracted increased interest in
                 recent years. A key problem in translation-quality
                 estimation is the lack of a sufficient amount of the
                 quality annotated training data. To address this
                 shortcoming, the Predictor-Estimator was proposed
                 recently by introducing ``word prediction'' as an
                 additional pre-subtask that predicts a current target
                 word with consideration of surrounding source and
                 target contexts, resulting in a two-stage neural model
                 composed of a predictor and an estimator. However, the
                 original Predictor-Estimator is not trained on a
                 continuous stacking model but instead in a cascaded
                 manner that separately trains the predictor from the
                 estimator. In addition, the Predictor-Estimator is
                 trained based on single-task learning only, which uses
                 target-specific quality-estimation data without using
                 other training data that are available from other-level
                 quality-estimation tasks. In this article, we thus
                 propose a multi-task stack propagation, which
                 extensively applies stack propagation to fully train
                 the Predictor-Estimator on a continuous stacking
                 architecture and multi-task learning to enhance the
                 training data from related other-level
                 quality-estimation tasks. Experimental results on WMT17
                 quality-estimation datasets show that the
                 Predictor-Estimator trained with multi-task stack
                 propagation provides statistically significant
                 improvements over the baseline models. In particular,
                 under an ensemble setting, the proposed multi-task
                 stack propagation leads to state-of-the-art performance
                 at all the sentence/word/phrase levels for WMT17
                 quality estimation tasks.",
  acknowledgement = ack-nhfb,
  articleno =    "48",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2020:GCL,
  author =       "Hongmin Wang and Jie Yang and Yue Zhang",
  title =        "From {Genesis} to {Creole} Language: Transfer Learning
                 for {Singlish} Universal Dependencies Parsing and {POS}
                 Tagging",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--29",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321128",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3321128",
  abstract =     "Singlish can be interesting to the computational
                 linguistics community both linguistically, as a major
                 low-resource creole based on English, and
                 computationally, for information extraction and
                 sentiment analysis of regional social media. In our
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kong:2020:CZP,
  author =       "Fang Kong and Min Zhang and Guodong Zhou",
  title =        "{Chinese} Zero Pronoun Resolution: a Chain-to-chain
                 Approach",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--21",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321129",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3321129",
  abstract =     "Chinese zero pronoun (ZP) resolution plays a critical
                 role in discourse analysis. Different from traditional
                 mention-to-mention approaches, this article proposes a
                 chain-to-chain approach to improve the performance of
                 ZP resolution in three aspects. \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Yin:2020:CZP,
  author =       "Qingyu Yin and Weinan Zhang and Yu Zhang and Ting
                 Liu",
  title =        "{Chinese} Zero Pronoun Resolution: a Collaborative
                 Filtering-based Approach",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--20",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325884",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3325884",
  abstract =     "Semantic information that has been proven to be
                 necessary to the resolution of common noun phrases is
                 typically ignored by most existing Chinese zero pronoun
                 resolvers. This is because that zero pronouns convey no
                 descriptive information, which makes it \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Das:2020:TCT,
  author =       "Ayan Das and Sudeshna Sarkar",
  title =        "Transform, Combine, and Transfer: Delexicalized
                 Transfer Parser for Low-resource Languages",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--30",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325886",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3325886",
  abstract =     "Transfer parsing has been used for developing
                 dependency parsers for languages with no treebank by
                 using transfer from treebanks of other languages
                 (source languages). In delexicalized transfer, parsed
                 words are replaced by their part-of-speech tags.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ding:2020:TBM,
  author =       "Chenchen Ding and Hnin Thu Zar Aye and Win Pa Pa and
                 Khin Thandar Nwet and Khin Mar Soe and Masao Utiyama
                 and Eiichiro Sumita",
  title =        "Towards {Burmese} ({Myanmar}) Morphological Analysis:
                 Syllable-based Tokenization and Part-of-speech
                 Tagging",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--34",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325885",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3325885",
  abstract =     "This article presents a comprehensive study on two
                 primary tasks in Burmese (Myanmar) morphological
                 analysis: tokenization and part-of-speech (POS)
                 tagging. Twenty thousand Burmese sentences of newswire
                 are annotated with two-layer tokenization and
                 POS-\ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2020:AMC,
  author =       "Dayiheng Liu and Kexin Yang and Qian Qu and Jiancheng
                 Lv",
  title =        "Ancient--Modern {Chinese} Translation with a New Large
                 Training Dataset",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--13",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325887",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3325887",
  abstract =     "Ancient Chinese brings the wisdom and spirit culture
                 of the Chinese nation. Automatic translation from
                 ancient Chinese to modern Chinese helps to inherit and
                 carry forward the quintessence of the ancients.
                 However, the lack of large-scale parallel \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Wang:2020:CSP,
  author =       "Wei Wang and Degen Huang and Jingxiang Cao",
  title =        "{Chinese} Syntax Parsing Based on Sliding Match of
                 Semantic String",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--14",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3329707",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3329707",
  abstract =     "Different from the current syntax parsing based on
                 deep learning, we present a novel Chinese parsing
                 method, which is based on Sliding Match of Semantic
                 String (SMOSS). (1) Training stage: In a treebank,
                 headwords of tree nodes are represented by \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Kanwal:2020:UNE,
  author =       "Safia Kanwal and Kamran Malik and Khurram Shahzad and
                 Faisal Aslam and Zubair Nawaz",
  title =        "{Urdu} Named Entity Recognition: Corpus Generation and
                 Deep Learning Applications",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--13",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3329710",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3329710",
  abstract =     "Named Entity Recognition (NER) plays a pivotal role in
                 various natural language processing tasks, such as
                 machine translation and automatic question-answering
                 systems. Recognizing the importance of NER, a plethora
                 of NER techniques for Western and Asian \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2020:DCW,
  author =       "Yijia Liu and Wanxiang Che and Yuxuan Wang and Bo
                 Zheng and Bing Qin and Ting Liu",
  title =        "Deep Contextualized Word Embeddings for Universal
                 Dependency Parsing",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--17",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3326497",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3326497",
  abstract =     "Deep contextualized word embeddings (Embeddings from
                 Language Model, short for ELMo), as an emerging and
                 effective replacement for the static word embeddings,
                 have achieved success on a bunch of syntactic and
                 semantic NLP problems. However, little is \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Mehmood:2020:SAR,
  author =       "Khawar Mehmood and Daryl Essam and Kamran Shafi and
                 Muhammad Kamran Malik",
  title =        "Sentiment Analysis for a Resource Poor Language
                 ---{Roman Urdu}",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--15",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3329709",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3329709",
  abstract =     "Sentiment analysis is an important sub-task of Natural
                 Language Processing that aims to determine the polarity
                 of a review. Most of the work done on sentiment
                 analysis is for the resource-rich languages of the
                 world, but very limited work has been done \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Bakhshaei:2020:MGM,
  author =       "Somayeh Bakhshaei and Reza Safabakhsh and Shahram
                 Khadivi",
  title =        "Matching Graph, a Method for Extracting Parallel
                 Information from Comparable Corpora",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--29",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3329713",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3329713",
  abstract =     "Comparable corpora are valuable alternatives for the
                 expensive parallel corpora. They comprise informative
                 parallel fragments that are useful resources for
                 different natural language processing tasks. In this
                 work, a generative model is proposed for \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Liu:2020:FTV,
  author =       "Dayiheng Liu and Yang Xue and Feng He and Yuanyuan
                 Chen and Jiancheng Lv",
  title =        "$ \mu $-Forcing: Training Variational Recurrent
                 Autoencoders for Text Generation",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--17",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3341110",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3341110",
  abstract =     "It has been previously observed that training
                 Variational Recurrent Autoencoders (VRAE) for text
                 generation suffers from serious uninformative latent
                 variables problems. The model would collapse into a
                 plain language model that totally ignores the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Srivastava:2020:AMA,
  author =       "Jyoti Srivastava and Sudip Sanyal and Ashish Kumar
                 Srivastava",
  title =        "An Automatic and a Machine-assisted Method to Clean
                 Bilingual Corpus",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--19",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342351",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3342351",
  abstract =     "Two different methods of corpus cleaning are presented
                 in this article. One is a machine-assisted technique,
                 which is good to clean small-sized parallel corpus, and
                 the other is an automatic method, which is suitable for
                 cleaning large-sized parallel \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Prakash:2020:ISP,
  author =       "Jeena J. Prakash and Golda Brunet Rajan and Hema A.
                 Murthy",
  title =        "Importance of Signal Processing Cues in Transcription
                 Correction for Low-Resource {Indian} Languages",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342352",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3342352",
  abstract =     "Accurate phonetic transcriptions are crucial for
                 building robust acoustic models for speech recognition
                 as well as speech synthesis applications. Phonetic
                 transcriptions are not usually provided with speech
                 corpora. A lexicon is used to generate phone-\ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Han:2020:EMW,
  author =       "Dong Han and Junhui Li and Yachao Li and Min Zhang and
                 Guodong Zhou",
  title =        "Explicitly Modeling Word Translations in Neural
                 Machine Translation",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--17",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342353",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3342353",
  abstract =     "In this article, we show that word translations can be
                 explicitly incorporated into NMT effectively to avoid
                 wrong translations. Specifically, we propose three
                 cross-lingual encoders to explicitly incorporate word
                 translations into NMT: (1) Factored\ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Chakrabarty:2020:NNM,
  author =       "Abhisek Chakrabarty and Akshay Chaturvedi and Utpal
                 Garain",
  title =        "{NeuMorph}: Neural Morphological Tagging for
                 Low-Resource Languages---An Experimental Study for
                 {Indic} Languages",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--19",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342354",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3342354",
  abstract =     "This article deals with morphological tagging for
                 low-resource languages. For this purpose, five Indic
                 languages are taken as reference. In addition, two
                 severely resource-poor languages, Coptic and Kurmanji,
                 are also considered. The task entails \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Ji:2020:ATU,
  author =       "Yatu Ji and Hongxu Hou and Junjie Chen and Nier Wu",
  title =        "Adversarial Training for Unknown Word Problems in
                 Neural Machine Translation",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "1",
  pages =        "1--12",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342482",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Fri Jan 10 08:11:41 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3342482",
  abstract =     "Nearly all of the work in neural machine translation
                 (NMT) is limited to a quite restricted vocabulary,
                 crudely treating all other words the same as an unk
                 symbol. For the translation of language with abundant
                 morphology, unknown (UNK) words also \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1521",
}

@Article{Zhu:2020:OSK,
  author =       "Qingfu Zhu and Weinan Zhang and Lei Cui and Ting Liu",
  title =        "Order-Sensitive Keywords Based Response Generation in
                 Open-Domain Conversational Systems",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "18:1--18:18",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3343258",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3343258",
  abstract =     "External keywords are crucial for response generation
                 models to address the generic response problems in
                 open-domain conversational systems. The occurrence of
                 keywords in a response depends heavily on the order of
                 the keywords as they are generated \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Zhou:2020:NCG,
  author =       "Guangyou Zhou and Yizhen Fang and Yehong Peng and
                 Jiaheng Lu",
  title =        "Neural Conversation Generation with Auxiliary
                 Emotional Supervised Models",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "19:1--19:17",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3344788",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3344788",
  abstract =     "An important aspect of developing dialogue agents
                 involves endowing a conversation system with emotion
                 perception and interaction. Most existing emotion
                 dialogue models lack the adaptability and extensibility
                 of different scenes because of their \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Zhu:2020:EDC,
  author =       "Wenhao Zhu and Xin Jin and Shuang Liu and Zhiguo Lu
                 and Wu Zhang and Ke Yan and Baogang Wei",
  title =        "Enhanced Double-Carrier Word Embedding via Phonetics
                 and Writing",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "20:1--20:18",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3344920",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3344920",
  abstract =     "Word embeddings, which map words into a unified vector
                 space, capture rich semantic information. From a
                 linguistic point of view, words have two carriers,
                 speech and writing. Yet the most recent word embedding
                 models focus on only the writing carrier \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Dehkharghani:2020:SPP,
  author =       "Rahim Dehkharghani",
  title =        "{SentiFars}: a {Persian} Polarity Lexicon for
                 Sentiment Analysis",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "21:1--21:12",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3345627",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3345627",
  abstract =     "There is no doubt about the usefulness of public
                 opinion toward different issues in social media and the
                 World Wide Web. Extracting the feelings of people about
                 an issue from text is not straightforward. Polarity
                 lexicons that assign polarity tags or \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Abdulhameed:2020:WVT,
  author =       "Tiba Zaki Abdulhameed and Imed Zitouni and Ikhlas
                 Abdel-Qader",
  title =        "{Wasf-Vec}: Topology-based Word Embedding for Modern
                 Standard {Arabic} and {Iraqi} Dialect Ontology",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "22:1--22:27",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3345517",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3345517",
  abstract =     "Word clustering is a serious challenge in low-resource
                 languages. Since words that share semantics are
                 expected to be clustered together, it is common to use
                 a feature vector representation generated from a
                 distributional theory-based word embedding \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Xu:2020:EPS,
  author =       "Ge Xu and Xiaoyan Yang and Yuanzheng Cai and Zhiqiang
                 Ruan and Tao Wang and Xiangwen Liao",
  title =        "Extracting Polarity Shifting Patterns from Any Corpus
                 Based on Natural Annotation",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "23:1--23:16",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3345518",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3345518",
  abstract =     "In recent years, online sentiment texts are generated
                 by users in various domains and in different languages.
                 Binary polarity classification (positive or negative)
                 on business sentiment texts can help both companies and
                 customers to evaluate products or \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Imankulova:2020:FPP,
  author =       "Aizhan Imankulova and Takayuki Sato and Mamoru
                 Komachi",
  title =        "Filtered Pseudo-parallel Corpus Improves Low-resource
                 Neural Machine Translation",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "24:1--24:16",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3341726",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3341726",
  abstract =     "Large-scale parallel corpora are essential for
                 training high-quality machine translation systems;
                 however, such corpora are not freely available for many
                 language translation pairs. Previously, training data
                 has been augmented by pseudo-parallel corpora
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Gupta:2020:DNN,
  author =       "Deepak Gupta and Asif Ekbal and Pushpak
                 Bhattacharyya",
  title =        "A Deep Neural Network Framework for {English} {Hindi}
                 Question Answering",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "25:1--25:22",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3359988",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3359988",
  abstract =     "In this article, we propose a unified deep neural
                 network framework for multilingual question answering
                 (QA). The proposed network deals with the multilingual
                 questions and answers snippets. The input to the
                 network is a pair of factoid question and \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Yu:2020:LWT,
  author =       "Hongfei Yu and Xiaoqing Zhou and Xiangyu Duan and Min
                 Zhang",
  title =        "Layer-Wise De-Training and Re-Training for {ConvS2S}
                 Machine Translation",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "26:1--26:15",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3358414",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3358414",
  abstract =     "The convolutional sequence-to-sequence (ConvS2S)
                 machine translation system is one of the typical neural
                 machine translation (NMT) systems. Training the ConvS2S
                 model tends to get stuck in a local optimum in our
                 pre-studies. To overcome this inferior \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Somsap:2020:IDW,
  author =       "Sittichai Somsap and Pusadee Seresangtakul",
  title =        "{Isarn Dharma} Word Segmentation Using a Statistical
                 Approach with Named Entity Recognition",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "27:1--27:16",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3359990",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3359990",
  abstract =     "In this study, we developed an Isarn Dharma word
                 segmentation system. We mainly focused on solving the
                 word ambiguity and unknown word problems in unsegmented
                 Isarn Dharma text. Ambiguous Isarn Dharma words occur
                 frequently in word construction due to \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Abbas:2020:PIR,
  author =       "Muhammad Raihan Abbas and Khadim Hussain Asif",
  title =        "{Punjabi} to {ISO 15919} and {Roman} Transliteration
                 with Phonetic Rectification",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "28:1--28:20",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3359991",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3359991",
  abstract =     "Transliteration removes the script barriers.
                 Unfortunately, Punjabi is written in four different
                 scripts, i.e., Gurmukhi, Shahmukhi, Devnagri, and
                 Latin. The Latin script is understandable for nearly
                 all factions of the Punjabi community. The objective
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Beseiso:2020:SAM,
  author =       "Majdi Beseiso and Haytham Elmousalami",
  title =        "Subword Attentive Model for {Arabic} Sentiment
                 Analysis: a Deep Learning Approach",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "29:1--29:17",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3360016",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3360016",
  abstract =     "Social media data is unstructured data where these big
                 data are exponentially increasing day to day in many
                 different disciplines. Analysis and understanding the
                 semantics of these data are a big challenge due to its
                 variety and huge volume. To address \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "29",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Harikrishna:2020:CSC,
  author =       "D. M. Harikrishna and K. Sreenivasa Rao",
  title =        "{Children}'s Story Classification in {Indian}
                 Languages Using Linguistic and Keyword-based Features",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "30:1--30:22",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342356",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3342356",
  abstract =     "The primary objective of this work is to classify
                 Hindi and Telugu stories into three genres: fable,
                 folk-tale, and legend. In this work, we are proposing a
                 framework for story classification (SC) using keyword
                 and part-of-speech (POS) features. For \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "30",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Jung:2020:WRT,
  author =       "Hun-Young Jung and Jong-Hyeok Lee and Eunju Min and
                 Seung-Hoon Na",
  title =        "Word Reordering for Translation into {Korean} Sign
                 Language Using Syntactically-guided Classification",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "31:1--31:20",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3357612",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3357612",
  abstract =     "Machine translation aims to break the language barrier
                 that prevents communication with others and increase
                 access to information. Deaf people face huge language
                 barriers in their daily lives, including access to
                 digital and spoken information. There \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "31",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Masmoudi:2020:TAA,
  author =       "Abir Masmoudi and Mariem Ellouze Khmekhem and Mourad
                 Khrouf and Lamia Hadrich Belguith",
  title =        "Transliteration of {Arabizi} into {Arabic} Script for
                 {Tunisian} Dialect",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "32:1--32:21",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364319",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3364319",
  abstract =     "The evolution of information and communication
                 technology has markedly influenced communication
                 between correspondents. This evolution has facilitated
                 the transmission of information and has engendered new
                 forms of written communication (email, chat, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "32",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Mukherjee:2020:FST,
  author =       "Subham Mukherjee and Pradeep Kumar and Partha Pratim
                 Roy",
  title =        "Fusion of Spatio-temporal Information for {Indic} Word
                 Recognition Combining Online and Offline Text Data",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "2",
  pages =        "33:1--33:24",
  month =        mar,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364533",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:05:40 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3364533",
  abstract =     "We present a novel Indic handwritten word recognition
                 scheme by fusion of spatio-temporal information
                 extracted from handwritten images. The main challenge
                 in Indic word recognition lies in its complexity
                 because of modifiers, touching characters, and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "33",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Yu:2020:ELR,
  author =       "Zhiqiang Yu and Zhengtao Yu and Junjun Guo and Yuxin
                 Huang and Yonghua Wen",
  title =        "Efficient Low-Resource Neural Machine Translation with
                 Reread and Feedback Mechanism",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "3",
  pages =        "34:1--34:13",
  month =        feb,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365244",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:11:26 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3365244",
  abstract =     "How to utilize information sufficiently is a key
                 problem in neural machine translation (NMT), which is
                 effectively improved in rich-resource NMT by leveraging
                 large-scale bilingual sentence pairs. However, for
                 low-resource NMT, lack of bilingual \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "34",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Park:2020:NSB,
  author =       "Cheoneum Park and Heejun Song and Changki Lee",
  title =        "{$ S^3$-NET}: {SRU}-Based Sentence and Self-Matching
                 Networks for Machine Reading Comprehension",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "3",
  pages =        "35:1--35:14",
  month =        feb,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365679",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:11:26 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3365679",
  abstract =     "Machine reading comprehension question answering
                 (MRC-QA) is the task of understanding the context of a
                 given passage to find a correct answer within it. A
                 passage is composed of several sentences; therefore,
                 the length of the input sentence becomes \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "35",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Sarwar:2020:SSF,
  author =       "Raheem Sarwar and Thanasarn Porthaveepong and Attapol
                 Rutherford and Thanawin Rakthanmanon and Sarana
                 Nutanong",
  title =        "{StyloThai}: a Scalable Framework for Stylometric
                 Authorship Identification of {Thai} Documents",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "3",
  pages =        "36:1--36:15",
  month =        feb,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365832",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:11:26 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3365832",
  abstract =     "Authorship identification helps to identify the true
                 author of a given anonymous document from a set of
                 candidate authors. The applications of this task can be
                 found in several domains, such as law enforcement
                 agencies and information retrieval. These \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "36",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Kim:2020:UIB,
  author =       "Hyun Kim and Seung-Hoon Na",
  title =        "Uniformly Interpolated Balancing for Robust Prediction
                 in Translation Quality Estimation: a Case Study of
                 {English--Korean} Translation",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "3",
  pages =        "37:1--37:27",
  month =        feb,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365916",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:11:26 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3365916",
  abstract =     "There has been growing interest among researchers in
                 quality estimation (QE), which attempts to
                 automatically predict the quality of machine
                 translation (MT) outputs. Most existing works on QE are
                 based on supervised approaches using quality-annotated
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "37",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

@Article{Zhou:2020:LMU,
  author =       "Xiao Zhou and Zhen-Hua Ling and Li-Rong Dai",
  title =        "Learning and Modeling Unit Embeddings Using Deep
                 Neural Networks for Unit-Selection-Based {Mandarin}
                 Speech Synthesis",
  journal =      j-TALLIP,
  volume =       "19",
  number =       "3",
  pages =        "38:1--38:14",
  month =        feb,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3372244",
  ISSN =         "2375-4699 (print), 2375-4702 (electronic)",
  ISSN-L =       "2375-4699",
  bibdate =      "Tue Mar 3 09:11:26 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/tallip.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3372244",
  abstract =     "A method of learning and modeling unit embeddings
                 using deep neutral networks (DNNs) is presented in this
                 article for unit-selection-based Mandarin speech
                 synthesis. Here, a unit embedding is defined as a
                 fixed-length embedding vector for a phone-sized
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "38",
  fjournal =     "ACM Transactions on Asian and Low-Resource Language
                 Information Processing (TALLIP)",
  journal-URL =  "https://dl.acm.org/loi/tallip",
}

%% TALLIP v19n3, article 39: Persian semantic role labeling.
@article{Mirzaei:2020:SRL,
  author =    {Azadeh Mirzaei and Fatemeh Sedghi and Pegah Safari},
  title =     {Semantic Role Labeling System for {Persian} Language},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {39:1--39:12},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3372246},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3372246},
  abstract =  {In this article, we present an automatic semantic role
               labeling system in Persian consisting of two modules:
               argument identification for specifying argument spans
               and argument classification for categorizing their
               semantic roles. Our modules have been \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {39},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%% TALLIP v19n3, article 40: Burmese treebank release and analysis.
@article{Ding:2020:BMT,
  author =    {Chenchen Ding and Sann Su Su Yee and Win Pa Pa and
               Khin Mar Soe and Masao Utiyama and Eiichiro Sumita},
  title =     {A {Burmese} ({Myanmar}) {Treebank}: Guideline and
               Analysis},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {40:1--40:13},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3373268},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3373268},
  abstract =  {A 20,000-sentence Burmese (Myanmar) treebank on news
               articles has been released under a CC BY-NC-SA license.
               Complete phrase structure annotation was developed for
               each sentence from the morphologically annotated data
               prepared in previous work of Ding \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {40},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%% TALLIP v19n3, article 41: Korean POS tagging via morpheme generation.
@article{Song:2020:KPS,
  author =    {Hyun-Je Song and Seong-Bae Park},
  title =     {{Korean} Part-of-speech Tagging Based on Morpheme
               Generation},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {41:1--41:10},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3373608},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3373608},
  abstract =  {Two major problems of Korean part-of-speech (POS)
               tagging are that the word-spacing unit is not mapped
               one-to-one to a POS tag and that morphemes should be
               recovered during POS tagging. Therefore, this article
               proposes a novel two-step Korean POS tagger \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {41},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%% TALLIP v19n3, article 43: loanword identification with minimal
%% supervision (article 42 not yet published; see TO DO at file end).
@article{Mi:2020:LIL,
  author =    {Chenggang Mi and Lei Xie and Yanning Zhang},
  title =     {Loanword Identification in Low-Resource Languages with
               Minimal Supervision},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {43:1--43:22},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3374212},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3374212},
  abstract =  {Bilingual resources play a very important role in many
               natural language processing tasks, especially the tasks
               in cross-lingual scenarios. However, it is expensive
               and time consuming to build such resources. Lexical
               borrowing happens in almost every \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {43},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%% TALLIP v19n3, article 44: NMT with short-path unit interpolation.
@article{Li:2020:INM,
  author =    {Yachao Li and Junhui Li and Min Zhang and Yixin Li and
               Peng Zou},
  title =     {Improving Neural Machine Translation with Linear
               Interpolation of a Short-Path Unit},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {44:1--44:16},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3377851},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3377851},
  abstract =  {In neural machine translation (NMT), the source and
               target words are at the two ends of a large deep neural
               network, normally mediated by a series of non-linear
               activations. The problem with such consequent
               non-linear activations is that they \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {44},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%% TALLIP v19n3, article 45: dynamic knowledge-base updating for KB-QA.
@article{Liu:2020:DUK,
  author =    {Xiao-Yang Liu and Yimeng Zhang and Yukang Liao and
               Ling Jiang},
  title =     {Dynamic Updating of the Knowledge Base for a
               Large-Scale Question Answering System},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {45:1--45:13},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3377708},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3377708},
  abstract =  {Today, the knowledge base question answering (KB-QA)
               system is promising to achieve a large-scale
               high-quality reply in the e-commerce industry. However,
               there exist two major challenges to efficiently support
               large-scale KB-QA systems. On the one hand, \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {45},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%% TALLIP v19n3, article 46: LM-based extractive broadcast news
%% summarization with proximity and sentence-relatedness cues.
@article{Liu:2020:ELM,
  author =    {Shih-Hung Liu and Kuan-Yu Chen and Berlin Chen},
  title =     {Enhanced Language Modeling with Proximity and Sentence
               Relatedness Information for Extractive Broadcast News
               Summarization},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {46:1--46:19},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3377407},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3377407},
  abstract =  {The primary task of extractive summarization is to
               automatically select a set of representative sentences
               from a text or spoken document that can concisely
               express the most important theme of the original
               document. Recently, language modeling (LM) has
               \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {46},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%% TALLIP v19n3, article 47: NLI with word-pair dependency triplets.
@article{Du:2020:CNL,
  author =    {Qianlong Du and Chengqing Zong and Keh-Yih Su},
  title =     {Conducting Natural Language Inference with
               Word-Pair-Dependency and Local Context},
  journal =   j-TALLIP,
  volume =    {19},
  number =    {3},
  pages =     {47:1--47:23},
  month =     feb,
  year =      {2020},
  CODEN =     {????},
  DOI =       {https://doi.org/10.1145/3377704},
  ISSN =      {2375-4699 (print), 2375-4702 (electronic)},
  ISSN-L =    {2375-4699},
  bibdate =   {Tue Mar 3 09:11:26 MST 2020},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/tallip.bib},
  URL =       {https://dl.acm.org/doi/abs/10.1145/3377704},
  abstract =  {This article proposes to conduct natural language
               inference with novel Enhanced-Relation-Head-Dependent
               triplets (RHD triplets), which are constructed via
               enhancing each word in the RHD triplet with its
               associated local context. Most previous approaches
               \ldots{}},
  acknowledgement = ack-nhfb,
  articleno = {47},
  fjournal =  {ACM Transactions on Asian and Low-Resource Language
               Information Processing (TALLIP)},
  journal-URL = {https://dl.acm.org/loi/tallip},
}

%%% TO DO: [03-Mar-2020] v19n3 is still incomplete