%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.18",
%%%     date            = "02 July 2009",
%%%     time            = "14:21:33 MDT",
%%%     filename        = "talip.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "22155 3377 15180 138118",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "Asian language information processing,
%%%                        bibliography, BibTeX, TALIP",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Asian language
%%%                        information processing (TALIP) (CODEN none,
%%%                        ISSN 1530-0226), which began publishing in
%%%                        March 2002.
%%%
%%%                        The journal has a World Wide Web site at
%%%
%%%                            http://www.acm.org/pubs/talip/
%%%                            http://portal.acm.org/browse_dl.cfm?&idx=J820
%%%
%%%                        At version 1.18, the year coverage looked
%%%                        like this:
%%%
%%%                             2002 ( 15)    2005 ( 17)    2008 ( 13)
%%%                             2003 ( 22)    2006 ( 28)    2009 ( 9)
%%%                             2004 ( 17)    2007 ( 14)
%%%
%%%                             Article: 135
%%%
%%%                             Total entries: 135
%%%
%%%                        This bibliography has been constructed
%%%                        primarily from the publisher Web site.
%%%
%%%                        Numerous errors in the sources noted above
%%%                        have been corrected. Spelling has been
%%%                        verified with the UNIX spell and GNU ispell
%%%                        programs using the exception dictionary
%%%                        stored in the companion file with extension
%%%                        .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen as
%%%                        name:year:abbrev, where name is the family
%%%                        name of the first author or editor, year is a
%%%                        4-digit number, and abbrev is a 3-letter
%%%                        condensation of important title words.
%%%                        Citation labels were automatically generated
%%%                        by software developed for the BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, with the help of
%%%                        ``bibsort -byvolume''. The bibsort utility
%%%                        is available from ftp.math.utah.edu in
%%%                        /pub/tex/bib.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters. This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================

@Preamble{ "\hyphenation{ }" }

%%% ====================================================================
%%% Acknowledgement abbreviations:

@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                    \path|beebe@acm.org|,
                    \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:

@String{j-TALIP = "ACM Transactions on Asian Language Information Processing"}

%%% ====================================================================
%%% Bibliography entries:

@Article{Wong:2002:P,
  author          = "Kam-Fai Wong and Jun'ichi Tsujii",
  title           = "Prologue",
  journal         = j-TALIP,
  volume          = "1",
  number          = "1",
  pages           = "1--2",
  month           = mar,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:34 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Gao:2002:TUA,
  author          = "Jianfeng Gao and Joshua Goodman and Mingjing Li and
                     Kai-Fu Lee",
  title           = "Toward a unified approach to statistical language
                     modeling for {Chinese}",
  journal         = j-TALIP,
  volume          = "1",
  number          = "1",
  pages           = "3--33",
  month           = mar,
  year            = "2002",
  CODEN           = "????",
  DOI             = "http://doi.acm.org/10.1145/509900.509903",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:34 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lai:2002:MTE,
  author          = "Yu-Sheng Lai and Chung-Hsien Wu",
  title           = "Meaningful term extraction and discriminative term
                     selection in text categorization via unknown-word
                     methodology",
  journal         = j-TALIP,
  volume          = "1",
  number          = "1",
  pages           = "34--64",
  month           = mar,
  year            = "2002",
  CODEN           = "????",
  DOI             = "http://doi.acm.org/10.1145/509900.509904",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:34 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kim:2002:MBG,
  author          = "Byeongchang Kim and Gary Geunbae Lee and Jong-Hyeok
                     Lee",
  title           = "Morpheme-based grapheme to phoneme conversion using
                     phonetic patterns and morphophonemic connectivity
                     information",
  journal         = j-TALIP,
  volume          = "1",
  number          = "1",
  pages           = "65--82",
  month           = mar,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:34 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lee:2002:UTI,
  author          = "Tan Lee and Wai Lau and Y. W. Wong and P. C. Ching",
  title           = "Using tone information in {Cantonese} continuous
                     speech recognition",
  journal         = j-TALIP,
  volume          = "1",
  number          = "1",
  pages           = "83--102",
  month           = mar,
  year            = "2002",
  CODEN           = "????",
  DOI             = "http://doi.acm.org/10.1145/509900.509906",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:34 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Chen:2002:BCE,
  author          = "Hsin-Hsi Chen and Chi-Ching Lin and Wen-Cheng Lin",
  title           = "Building a {Chinese-English} wordnet for translingual
                     applications",
  journal         = j-TALIP,
  volume          = "1",
  number          = "2",
  pages           = "103--122",
  month           = jun,
  year            = "2002",
  CODEN           = "????",
  DOI             = "http://doi.acm.org/10.1145/568954.568955",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:36 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Meng:2002:GPM,
  author          = "Helen Meng and Po-Chui Luk and Kui Xu and Fuliang
                     Weng",
  title           = "{GLR} parsing with multiple grammars for natural
                     language queries",
  journal         = j-TALIP,
  volume          = "1",
  number          = "2",
  pages           = "123--144",
  month           = jun,
  year            = "2002",
  CODEN           = "????",
  DOI             = "http://doi.acm.org/10.1145/568954.568956",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:36 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Murata:2002:CTM,
  author          = "Masaki Murata and Qing Ma and Hitoshi Isahara",
  title           = "Comparison of three machine-learning methods for
                     {Thai} part-of-speech tagging",
  journal         = j-TALIP,
  volume          = "1",
  number          = "2",
  pages           = "145--158",
  month           = jun,
  year            = "2002",
  CODEN           = "????",
  DOI             = "http://doi.acm.org/10.1145/568954.568957",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:36 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lu:2002:TWQ,
  author          = "Wen-Hsiang Lu and Lee-Feng Chien and Hsi-Jian Lee",
  title           = "Translation of web queries using anchor text mining",
  journal         = j-TALIP,
  volume          = "1",
  number          = "2",
  pages           = "159--172",
  month           = jun,
  year            = "2002",
  CODEN           = "????",
  DOI             = "http://doi.acm.org/10.1145/568954.568958",
  ISSN            = "1530-0226",
  bibdate         = "Tue Nov 5 23:44:36 MST 2002",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Li:2002:WBA,
  author          = "Wenjie Li and Kam-Fai Wong",
  title           = "A word-based approach for modeling and discovering
                     temporal relations embedded in {Chinese} sentences",
  journal         = j-TALIP,
  volume          = "1",
  number          = "3",
  pages           = "173--206",
  month           = sep,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Aug 7 08:49:00 MDT 2003",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lee:2002:ACB,
  author          = "Jin-Seok Lee and Byeongchang Kim and Gary Geunbae Lee",
  title           = "Automatic corpus-based tone and break-index
                     prediction using {K-ToBI} representation",
  journal         = j-TALIP,
  volume          = "1",
  number          = "3",
  pages           = "207--224",
  month           = sep,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Aug 7 08:49:00 MDT 2003",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Luk:2002:CCD,
  author          = "Robert W. P. Luk and K. L. Kwok",
  title           = "A comparison of {Chinese} document indexing
                     strategies and retrieval models",
  journal         = j-TALIP,
  volume          = "1",
  number          = "3",
  pages           = "225--268",
  month           = sep,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Aug 7 08:49:00 MDT 2003",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Suzuki:2002:LCS,
  author          = "Izumi Suzuki and Yoshiki Mikami and Ario Ohsato and
                     Yoshihide Chubachi",
  title           = "A language and character set determination method
                     based on {N}-gram statistics",
  journal         = j-TALIP,
  volume          = "1",
  number          = "3",
  pages           = "269--278",
  month           = sep,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Aug 7 08:49:00 MDT 2003",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Jin:2002:CDC,
  author          = "Honglan Jin and Kam-Fai Wong",
  title           = "A {Chinese} dictionary construction algorithm for
                     information retrieval",
  journal         = j-TALIP,
  volume          = "1",
  number          = "4",
  pages           = "281--296",
  month           = dec,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Aug 7 08:49:01 MDT 2003",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Li:2002:CCB,
  author          = "Yuanxiang Li and Xiaoqing Ding and Chew Lim Tan",
  title           = "Combining character-based bigrams with word-based
                     bigrams in contextual postprocessing for {Chinese}
                     script recognition",
  journal         = j-TALIP,
  volume          = "1",
  number          = "4",
  pages           = "297--309",
  month           = dec,
  year            = "2002",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Aug 7 08:49:01 MDT 2003",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lo:2003:CLS,
  author          = "Wai-Kit Lo and Helen Meng and P. C. Ching",
  title           = "Cross-language spoken document retrieval using
                     {HMM}-based retrieval model with multi-scale fusion",
  journal         = j-TALIP,
  volume          = "2",
  number          = "1",
  pages           = "1--26",
  month           = mar,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sun Jan 11 10:17:38 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Shi:2003:OHC,
  author          = "Daming Shi and Robert I. Damper and Steve R. Gunn",
  title           = "Offline handwritten {Chinese} character recognition
                     by radical decomposition",
  journal         = j-TALIP,
  volume          = "2",
  number          = "1",
  pages           = "27--48",
  month           = mar,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sun Jan 11 10:17:38 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lee:2003:TAS,
  author          = "Yue-Shi Lee",
  title           = "Task adaptation in stochastic language model for
                     {Chinese} homophone disambiguation",
  journal         = j-TALIP,
  volume          = "2",
  number          = "1",
  pages           = "49--62",
  month           = mar,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sun Jan 11 10:17:38 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Shieh:2003:EAT,
  author          = "Jiann-Cherng Shieh",
  title           = "An efficient accessing technique for {Taiwanese}
                     phonetic transcriptions",
  journal         = j-TALIP,
  volume          = "2",
  number          = "1",
  pages           = "63--77",
  month           = mar,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sun Jan 11 10:17:38 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Oard:2003:SLE,
  author          = "Douglas W. Oard",
  title           = "The surprise language exercises",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "79--84",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Allan:2003:MTD,
  author          = "James Allan and Victor Lavrenko and Margaret E.
                     Connell",
  title           = "A month to topic detection and tracking in {Hindi}",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "85--100",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Strassel:2003:LRC,
  author          = "Stephanie Strassel and Mike Maxwell and Christopher
                     Cieri",
  title           = "Linguistic resource creation for research and
                     technology development: {A} recent experiment",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "101--117",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Dorr:2003:RPD,
  author          = "Bonnie J. Dorr and Necip Fazil Ayan and Nizar Habash
                     and Nitin Madnani and Rebecca Hwa",
  title           = "Rapid porting of {DUSTer} to {Hindi}",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "118--123",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Huang:2003:ENE,
  author          = "Fei Huang and Stephan Vogel and Alex Waibel",
  title           = "Extracting named entity translingual equivalence with
                     limited resources",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "124--129",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Larkey:2003:HCT,
  author          = "Leah S. Larkey and Margaret E. Connell and Nasreen
                     Abduljaleel",
  title           = "{Hindi CLIR} in thirty days",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "130--142",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lavie:2003:EHE,
  author          = "Alon Lavie and Stephan Vogel and Lori Levin and Erik
                     Peterson and Katharina Probst and Ariadna Font
                     Llitj{\'o}s and Rachel Reynolds and Jaime Carbonell
                     and Richard Cohen",
  title           = "Experiments with a {Hindi-to-English} transfer-based
                     {MT} system under a miserly data scenario",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "143--163",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Xu:2003:CLR,
  author          = "Jinxi Xu and Ralph Weischedel",
  title           = "Cross-lingual retrieval for {Hindi}",
  journal         = j-TALIP,
  volume          = "2",
  number          = "2",
  pages           = "164--168",
  month           = jun,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:35 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{May:2003:SWC,
  author          = "Jonathan May and Ada Brunstein and Prem Natarajan and
                     Ralph Weischedel",
  title           = "Surprise! {What}'s in a {Cebuano} or {Hindi Name?}",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "169--180",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Sekine:2003:HEC,
  author          = "Satoshi Sekine and Ralph Grishman",
  title           = "{Hindi-English} cross-lingual question-answering
                     system",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "181--192",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Ma:2003:AHO,
  author          = "Huanfeng Ma and David Doermann",
  title           = "Adaptive {Hindi OCR} using generalized {Hausdorff}
                     image comparison",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "193--218",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{He:2003:MMI,
  author          = "Daqing He and Douglas W. Oard and Jianqiang Wang and
                     Jun Luo and Dina Demner-Fushman and Kareem Darwish
                     and Philip Resnik and Sanjeev Khudanpur and Michael
                     Nossal and Michael Subotin and Anton Leuski",
  title           = "Making {MIRACLEs}: {Interactive} translingual search
                     for {Cebuano} and {Hindi}",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "219--244",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Leuski:2003:CLC,
  author          = "Anton Leuski and Chin-Yew Lin and Liang Zhou and
                     Ulrich Germann and Franz Josef Och and Eduard Hovy",
  title           = "Cross-lingual {C*ST*RD}: {English} access to {Hindi}
                     information",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "245--269",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Dorr:2003:CLH,
  author          = "Bonnie Dorr and David Zajic and Richard Schwartz",
  title           = "Cross-language headline generation for {Hindi}",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "270--289",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Li:2003:RDH,
  author          = "Wei Li and Andrew McCallum",
  title           = "Rapid development of {Hindi} named entity recognition
                     using conditional random fields and feature
                     induction",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "290--294",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Maynard:2003:RCI,
  author          = "Diana Maynard and Valentin Tablan and Kalina
                     Bontcheva and Hamish Cunningham",
  title           = "Rapid customization of an information extraction
                     system for a surprise language",
  journal         = j-TALIP,
  volume          = "2",
  number          = "3",
  pages           = "295--300",
  month           = sep,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kang:2003:IPP,
  author          = "Mi-Young Kang and Aesun Yoon and Hyuk-Chul Kwon",
  title           = "Improving partial parsing based on error-pattern
                     analysis for a {Korean} grammar-checker",
  journal         = j-TALIP,
  volume          = "2",
  number          = "4",
  pages           = "301--323",
  month           = dec,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kim:2003:RRE,
  author          = "Harksoo Kim and Jungyun Seo",
  title           = "Resolution of referring expressions in a {Korean}
                     multimodal dialogue system",
  journal         = j-TALIP,
  volume          = "2",
  number          = "4",
  pages           = "324--337",
  month           = dec,
  year            = "2003",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Mani:2004:ISI,
  author          = "Inderjeet Mani and James Pustejovsky and Beth
                     Sundheim",
  title           = "Introduction to the special issue on temporal
                     information processing",
  journal         = j-TALIP,
  volume          = "3",
  number          = "1",
  pages           = "1--10",
  month           = mar,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Han:2004:FRT,
  author          = "Benjamin Han and Alon Lavie",
  title           = "A framework for resolution of time in natural
                     language",
  journal         = j-TALIP,
  volume          = "3",
  number          = "1",
  pages           = "11--32",
  month           = mar,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Schilder:2004:EMT,
  author          = "Frank Schilder",
  title           = "Extracting meaning from temporal nouns and temporal
                     prepositions",
  journal         = j-TALIP,
  volume          = "3",
  number          = "1",
  pages           = "33--50",
  month           = mar,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Jang:2004:ATT,
  author          = "Seok Bae Jang and Jennifer Baldwin and Inderjeet Mani",
  title           = "Automatic {TIMEX2} tagging of {Korean} news",
  journal         = j-TALIP,
  volume          = "3",
  number          = "1",
  pages           = "51--65",
  month           = mar,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Hobbs:2004:OTS,
  author          = "Jerry R. Hobbs and Feng Pan",
  title           = "An ontology of time for the {Semantic Web}",
  journal         = j-TALIP,
  volume          = "3",
  number          = "1",
  pages           = "66--85",
  month           = mar,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Nov 4 08:37:36 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Gao:2004:ISI,
  author          = "Jianfeng Gao and Chin-Yew Lin",
  title           = "Introduction to the special issue on statistical
                     language modeling",
  journal         = j-TALIP,
  volume          = "3",
  number          = "2",
  pages           = "87--93",
  month           = jun,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Mon Nov 22 06:20:04 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kim:2004:LTL,
  author          = "Woosung Kim and Sanjeev Khudanpur",
  title           = "Lexical triggers and latent semantic analysis for
                     cross-lingual language model adaptation",
  journal         = j-TALIP,
  volume          = "3",
  number          = "2",
  pages           = "94--112",
  month           = jun,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Mon Nov 22 06:20:04 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Linares:2004:HLM,
  author          = "Diego Linares and Jos{\'e}-Miguel Bened{\'\i} and
                     Joan-Andreu S{\'a}nchez",
  title           = "A hybrid language model based on a combination of
                     {$N$}-grams and stochastic context-free grammars",
  journal         = j-TALIP,
  volume          = "3",
  number          = "2",
  pages           = "113--127",
  month           = jun,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Mon Nov 22 06:20:04 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Chen:2004:DHG,
  author          = "Berlin Chen and Hsin-Min Wang and Lin-Shan Lee",
  title           = "A discriminative {HMM\slash N}-gram-based retrieval
                     approach for {Mandarin} spoken documents",
  journal         = j-TALIP,
  volume          = "3",
  number          = "2",
  pages           = "128--145",
  month           = jun,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Mon Nov 22 06:20:04 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Nguyen:2004:EBS,
  author          = "Minh Le Nguyen and Susumu Horiguchi and Akira Shimazu
                     and Bao Tu Ho",
  title           = "Example-based sentence reduction using the hidden
                     {Markov} model",
  journal         = j-TALIP,
  volume          = "3",
  number          = "2",
  pages           = "146--158",
  month           = jun,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Mon Nov 22 06:20:04 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Fung:2004:MEC,
  author          = "Pascale Fung and Grace Ngai and Yongsheng Yang and
                     Benfeng Chen",
  title           = "A maximum-entropy {Chinese} parser augmented by
                     transformation-based learning",
  journal         = j-TALIP,
  volume          = "3",
  number          = "2",
  pages           = "159--168",
  month           = jun,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Mon Nov 22 06:20:04 MST 2004",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Li:2004:AMF,
  author          = "Yujia Li and Tan Lee and Yao Qian",
  title           = "Analysis and modeling of {F0} contours for
                     {Cantonese} text-to-speech",
  journal         = j-TALIP,
  volume          = "3",
  number          = "3",
  pages           = "169--180",
  month           = sep,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Apr 14 12:20:22 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Huang:2004:UWB,
  author          = "Chien-Chung Huang and Shui-Lung Chuang and Lee-Feng
                     Chien",
  title           = "Using a {Web}-based categorization approach to
                     generate thematic metadata from texts",
  journal         = j-TALIP,
  volume          = "3",
  number          = "3",
  pages           = "190--212",
  month           = sep,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Apr 14 12:20:22 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Myaeng:2004:ISI,
  author          = "Sung Hyon Myaeng",
  title           = "Introduction to the special issue on computer
                     processing of oriental languages",
  journal         = j-TALIP,
  volume          = "3",
  number          = "4",
  pages           = "213--213",
  month           = dec,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Apr 14 12:20:22 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Baoli:2004:AKN,
  author          = "Li Baoli and Lu Qin and Yu Shiwen",
  title           = "An adaptive $k$-nearest neighbor text categorization
                     strategy",
  journal         = j-TALIP,
  volume          = "3",
  number          = "4",
  pages           = "215--226",
  month           = dec,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Apr 14 12:20:22 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kim:2004:UTI,
  author          = "Pyung Kim and Sung Hyon Myaeng",
  title           = "Usefulness of temporal information automatically
                     extracted from news articles for topic tracking",
  journal         = j-TALIP,
  volume          = "3",
  number          = "4",
  pages           = "227--242",
  month           = dec,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Apr 14 12:20:22 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2004:ESS,
  author          = "Le Zhang and Jingbo Zhu and Tianshun Yao",
  title           = "An evaluation of statistical spam filtering
                     techniques",
  journal         = j-TALIP,
  volume          = "3",
  number          = "4",
  pages           = "243--269",
  month           = dec,
  year            = "2004",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Apr 14 12:20:22 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Wu:2005:DSF,
  author          = "Chung-Hsien Wu and Jui-Feng Yeh and Ming-Jun Chen",
  title           = "Domain-specific {FAQ} retrieval using independent
                     aspects",
  journal         = j-TALIP,
  volume          = "4",
  number          = "1",
  pages           = "1--17",
  month           = mar,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jul 7 13:48:21 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Murata:2005:CEV,
  author          = "Masaki Murata and Masao Utiyama and Kiyotaka Uchimoto
                     and Hitoshi Isahara and Qing Ma",
  title           = "Correction of errors in a verb modality corpus for
                     machine translation with a machine-learning method",
  journal         = j-TALIP,
  volume          = "4",
  number          = "1",
  pages           = "18--37",
  month           = mar,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jul 7 13:48:21 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Hendessi:2005:SSP,
  author          = "F. Hendessi and A. Ghayoori and T. A. Gulliver",
  title           = "A speech synthesizer for {Persian} text using a
                     neural network with a smooth ergodic {HMM}",
  journal         = j-TALIP,
  volume          = "4",
  number          = "1",
  pages           = "38--52",
  month           = mar,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jul 7 13:48:21 MDT 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Zhang:2005:COT,
  author          = "Ying Zhang and Phil Vines and Justin Zobel",
  title           = "{Chinese} {OOV} translation and post-translation
                     query expansion in {Chinese--English} cross-lingual
                     information retrieval",
  journal         = j-TALIP,
  volume          = "4",
  number          = "2",
  pages           = "57--77",
  month           = jun,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sat Dec 17 08:07:33 MST 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Qu:2005:TES,
  author          = "Yan Qu and David A. Hull and Gregory Grefenstette and
                     David A. Evans and Motoko Ishikawa and Setsuko Nara
                     and Toshiya Ueda and Daisuke Noda and Kousaku Arita
                     and Yuki Funakoshi and Hiroshi Matsuda",
  title           = "Towards effective strategies for monolingual and
                     bilingual information retrieval: {Lessons} learned
                     from {NTCIR-4}",
  journal         = j-TALIP,
  volume          = "4",
  number          = "2",
  pages           = "78--110",
  month           = jun,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sat Dec 17 08:07:33 MST 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Sakai:2005:FPR,
  author          = "Tetsuya Sakai and Toshihiko Manabe and Makoto Koyama",
  title           = "Flexible pseudo-relevance feedback via selective
                     sampling",
  journal         = j-TALIP,
  volume          = "4",
  number          = "2",
  pages           = "111--135",
  month           = jun,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sat Dec 17 08:07:33 MST 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kwok:2005:RRP,
  author          = "Kui Lam Kwok and Sora Choi and Norbert Dinstl",
  title           = "Rich results from poor resources: {NTCIR-4}
                     monolingual and cross-lingual retrieval of {Korean}
                     texts using {Chinese} and {English}",
  journal         = j-TALIP,
  volume          = "4",
  number          = "2",
  pages           = "135--158",
  month           = jun,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sat Dec 17 08:07:33 MST 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Savoy:2005:CSM,
  author          = "Jacques Savoy",
  title           = "Comparative study of monolingual and multilingual
                     search models for use with {Asian} languages",
  journal         = j-TALIP,
  volume          = "4",
  number          = "2",
  pages           = "159--185",
  month           = jun,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sat Dec 17 08:07:33 MST 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Mase:2005:PTS,
  author          = "Hisao Mase and Tadataka Matsubayashi and Yuichi Ogawa
                     and Makoto Iwayama and Tadaaki Oshio",
  title           = "Proposal of two-stage patent retrieval method
                     considering the claim structure",
  journal         = j-TALIP,
  volume          = "4",
  number          = "2",
  pages           = "186--202",
  month           = jun,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Sat Dec 17 08:07:33 MST 2005",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Nakagawa:2005:PSI,
  author          = "Hiroshi Nakagawa and Tatsunori Mori and Noriko Kando",
  title           = "Preface to the special issues on {NTCIR-4}",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "237--242",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Kato:2005:ODQ,
  author          = "Tsuneaki Kato and Jun'ichi Fukumoto and Fumito Masui
                     and Noriko Kando",
  title           = "Are open-domain question answering technologies
                     useful for information access dialogues?---an
                     empirical study and a proposal of a novel challenge",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "243--262",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Isozaki:2005:AHP,
  author          = "Hideki Isozaki",
  title           = "An analysis of a high-performance {Japanese} question
                     answering system",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "263--279",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Mori:2005:JQA,
  author          = "Tatsunori Mori",
  title           = "{Japanese} question-answering system using {A*}
                     search and its improvement",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "280--304",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Mori:2005:MAF,
  author          = "Tatsunori Mori and Masanori Nozawa and Yoshiaki Asada",
  title           = "Multi-answer-focused multi-document summarization
                     using a question-answering engine",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "305--320",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Okazaki:2005:ICO,
  author          = "Naoaki Okazaki and Yutaka Matsuo and Mitsuru Ishizuka",
  title           = "Improving chronological ordering of sentences
                     extracted from multiple newspaper articles",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "321--339",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Yoshioka:2005:CPB,
  author          = "Masaharu Yoshioka and Makoto Haraguchi",
  title           = "On a combination of probabilistic and {Boolean} {IR}
                     models for {WWW} document retrieval",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "340--356",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Lingpeng:2005:CIR,
  author          = "Yang Lingpeng and Ji Donghong and Tang Li and Niu
                     Zhengyu",
  title           = "{Chinese} information retrieval based on terms and
                     relevant terms",
  journal         = j-TALIP,
  volume          = "4",
  number          = "3",
  pages           = "357--374",
  month           = sep,
  year            = "2005",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Jan 26 08:28:41 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Sakai:2006:ISI,
  author          = "Tetsuya Sakai and Yuji Matsumoto",
  title           = "Introduction to the special issue: {Recent} advances
                     in information processing and access for {Japanese}",
  journal         = j-TALIP,
  volume          = "4",
  number          = "4",
  pages           = "375--376",
  month           = dec,
  year            = "2006",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Feb 16 10:54:02 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Doi:2006:EBM,
  author          = "Takao Doi and Hirofumi Yamamoto and Eiichiro Sumita",
  title           = "Example-based machine translation using efficient
                     sentence retrieval based on edit-distance",
  journal         = j-TALIP,
  volume          = "4",
  number          = "4",
  pages           = "377--399",
  month           = dec,
  year            = "2006",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Feb 16 10:54:02 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Tomiura:2006:ESS,
  author          = "Yoichi Tomiura and Shosaku Tanaka and Toru Hitaka",
  title           = "Estimating satisfactoriness of selectional
                     restriction from corpus without a thesaurus",
  journal         = j-TALIP,
  volume          = "4",
  number          = "4",
  pages           = "400--416",
  month           = dec,
  year            = "2006",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Feb 16 10:54:02 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Iida:2006:ARA,
  author          = "Ryu Iida and Kentaro Inui and Yuji Matsumoto",
  title           = "Anaphora resolution by antecedent identification
                     followed by anaphoricity determination",
  journal         = j-TALIP,
  volume          = "4",
  number          = "4",
  pages           = "417--434",
  month           = dec,
  year            = "2006",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Feb 16 10:54:02 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Inui:2006:ACK,
  author          = "Takashi Inui and Kentaro Inui and Yuji Matsumoto",
  title           = "Acquiring causal knowledge from text using the
                     connective marker {\em tame\/}",
  journal         = j-TALIP,
  volume          = "4",
  number          = "4",
  pages           = "435--474",
  month           = dec,
  year            = "2006",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         = "Thu Feb 16 10:54:02 MST 2006",
  bibsource       = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
}

@Article{Ma:2006:TSB,
  author          = "Qiang Ma and Katsumi Tanaka",
  title           = "Topic-structure-based complementary information
                     retrieval and its application",
  journal         = j-TALIP,
  volume          = "4",
  number          = "4",
  pages           = "475--503",
  month           = dec,
  year            = "2006",
  CODEN           = "????",
  ISSN            = "1530-0226",
  bibdate         =
"Thu Feb 16 10:54:02 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Park:2006:ATM, author = "Jong C. Park and Gary Geunbae Lee and Limsoon Wong", title = "{AUTHOR}: {Text} mining and management in biomedicine", journal = j-TALIP, volume = "5", number = "1", pages = "1--3", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Park:2006:MBB, author = "Kyung-Mi Park and Seon-Ho Kim and Hae-Chang Rim and Young-Sook Hwang", title = "{ME}-based biomedical named entity recognition using lexical knowledge", journal = j-TALIP, volume = "5", number = "1", pages = "4--21", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Nenadic:2006:MSR, author = "Goran Nenadi{\'c} and Sophia Ananiadou", title = "Mining semantically related terms from biomedical literature", journal = j-TALIP, volume = "5", number = "1", pages = "22--43", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kim:2006:ECI, author = "Jung-Jae Kim and Jong C. 
Park", title = "Extracting contrastive information from negation patterns in biomedical literature", journal = j-TALIP, volume = "5", number = "1", pages = "44--60", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kim:2006:TPL, author = "Eunju Kim and Yu Song and Cheongjae Lee and Kyoungduk Kim and Gary Geunbae Lee and Byoung-Kee Yi and Jeongwon Cha", title = "Two-phase learning for biological event extraction and verification", journal = j-TALIP, volume = "5", number = "1", pages = "61--73", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Mima:2006:TBK, author = "Hideki Mima and Sophia Ananiadou and Katsumori Matsushima", title = "Terminology-based knowledge mining for new knowledge discovery", journal = j-TALIP, volume = "5", number = "1", pages = "74--88", month = mar, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Thu May 11 11:29:25 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Carpuat:2006:AWS, author = "Marine Carpuat and Pascale Fung and Grace Ngai", title = "Aligning word senses using bilingual corpora", journal = j-TALIP, volume = "5", number = "2", pages = "89--120", month = jun, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1165255.1165256", ISSN = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "The growing importance of multilingual information retrieval and machine translation has made multilingual ontologies extremely valuable resources. 
Since the construction of an ontology from scratch is a very expensive and time-consuming undertaking, it is attractive to consider ways of automatically aligning monolingual ontologies, which already exist for many of the world's major languages. Previous research exploited similarity in the structure of the ontologies to align, or manually created bilingual resources. These approaches cannot be used to align ontologies with vastly different structures and can only be applied to much studied language pairs for which expensive resources are already available. In this paper, we propose a novel approach to align the ontologies at the node level: Given a concept represented by a particular word sense in one ontology, our task is to find the best corresponding word sense in the second language ontology. To this end, we present a language-independent, corpus-based method that borrows from techniques used in information retrieval and machine translation. We show its efficiency by applying it to two very different ontologies in very different languages: the Mandarin Chinese HowNet and the American English WordNet. Moreover, we propose a methodology to measure bilingual corpora comparability and show that our method is robust enough to use noisy nonparallel bilingual corpora efficiently, when clean parallel corpora are not available.", acknowledgement = ack-nhfb, } @Article{Lee:2006:ABN, author = "Chun-Jen Lee and Jason S. Chang and Jyh-Shing R. Jang", title = "Alignment of bilingual named entities in parallel corpora using statistical models and multiple knowledge sources", journal = j-TALIP, volume = "5", number = "2", pages = "121--145", month = jun, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1165255.1165257", ISSN = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "Named entity (NE) extraction is one of the fundamental tasks in natural language processing (NLP). 
Although many studies have focused on identifying NEs within monolingual documents, aligning NEs in bilingual documents has not been investigated extensively due to the complexity of the task. In this article we introduce a new approach to aligning bilingual NEs in parallel corpora by incorporating statistical models with multiple knowledge sources. In our approach, we model the process of translating an English NE phrase into a Chinese equivalent using lexical translation\slash transliteration probabilities for word translation and alignment probabilities for word reordering. The method involves automatically learning phrase alignment and acquiring word translations from a bilingual phrase dictionary and parallel corpora, and automatically discovering transliteration transformations from a training set of name-transliteration pairs. The method also involves language-specific knowledge functions, including handling abbreviations, recognizing Chinese personal names, and expanding acronyms. At runtime, the proposed models are applied to each source NE in a pair of bilingual sentences to generate and evaluate the target NE candidates; the source and target NEs are then aligned based on the computed probabilities. 
Experimental results demonstrate that the proposed approach, which integrates statistical models with extra knowledge sources, is highly feasible and offers significant improvement in performance compared to our previous work, as well as the traditional approach of IBM Model 4.", acknowledgement = ack-nhfb, } @Article{Shirado:2006:UJH, author = "Tamotsu Shirado and Satoko Marumoto and Masaki Murata and Hitoshi Isahara", title = "Using {Japanese} honorific expressions: {A} psychological study", journal = j-TALIP, volume = "5", number = "2", pages = "146--164", month = jun, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1165255.1165258", ISSN = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "We investigated, via experiment, knowledge of normative honorific expressions as used in textbooks and in practice by people. Forty subjects divided into four groups according to age (younger\slash older) and gender (male\slash female) participated in the experiments. The results show that knowledge about the use of normative honorific expressions in textbooks is similar to that demonstrated by the younger subject groups, but differed from that of the older subject groups. The knowledge of the older subjects was more complex than that shown in textbooks or demonstrated by the younger subjects. A model that can identify misuse of honorific expressions in sentences is the framework for this investigation. The model is minimal, but could represent 76\% to 92\% of the subjects' knowledge regarding each honorific element. 
This model will be useful in the development of computer-aided systems to help teach how honorific expressions should be used.", acknowledgement = ack-nhfb, } @Article{Wu:2006:ERT, author = "Chung-Hsien Wu and Ze-Jing Chuang and Yu-Chung Lin", title = "Emotion recognition from text using semantic labels and separable mixture models", journal = j-TALIP, volume = "5", number = "2", pages = "165--183", month = jun, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1165255.1165259", ISSN = "1530-0226", bibdate = "Thu Oct 5 07:00:29 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "This study presents a novel approach to automatic emotion recognition from text. First, emotion generation rules (EGRs) are manually deduced from psychology to represent the conditions for generating emotion. Based on the EGRs, the emotional state of each sentence can be represented as a sequence of semantic labels (SLs) and attributes (ATTs); SLs are defined as the domain-independent features, while ATTs are domain-dependent. The emotion association rules (EARs) represented by SLs and ATTs for each emotion are automatically derived from the sentences in an emotional text corpus using the a priori algorithm. Finally, a separable mixture model (SMM) is adopted to estimate the similarity between an input sentence and the EARs of each emotional state. Since some features defined in this approach are domain-dependent, a dialog system focusing on the students' daily expressions is constructed, and only three emotional states, happy, unhappy, and neutral, are considered for performance evaluation. 
According to the results of the experiments, given the domain corpus, the proposed approach is promising, and easily ported into other domains.", acknowledgement = ack-nhfb, } @Article{Dale:2006:ISS, author = "Robert Dale", title = "Introduction to the {Special} section: {Extended} best papers from {IJCNLP 2005}", journal = j-TALIP, volume = "5", number = "3", pages = "183--184", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Oh:2006:MTM, author = "Jong-Hoon Oh and Key-Sun Choi and Hitoshi Isahara", title = "A machine transliteration model based on correspondence between graphemes and phonemes", journal = j-TALIP, volume = "5", number = "3", pages = "185--208", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gao:2006:ESL, author = "Jianfeng Gao and Hisami Suzuki and Wei Yuan", title = "An empirical study on language model adaptation", journal = j-TALIP, volume = "5", number = "3", pages = "209--227", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ye:2006:SRL, author = "Patrick Ye and Timothy Baldwin", title = "Semantic role labeling of prepositional phrases", journal = j-TALIP, volume = "5", number = "3", pages = "228--244", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Chung:2006:APD, author = "Tze Leung Chung and Robert Wing Pong Luk and Kam Fai Wong and Kui Lam Kwok and Dik Lun Lee", title = "Adapting pivoted document-length normalization for query size: {Experiments} in {Chinese} and {English}", journal = j-TALIP, volume = "5", 
number = "3", pages = "245--263", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Matsumura:2006:ERB, author = "Atsushi Matsumura and Atsuhiro Takasu and Jun Adachi", title = "Effect of relationships between words on {Japanese} information retrieval", journal = j-TALIP, volume = "5", number = "3", pages = "264--289", month = sep, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:36 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Song:2006:ISI, author = "Dawei Song and Jian-Yun Nie", title = "Introduction to special issue on reasoning in natural language information processing", journal = j-TALIP, volume = "5", number = "4", pages = "291--295", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Nie:2006:ILM, author = "Jian-Yun Nie and Guihong Cao and Jing Bai", title = "Inferential language models for information retrieval", journal = j-TALIP, volume = "5", number = "4", pages = "296--322", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gao:2006:SQT, author = "Jianfeng Gao and Jian-Yun Nie and Ming Zhou", title = "Statistical query translation models for cross-language information retrieval", journal = j-TALIP, volume = "5", number = "4", pages = "323--359", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Liu:2006:SFQ, author = "Yi Liu and Rong Jin and Joyce Y. 
Chai", title = "A statistical framework for query translation disambiguation", journal = j-TALIP, volume = "5", number = "4", pages = "360--387", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Li:2006:TTT, author = "Baoli Li and Wenjie Li and Qin Lu", title = "Topic tracking with time granularity reasoning", journal = j-TALIP, volume = "5", number = "4", pages = "388--412", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Phan:2006:IDS, author = "Xuan-Hieu Phan and Le-Minh Nguyen and Yasushi Inoguchi and Tu-Bao Ho and Susumu Horiguchi", title = "Improving discriminative sequential learning by discovering important association of statistics", journal = j-TALIP, volume = "5", number = "4", pages = "413--438", month = dec, year = "2006", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Chen:2007:UDM, author = "Yong Chen and Kwok-Ping Chan", title = "Using data mining techniques and rough set theory for language modeling", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "2", } @Article{Hsu:2007:MSB, author = "Chung-Chian Hsu and Chien-Hsing Chen and Tien-Teng Shih and Chun-Kai Chen", title = "Measuring similarity between transliterations against noise data", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno =
"5", } @Article{Sakai:2007:RFQ, author = "Tetsuya Sakai", title = "On the reliability of factoid question answering evaluation", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "3", } @Article{Wiseman:2007:CBC, author = "Yair Wiseman and Irit Gefner", title = "Conjugation-based compression for {Hebrew} texts", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "4", } @Article{Wu:2007:TBS, author = "Chung-Hsien Wu and Hung-Yu Su and Yu-Hsien Chiu and Chia-Hung Lin", title = "Transfer-based statistical translation of {Taiwanese} sign language using {PCFG}", journal = j-TALIP, volume = "6", number = "1", pages = "??--??", month = apr, year = "2007", CODEN = "????", ISSN = "1530-0226", bibdate = "Sat Apr 14 10:21:37 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "1", } @Article{Kuo:2007:PSM, author = "Jin-Shea Kuo and Haizhou Li and Ying-Kuei Yang", title = "A phonetic similarity model for automatic extraction of transliteration pairs", journal = j-TALIP, volume = "6", number = "2", pages = "6:1--6:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1282080.1282081", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:28 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This article proposes an approach for the automatic extraction of transliteration pairs from Chinese Web corpora. In this approach, we formulate the machine transliteration process using a syllable-based phonetic similarity model which consists of phonetic confusion matrices and a Chinese character $n$-gram language model.
With the phonetic similarity model, the extraction of transliteration pairs becomes a two-step process of recognition followed by validation: First, in the recognition process, we identify the most probable transliteration in the $k$-neighborhood of a recognized English word. Then, in the validation process, we qualify the transliteration pair candidates with a hypothesis test. We carry out an analytical study on the statistics of several key factors in English-Chinese transliteration to help formulate phonetic similarity modeling. We then conduct both supervised and unsupervised learning of a phonetic similarity model on a development database. The experimental results validate the effectiveness of the phonetic similarity model by achieving an $F$-measure of 0.739 in supervised learning. The unsupervised learning approach works almost as well as the supervised one, thus allowing us to deploy automatic extraction of transliteration pairs in the Web space.", acknowledgement = ack-nhfb, articleno = "6", keywords = "extraction of transliteration pairs; machine translation; machine transliteration; phonetic confusion probability; phonetic similarity modeling", } @Article{Xiao:2007:SNM, author = "Jinghui Xiao and Xiaolong Wang and Bingquan Liu", title = "The study of a nonstationary maximum entropy {Markov} model and its application on the pos-tagging task", journal = j-TALIP, volume = "6", number = "2", pages = "7:1--7:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1282080.1282082", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:28 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Sequence labeling is a core task in natural language processing. The maximum entropy Markov model (MEMM) is a powerful tool in performing this task. This article enhances the traditional MEMM by exploiting the positional information of language elements.
The stationary hypothesis is relaxed in MEMM, and the nonstationary MEMM (NS-MEMM) is proposed. Several related issues are discussed in detail, including the representation of positional information, NS-MEMM implementation, smoothing techniques, and the space complexity issue. Furthermore, the asymmetric NS-MEMM presents a more flexible way to exploit positional information. In the experiments, NS-MEMM is evaluated on both the Chinese and the English pos-tagging tasks. According to the experimental results, NS-MEMM yields effective improvements over MEMM by exploiting positional information. The smoothing techniques in this article effectively solve the NS-MEMM data-sparseness problem; the asymmetric NS-MEMM is also an improvement by exploiting positional information in a more flexible way.", acknowledgement = ack-nhfb, articleno = "7", keywords = "data sparseness problem; Markov property; MEMM; pos-tagging; stationary hypothesis", } @Article{Zhuang:2007:IHD, author = "Yi Zhuang and Yueting Zhuang and Qing Li and Lei Chen", title = "Interactive high-dimensional index for large {Chinese} calligraphic character databases", journal = j-TALIP, volume = "6", number = "2", pages = "8:1--8:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1282080.1282083", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:28 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The large numbers of Chinese calligraphic scripts in existence are valuable part of the Chinese cultural heritage. However, due to the shape complexity of these characters, it is hard to employ existing techniques to effectively retrieve and efficiently index them.
In this article, using a novel shape-similarity-based retrieval method in which shapes of calligraphic characters are represented by their contour points extracted from the character images, we propose an interactive partial-distance-map (PDM)-based high-dimensional indexing scheme which is designed specifically to speed up the retrieval performance of the large Chinese calligraphic character databases effectively. Specifically, we use the approximate minimal bounding sphere of a query character and utilize users' relevance feedback to refine the query gradually. Comprehensive experiments are conducted to testify the efficiency and effectiveness of this method. In addition, a new $k$-NN search called Pseudo $k$-NN (P $k$-NN) search is presented to better facilitate the PDM-based character retrieval.", acknowledgement = ack-nhfb, articleno = "8", keywords = "Chinese calligraphic character; hyper-centre relocation; Pseudo k-NN", } @Article{Saraswathi:2007:CPE, author = "S. Saraswathi and T. V. Geetha", title = "Comparison of performance of enhanced morpheme-based language model with different word-based language models for improving the performance of {Tamil} speech recognition system", journal = j-TALIP, volume = "6", number = "3", pages = "9:1--9:??", month = nov, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1290002.1290003", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:45 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This paper describes a new technique of language modeling for a highly inflectional Dravidian language, Tamil. It aims to alleviate the main problems encountered in processing of Tamil language, like enormous vocabulary growth caused by the large number of different forms derived from one word. The size of the vocabulary was reduced by, decomposing the words into stems and endings and storing these sub word units (morphemes) in the vocabulary separately.
A enhanced morpheme-based language model was designed for the inflectional language Tamil. The enhanced morpheme-based language model was trained on the decomposed corpus. The perplexity and Word Error Rate (WER) were obtained to check the efficiency of the model for Tamil speech recognition system. The results were compared with word-based bigram and trigram language models, distance based language model, dependency based language model and class based language model. From the results it was analyzed that the enhanced morpheme-based trigram model with Katz back-off smoothing effect improved the performance of the Tamil speech recognition system when compared to the word-based language models.", acknowledgement = ack-nhfb, articleno = "9", keywords = "language model; morphemes; perplexity; word error rate and speech recognition", } @Article{Hussain:2007:DLS, author = "Sarmad Hussain and Sana Gul and Afifah Waseem", title = "Developing lexicographic sorting: {An} example for {Urdu}", journal = j-TALIP, volume = "6", number = "3", pages = "10:1--10:??", month = nov, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1290002.1290004", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:45 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Collation or lexicographic sorting is essential to develop multilingual computing. This paper presents the challenges faced in developing collation sequence for a language. The paper discusses both theoretical linguistic and practical standardization and encoding related considerations that need to be addressed for languages for which relevant standards and/or solutions have not been defined. The paper also defines the process, by giving the details of the procedure followed for Urdu language, which is the national language of Pakistan and is spoken by more than 100 million people across the world. 
The paper is oriented towards organizations involved in developing and using collation standards and the localization industry, and not focused on theoretical issues.", acknowledgement = ack-nhfb, articleno = "10", keywords = "text processing; Urdu", } @Article{Fukumoto:2007:TTB, author = "Fumiyo Fukumoto and Yoshimi Suzuki", title = "Topic tracking based on bilingual comparable corpora and semisupervised clustering", journal = j-TALIP, volume = "6", number = "3", pages = "11:1--11:??", month = nov, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1290002.1290005", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:45 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In this paper, we address the problem of skewed data in topic tracking: the small number of stories labeled positive as compared to negative stories and propose a method for estimating effective training stories for the topic-tracking task. For a small number of labeled positive stories, we use bilingual comparable, i.e., English, and Japanese corpora, together with the EDR bilingual dictionary, and extract story pairs consisting of positive and associated stories. To overcome the problem of a large number of labeled negative stories, we classified them into clusters. This is done using a semisupervised clustering algorithm, combining $k$ means with EM. 
The method was tested on the TDT English corpus and the results showed that the system works well when the topic under tracking is talking about an event originating in the source language country, even for a small number of initial positive training stories.", acknowledgement = ack-nhfb, articleno = "11", keywords = "bilingual comparable corpora; clustering; EM algorithm; N-gram model; topic detection and tracking", } @Article{Iida:2007:ZAR, author = "Ryu Iida and Kentaro Inui and Yuji Matsumoto", title = "Zero-anaphora resolution by learning rich syntactic pattern features", journal = j-TALIP, volume = "6", number = "4", pages = "1:1--1:22", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1316457.1316458", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We approach the zero-anaphora resolution problem by decomposing it into intrasentential and intersentential zero-anaphora resolution tasks. For the former task, syntactic patterns of zeropronouns and their antecedents are useful clues. Taking Japanese as a target language, we empirically demonstrate that incorporating rich syntactic pattern features in a state-of-the-art learning-based anaphora resolution model dramatically improves the accuracy of intrasentential zero-anaphora, which consequently improves the overall performance of zero-anaphora resolution.", acknowledgement = ack-nhfb, } @Article{Adriani:2007:SIC, author = "Mirna Adriani and Jelita Asian and Bobby Nazief and S. M. M. Tahaghoghi and Hugh E. 
Williams", title = "Stemming {Indonesian}: {A} confix-stripping approach", journal = j-TALIP, volume = "6", number = "4", pages = "2:1--2:33", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1316457.1316459", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Stemming words to (usually) remove suffixes has applications in text search, machine translation, document summarization, and text classification. For example, English stemming reduces the words 'computer,' 'computing,' 'computation,' and 'computability' to their common morphological root, 'comput-.' In text search, this permits a search for 'computers' to find documents containing all words with the stem 'comput-.' In the Indonesian language, stemming is of crucial importance: words have prefixes, suffixes, infixes, and confixes that make matching related words difficult.\par This work surveys existing techniques for stemming Indonesian words to their morphological roots, presents our novel and highly accurate CS algorithm, and explores the effectiveness of stemming in the context of general-purpose text information retrieval through ad hoc queries.", acknowledgement = ack-nhfb, keywords = "Indonesian; information retrieval; stemming", } @Article{Thao:2007:NER, author = "Pham Thi Xuan Thao and Tran Quoc Tri and Dinh Dien and Nigel Collier", title = "Named entity recognition in {Vietnamese} using classifier voting", journal = j-TALIP, volume = "6", number = "4", pages = "3:1--3:18", month = dec, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1316457.1316460", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:11:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Named entity recognition (NER) is one of the fundamental tasks in natural-language processing (NLP).
Though the combination of different classifiers has been widely applied in several well-studied languages, this is the first time this method has been applied to Vietnamese. In this article, we describe how voting techniques can improve the performance of Vietnamese NER. By combining several state-of-the-art machine-learning algorithms using voting strategies, our final result outperforms individual algorithms and gained an $F$-measure of 89.12. A detailed discussion about the challenges of NER in Vietnamese is also presented.", acknowledgement = ack-nhfb, keywords = "C4.5; Conditional Random Fields; Na{\"\i}ve Bayes; named entity recognition; support vector machines; transformation based learning; Vietnamese; voting", } @Article{Chen:2008:SBM, author = "Yufeng Chen and Chengqing Zong", title = "A Structure-Based Model for {Chinese} Organization Name Translation", journal = j-TALIP, volume = "7", number = "1", pages = "1:1--1:??", month = feb, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1330291.1330292", ISSN = "1530-0226", bibdate = "Mon Jun 16 17:12:10 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Named entity (NE) translation is a fundamental task in multilingual natural language processing. The performance of a machine translation system depends heavily on precise translation of the inclusive NEs. Furthermore, organization name (ON) is the most complex NE for translation among all the NEs. In this article, the structure formulation of ONs is investigated and a hierarchical structure-based ON translation model for Chinese-to-English translation system is presented.\par First, the model performs ON chunking; then both the translation of words within chunks and the process of chunk-reordering are achieved by synchronous context-free grammar (CFG).
The CFG rules are extracted from bilingual ON pairs in a training program.\par The main contributions of this article are: (1) defining appropriate chunk-units for analyzing the internal structure of Chinese ONs; (2) making the chunk-based ON translation feasible and flexible via a hierarchical CFG derivation; and (3) proposing a training architecture to automatically learn the synchronous CFG for constructing ONs with chunk-units from aligned bilingual ON pairs. The experiments show that the proposed approach translates the Chinese ONs into English with an accuracy of 93.75\% and significantly improves the performance of a baseline statistical machine translation (SMT) system.",
  acknowledgement = ack-nhfb,
  articleno = "1",
  keywords = "alignment; chunk; hierarchical derivation; machine translation; named entity; organization name; rules extraction; structural analysis; synchronous context-free grammar",
}

@Article{Jeong:2008:ISR,
  author = "Minwoo Jeong and Gary Geunbae Lee",
  title = "Improving Speech Recognition and Understanding using Error-Corrective Reranking",
  journal = j-TALIP,
  volume = "7",
  number = "1",
  pages = "2:1--2:??",
  month = feb,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1330291.1330293",
  ISSN = "1530-0226",
  bibdate = "Mon Jun 16 17:12:10 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "The main issues of practical spoken-language applications for human-computer interface are how to overcome speech recognition errors and guarantee the reasonable end-performance of spoken-language applications. Therefore, handling the erroneously recognized outputs is a key in developing robust spoken-language systems. To address this problem, we present a method to improve the accuracy of speech recognition and performance of spoken-language applications.
The proposed error corrective reranking approach exploits recognition environment characteristics and domain-specific semantic information to provide robustness and adaptability for a spoken-language system. We demonstrate some experiments of spoken dialogue tasks and empirical results that show an improvement in accuracy for both speech recognition and spoken-language understanding. In our experiment, we show an error reduction of up to 9.7\% and 16.8\% of word error rate, and 5.5\% and 7.9\% of understanding error for the air travel and telebanking service domains.",
  acknowledgement = ack-nhfb,
  articleno = "2",
  keywords = "automatic speech recognition; error-corrective reranking; improving spoken dialogue system; spoken-language understanding",
}

@Article{Kuo:2008:MSG,
  author = "June-Jei Kuo and Hsin-Hsi Chen",
  title = "Multidocument Summary Generation: Using Informative and Event Words",
  journal = j-TALIP,
  volume = "7",
  number = "1",
  pages = "3:1--3:??",
  month = feb,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1330291.1330294",
  ISSN = "1530-0226",
  bibdate = "Mon Jun 16 17:12:10 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "Summary generation for multiple documents poses a number of issues including sentence selection, sentence ordering, and sentence reduction over single-document summarization. In addition, the temporal resolution among extracted sentences is also important. This article considers informative words and event words to deal with multidocument summarization. These words indicate the important concepts and relationships in a document or among a set of documents, and can be used to select salient sentences. We present a temporal resolution algorithm, using focusing time and coreference chains, to convert Chinese temporal expressions in a document into calendrical forms. Moreover, we consider the last calendrical form of a sentence as a sentence time stamp to address sentence ordering.
Informative words, event words, and temporal words are introduced to a sentence reduction algorithm, which deals with both length constraints and information coverage. Experiments on Chinese-news data sets show significant improvements of both information coverage and readability.",
  acknowledgement = ack-nhfb,
  articleno = "3",
  keywords = "latent semantic analysis; multidocument summary generation; sentence ordering; sentence reduction; sentence selection; temporal processing",
}

@Article{Kando:2008:INS,
  author = "Noriko Kando and Teruko Mitamura and Tetsuya Sakai",
  title = "Introduction to the {NTCIR-6} Special Issue",
  journal = j-TALIP,
  volume = "7",
  number = "2",
  pages = "4:1--4:??",
  month = jun,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1362782.1362783",
  ISSN = "1530-0226",
  bibdate = "Mon Jun 16 17:12:23 MDT 2008",
  bibsource = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno = "4",
}

@Article{Zhou:2008:HTE,
  author = "Dong Zhou and Mark Truran and Tim Brailsford and Helen Ashman",
  title = "A Hybrid Technique for {English--Chinese} Cross Language Information Retrieval",
  journal = j-TALIP,
  volume = "7",
  number = "2",
  pages = "5:1--5:??",
  month = jun,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1362782.1362784",
  ISSN = "1530-0226",
  bibdate = "Mon Jun 16 17:12:23 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "In this article we describe a hybrid technique for dictionary-based query translation suitable for English-Chinese cross language information retrieval. This technique marries a graph-based model for the resolution of candidate term ambiguity with a pattern-based method for the translation of out-of-vocabulary (OOV) terms. We evaluate the performance of this hybrid technique in an experiment using several NTCIR test collections.
Experimental results indicate a substantial increase in retrieval effectiveness over various baseline systems incorporating machine- and dictionary-based translation.",
  acknowledgement = ack-nhfb,
  articleno = "5",
  keywords = "cross language information retrieval; disambiguation; graph-based analysis; patterns; unknown term translation",
}

@Article{Higashinaka:2008:AAC,
  author = "Ryuichiro Higashinaka and Hideki Isozaki",
  title = "Automatically Acquiring Causal Expression Patterns from Relation-annotated Corpora to Improve Question Answering for why-Questions",
  journal = j-TALIP,
  volume = "7",
  number = "2",
  pages = "6:1--6:??",
  month = jun,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1362782.1362785",
  ISSN = "1530-0226",
  bibdate = "Mon Jun 16 17:12:23 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "This article describes our approach for answering why-questions that we initially introduced at NTCIR-6 QAC-4. The approach automatically acquires causal expression patterns from relation-annotated corpora by abstracting text spans annotated with a causal relation and by mining syntactic patterns that are useful for distinguishing sentences annotated with a causal relation from those annotated with other relations. We use these automatically acquired causal expression patterns to create features to represent answer candidates, and use these features together with other possible features related to causality to train an answer candidate ranker that maximizes the QA performance with regards to the corpus of why-questions and answers. NAZEQA, a Japanese why-QA system based on our approach, clearly outperforms baselines with a Mean Reciprocal Rank (top-5) of 0.223 when sentences are used as answers and with a MRR (top-5) of 0.326 when paragraphs are used as answers, making it presumably the best-performing fully implemented why-QA system.
Experimental results also verified the usefulness of the automatically acquired causal expression patterns.",
  acknowledgement = ack-nhfb,
  articleno = "6",
  keywords = "causal expression; pattern mining; question answering; relation-annotated corpus",
}

@Article{Li:2008:ASV,
  author = "Yaoyong Li and Kalina Bontcheva",
  title = "Adapting Support Vector Machines for {$F$}-term-based Classification of Patents",
  journal = j-TALIP,
  volume = "7",
  number = "2",
  pages = "7:1--7:??",
  month = jun,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1362782.1362786",
  ISSN = "1530-0226",
  bibdate = "Mon Jun 16 17:12:23 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "Support Vector Machines (SVM) have obtained state-of-the-art results on many applications including document classification. However, previous works on applying SVMs to the $F$-term patent classification task did not obtain as good results as other learning algorithms such as k-NN. This is due to the fact that $F$-term patent classification is different from conventional document classification in several aspects, mainly because it is a multiclass, multilabel classification problem with semi-structured documents and multi-faceted hierarchical categories.\par This article describes our SVM-based system and several techniques we developed successfully to adapt SVM for the specific features of the $F$-term patent classification task. We evaluate the techniques using the NTCIR-6 $F$-term classification terms assigned to Japanese patents. Moreover, our system participated in the NTCIR-6 patent classification evaluation and obtained the best results according to two of the three metrics used for task performance evaluation. Following the NTCIR-6 participation, we developed two new techniques, which achieved even better scores using all three NTCIR-6 metrics, effectively outperforming all participating systems.
This article presents this new work and the experimental results that demonstrate the benefits of the latest approach.",
  acknowledgement = ack-nhfb,
  articleno = "7",
  keywords = "F-term classification; patent processing; support vector machines",
}

@Article{Fukumoto:2008:ICL,
  author = "Fumiyo Fukumoto and Yoshimi Suzuki",
  title = "Integrating Cross-Language Hierarchies and Its Application to Retrieving Relevant Documents",
  journal = j-TALIP,
  volume = "7",
  number = "3",
  pages = "8:1--8:??",
  month = aug,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1386869.1386870",
  ISSN = "1530-0226",
  bibdate = "Fri Aug 22 13:11:51 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "Internet directories such as Yahoo! are an approach to improve the efficacy and efficiency of Information Retrieval (IR) on the Web, as pages (documents) are organized into hierarchical categories, and similar pages are grouped together. Most of the search engines on the Web service find documents that are assigned to a single classification hierarchy. Categories in the hierarchy are carefully defined by human experts and documents are well organized. However, a single hierarchy in one language is often insufficient to find all relevant material, as each hierarchy tends to have some bias in both defining hierarchical structure and classifying documents. Moreover, documents written in a language other than the user's native language often include large amounts of information related to the user's request. In this article, we propose a method of integrating cross-language (CL) category hierarchies, that is, Reuters '96 hierarchy and UDC code hierarchy of Japanese by estimating category similarities. The method does not simply merge two different hierarchies into one large hierarchy but instead extracts sets of similar categories, where each element of the sets is relevant with each other. It consists of three steps.
First, we classify documents from one hierarchy into categories with another hierarchy using a cross-language text classification (CLTC) technique, and extract category pairs of two hierarchies. Next, we apply $\chi^2$ statistics to these pairs to obtain similar category pairs, and finally we apply the generating function of the Apriori algorithm (Apriori-Gen) to the category pairs, and find sets of similar categories. Moreover, we examined whether integrating hierarchies helps to support retrieval of documents with similar contents. The retrieval results showed a 42.7\% improvement over the baseline nonhierarchy model, and a 21.6\% improvement over a single hierarchy.",
  acknowledgement = ack-nhfb,
  articleno = "8",
  keywords = "cross-language hierarchies; information integration; retrieval of relevant documents; text classification",
}

@Article{Sharma:2008:AMI,
  author = "Utpal Sharma and Jugal K. Kalita and Rajib K. Das",
  title = "Acquisition of Morphology of an {Indic} Language from Text Corpus",
  journal = j-TALIP,
  volume = "7",
  number = "3",
  pages = "9:1--9:??",
  month = aug,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1386869.1386871",
  ISSN = "1530-0226",
  bibdate = "Fri Aug 22 13:11:51 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "This article describes an approach to unsupervised learning of morphology from an unannotated corpus for a highly inflectional Indo-European language called Assamese spoken by about 30 million people. Although Assamese is one of India's national languages, it utterly lacks computational linguistic resources. There exists no prior computational work on this language spoken widely in northeast India. The work presented is pioneering in this respect.
In this article, we discuss salient issues in Assamese morphology where the presence of a large number of suffixal determiners, sandhi, samas, and the propensity to use suffix sequences make approximately 50\% of the words used in written and spoken text inflected. We implement methods proposed by Gaussier and Goldsmith on acquisition of morphological knowledge, and obtain F-measure performance below 60\%. This motivates us to present a method more suitable for handling suffix sequences, enabling us to increase the F-measure performance of morphology acquisition to almost 70\%. We describe how we build a morphological dictionary for Assamese from the text corpus. Using the morphological knowledge acquired and the morphological dictionary, we are able to process small chunks of data at a time as well as a large corpus. We achieve approximately 85\% precision and recall during the analysis of small chunks of coherent text.",
  acknowledgement = ack-nhfb,
  articleno = "9",
  keywords = "Assamese; Indo-European languages; machine learning; morphology",
}

@Article{Chen:2008:TTR,
  author = "Jiang-Chun Chen and Jyh-Shing Roger Jang",
  title = "{TRUES}: {Tone Recognition Using Extended Segments}",
  journal = j-TALIP,
  volume = "7",
  number = "3",
  pages = "10:1--10:??",
  month = aug,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1386869.1386872",
  ISSN = "1530-0226",
  bibdate = "Fri Aug 22 13:11:51 MDT 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "Tone recognition has been a basic but important task for speech recognition and assessment of tonal languages, such as Mandarin Chinese. Most previously proposed approaches adopt a two-step approach where syllables within an utterance are identified via forced alignment first, and tone recognition using a variety of classifiers---such as neural networks, Gaussian mixture models (GMM), hidden Markov models (HMM), support vector machines (SVM)---is then performed on each segmented syllable to predict its tone.
However, forced alignment does not always generate accurate syllable boundaries, leading to unstable voiced-unvoiced detection and deteriorating performance in tone recognition. Aiming to alleviate this problem, we propose a robust approach called Tone Recognition Using Extended Segments (TRUES) for HMM-based continuous tone recognition. The proposed approach extracts an unbroken pitch contour from a given utterance based on dynamic programming over time-domain acoustic features of average magnitude difference function (AMDF). The pitch contour of each syllable is then extended for tri-tone HMM modeling, such that the influence from inaccurate syllable boundaries is lessened. Our experimental results demonstrate that the proposed TRUES achieves 49.13\% relative error rate reduction over that of the recently proposed supratone modeling, which is deemed the state of the art of tone recognition that outperforms several previously proposed approaches. The encouraging improvement demonstrates the effectiveness and robustness of the proposed TRUES, as well as the corresponding pitch determination algorithm which produces unbroken pitch contours.",
  acknowledgement = ack-nhfb,
  articleno = "10",
  keywords = "context-dependent tone modeling; continuous tone recognition; extended segment for tone recognition; HMM; Mandarin Chinese; supratone modeling",
}

@Article{Lin:2008:VCD,
  author = "Jeng-Wei Lin and Jan-Ming Ho and Li-Ming Tseng and Feipei Lai",
  title = "Variant {Chinese} Domain Name Resolution",
  journal = j-TALIP,
  volume = "7",
  number = "4",
  pages = "11:1--11:??",
  month = nov,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1450295.1450296",
  ISSN = "1530-0226",
  bibdate = "Mon Dec 8 13:56:10 MST 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "Many efforts in past years have been made to lower the linguistic barriers for non-native English speakers to access the Internet.
Internet standard RFC 3490, referred to as IDNA (Internationalizing Domain Names in Applications), focuses on access to IDNs (Internationalized Domain Names) in a range of scripts that is broader in scope than the original ASCII. However, the use of character variants that have similar appearances and/or interpretations could create confusion. A variant IDL (Internationalized Domain Label), derived from an IDL by replacing some characters with their variants, should match the original IDL; and thus a variant IDN does. In RFC 3743, referred to as JET (Joint Engineering Team) Guidelines, it is suggested that zone administrators model this concept of equivalence as an atomic IDL package. When an IDL is registered, an IDL package is created that contains its variant IDLs generated according to the zone-specific Language Variant Tables (LVTs). In addition to the registered IDL, the name holder can request the domain registry to activate some of the variant IDLs, free or by an extra fee. The activated variant IDLs are stored in the zone files, and thus become resolvable. However, an issue of scalability arises when there is a large number of variant IDLs to be activated.\par In this article, the authors present a resolution protocol that resolves the variant IDLs into the registered IDL, specifically for Han character variants. Two Han characters are said to be variants of each other if they have the same meaning and are pronounced the same. Furthermore, Han character variants usually have similar appearances. It is not uncommon that a Chinese IDL has a large number of variant IDLs. The proposed protocol introduces a new RR (resource record) type, denoted as VarIdx RR, to associate a variant expression of the variant IDLs with the registered IDL. The label of the VarIdx RR, denoted as the variant index, is assigned by an indexing function that is designed to give the same value to all of the variant IDLs enumerated by the variant expression.
When one of the variant IDLs is accessed, Internet applications can compute the variant index, look up the VarIdx RRs, and resolve the variant IDL into the registered IDL.\par The authors examine two sets of Chinese IDLs registered in TWNIC and CNNIC, respectively. The results show that for a registered Chinese IDL, a very small number of VarIdx RRs, usually one or two, are sufficient to activate all of its variant IDLs. The authors also represent a Web redirection service that employs the proposed resolution protocol to redirect a URL addressed by a variant IDN to the URL addressed by the registered IDN. The experiment results show that the proposed protocol successfully resolves the variant IDNs into the registered IDNs.",
  acknowledgement = ack-nhfb,
  articleno = "11",
  keywords = "conversion between traditional Chinese and simplified Chinese; Han character folding; Han character variant; IDN spoof; internationalized domain name; localization",
}

@Article{Lee:2008:BCQ,
  author = "Cheng-Wei Lee and Min-Yuh Day and Cheng-Lung Sung and Yi-Hsun Lee and Tian-Jian Jiang and Chia-Wei Wu and Cheng-Wei Shih and Yu-Ren Chen and Wen-Lian Hsu",
  title = "Boosting {Chinese} Question Answering with Two Lightweight Methods: {ABSPs} and {SCO-QAT}",
  journal = j-TALIP,
  volume = "7",
  number = "4",
  pages = "12:1--12:??",
  month = nov,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1450295.1450297",
  ISSN = "1530-0226",
  bibdate = "Mon Dec 8 13:56:10 MST 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "Question Answering (QA) research has been conducted in many languages. Nearly all the top performing systems use heavy methods that require sophisticated techniques, such as parsers or logic provers. However, such techniques are usually unavailable or unaffordable for under-resourced languages or in resource-limited situations. In this article, we describe how a top-performing Chinese QA system can be designed by using lightweight methods effectively.
We propose two lightweight methods, namely the Sum of Co-occurrences of Question and Answer Terms (SCO-QAT) and Alignment-based Surface Patterns (ABSPs). SCO-QAT is a co-occurrence-based answer-ranking method that does not need extra knowledge, word-ignoring heuristic rules, or tools. It calculates co-occurrence scores based on the passage retrieval results. ABSPs are syntactic patterns trained from question-answer pairs with a multiple alignment algorithm. They are used to capture the relations between terms and then use the relations to filter answers. We attribute the success of the ABSPs and SCO-QAT methods to the effective use of local syntactic information and global co-occurrence information.\par By using SCO-QAT and ABSPs, we improved the RU-Accuracy of our testbed QA system, ASQA, from 0.445 to 0.535 on the NTCIR-5 dataset. It also achieved the top 0.5 RU-Accuracy on the NTCIR-6 dataset. The result shows that lightweight methods are not only cheaper to implement, but also have the potential to achieve state-of-the-art performances.",
  acknowledgement = ack-nhfb,
  articleno = "12",
  keywords = "answer filtering; answer ranking; Chinese question answering; co-occurrence; lightweight method; surface pattern",
}

@Article{Che:2008:UHC,
  author = "Wanxiang Che and Min Zhang and AiTi Aw and ChewLim Tan and Ting Liu and Sheng Li",
  title = "Using a Hybrid Convolution Tree Kernel for Semantic Role Labeling",
  journal = j-TALIP,
  volume = "7",
  number = "4",
  pages = "13:1--13:??",
  month = nov,
  year = "2008",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1450295.1450298",
  ISSN = "1530-0226",
  bibdate = "Mon Dec 8 13:56:10 MST 2008",
  bibsource = "http://portal.acm.org/",
  abstract = "As a kind of Shallow Semantic Parsing, Semantic Role Labeling (SRL) is gaining more attention as it benefits a wide range of natural language processing applications. Given a sentence, the task of SRL is to recognize semantic arguments (roles) for each predicate (target verb or noun).
Feature-based methods have achieved much success in SRL and are regarded as the state-of-the-art methods for SRL. However, these methods are less effective in modeling structured features. As an extension of feature-based methods, kernel-based methods are able to capture structured features more efficiently in a much higher dimension. Application of kernel methods to SRL has been achieved by selecting the tree portion of a predicate and one of its arguments as feature space, which is named as predicate-argument feature (PAF) kernel. The PAF kernel captures the syntactic tree structure features using convolution tree kernel, however, it does not distinguish between the path structure and the constituent structure. In this article, a hybrid convolution tree kernel is proposed to model different linguistic objects. The hybrid convolution tree kernel consists of two individual convolution tree kernels. They are a Path kernel, which captures predicate-argument link features, and a Constituent Structure kernel, which captures the syntactic structure features of arguments. Evaluations on the data sets of the CoNLL-2005 SRL shared task and the Chinese PropBank (CPB) show that our proposed hybrid convolution tree kernel statistically significantly outperforms the previous tree kernels. Moreover, in order to maximize the system performance, we present a composite kernel through combining our hybrid convolution tree kernel method with a feature-based method extended by the polynomial kernel.
The experimental results show that the composite kernel achieves better performance than each of the individual methods and outperforms the best reported system on the CoNLL-2005 corpus when only one syntactic parser is used and on the CPB corpus when automated syntactic parse results and correct syntactic parse results are used respectively.",
  acknowledgement = ack-nhfb,
  articleno = "13",
  keywords = "hybrid convolution tree kernel; semantic role labeling",
}

@Article{Wu:2009:ISI,
  author = "Chung-Hsien Wu and Haizhou Li",
  title = "Introduction to the Special Issue on Recent Advances in {Asian} Language Spoken Document Retrieval",
  journal = j-TALIP,
  volume = "8",
  number = "1",
  pages = "1:1--1:??",
  month = mar,
  year = "2009",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1482343.1482344",
  ISSN = "1530-0226",
  bibdate = "Mon Mar 23 16:32:22 MDT 2009",
  bibsource = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno = "1",
}

@Article{Chen:2009:WTM,
  author = "Berlin Chen",
  title = "Word Topic Models for Spoken Document Retrieval and Transcription",
  journal = j-TALIP,
  volume = "8",
  number = "1",
  pages = "2:1--2:??",
  month = mar,
  year = "2009",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1482343.1482345",
  ISSN = "1530-0226",
  bibdate = "Mon Mar 23 16:32:22 MDT 2009",
  bibsource = "http://portal.acm.org/",
  abstract = "Statistical language modeling (LM), which aims to capture the regularities in human natural language and quantify the acceptability of a given word sequence, has long been an interesting yet challenging research topic in the speech and language processing community. It also has been introduced to information retrieval (IR) problems, and provided an effective and theoretically attractive probabilistic framework for building IR systems.
In this article, we propose a word topic model (WTM) to explore the co-occurrence relationship between words, as well as the long-span latent topical information, for language modeling in spoken document retrieval and transcription. The document or the search history as a whole is modeled as a composite WTM model for generating a newly observed word. The underlying characteristics and different kinds of model structures are extensively investigated, while the performance of WTM is thoroughly analyzed and verified by comparison with the well-known probabilistic latent semantic analysis (PLSA) model as well as the other models. The IR experiments are performed on the TDT Chinese collections (TDT-2 and TDT-3), while the large vocabulary continuous speech recognition (LVCSR) experiments are conducted on the Mandarin broadcast news collected in Taiwan. Experimental results seem to indicate that WTM is a promising alternative to the existing models.",
  acknowledgement = ack-nhfb,
  articleno = "2",
  keywords = "adaptation; information retrieval; language model; speech recognition; word topic model",
}

@Article{Lin:2009:CSP,
  author = "Shih-Hsiang Lin and Berlin Chen and Hsin-Min Wang",
  title = "A Comparative Study of Probabilistic Ranking Models for {Chinese} Spoken Document Summarization",
  journal = j-TALIP,
  volume = "8",
  number = "1",
  pages = "3:1--3:??",
  month = mar,
  year = "2009",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1482343.1482346",
  ISSN = "1530-0226",
  bibdate = "Mon Mar 23 16:32:22 MDT 2009",
  bibsource = "http://portal.acm.org/",
  abstract = "Extractive document summarization automatically selects a number of indicative sentences, passages, or paragraphs from an original document according to a target summarization ratio, and sequences them to form a concise summary.
In this article, we present a comparative study of various probabilistic ranking models for spoken document summarization, including supervised classification-based summarizers and unsupervised probabilistic generative summarizers. We also investigate the use of unsupervised summarizers to improve the performance of supervised summarizers when manual labels are not available for training the latter. A novel training data selection approach that leverages the relevance information of spoken sentences to select reliable document-summary pairs derived by the probabilistic generative summarizers is explored for training the classification-based summarizers. Encouraging initial results on Mandarin Chinese broadcast news data are demonstrated.",
  acknowledgement = ack-nhfb,
  articleno = "3",
  keywords = "extractive summarization; probabilistic ranking models; relevance information; spoken document summarization",
}

@Article{Chen:2009:TSH,
  author = "Boxing Chen and Min Zhang and Ai Ti Aw",
  title = "Two-Stage Hypotheses Generation for Spoken Language Translation",
  journal = j-TALIP,
  volume = "8",
  number = "1",
  pages = "4:1--4:??",
  month = mar,
  year = "2009",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1482343.1482347",
  ISSN = "1530-0226",
  bibdate = "Mon Mar 23 16:32:22 MDT 2009",
  bibsource = "http://portal.acm.org/",
  abstract = "Spoken Language Translation (SLT) is the research area that focuses on the translation of speech or text between two spoken languages. Phrase-based and syntax-based methods represent the state-of-the-art for statistical machine translation (SMT). The phrase-based method specializes in modeling local reorderings and translations of multiword expressions. The syntax-based method is enhanced by using syntactic knowledge, which can better model long word reorderings, discontinuous phrases, and syntactic structure.
In this article, we leverage on the strength of these two methods and propose a strategy based on multiple hypotheses generation in a two-stage framework for spoken language translation. The hypotheses are generated in two stages, namely, decoding and regeneration. In the decoding stage, we apply state-of-the-art, phrase-based, and syntax-based methods to generate basic translation hypotheses. Then in the regeneration stage, much more hypotheses that cannot be captured by the decoding algorithms are produced from the basic hypotheses. We study three regeneration methods: redecoding, n-gram expansion, and confusion network in the second stage. Finally, an additional reranking pass is introduced to select the translation outputs by a linear combination of rescoring models. Experimental results on the Chinese-to-English IWSLT-2006 challenge task of translating the transcription of spontaneous speech show that the proposed mechanism achieves significant improvements over the baseline of about 2.80 BLEU-score.",
  acknowledgement = ack-nhfb,
  articleno = "4",
  keywords = "hypotheses generation; spoken language translation; statistical machine translation",
}

@Article{Chiang:2009:ISI,
  author = "David Chiang and Philipp Koehn",
  title = "Introduction to the Special Issue on Machine Translation of {Asian} Language",
  journal = j-TALIP,
  volume = "8",
  number = "2",
  pages = "5:1--5:??",
  month = may,
  year = "2009",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1526252.1526253",
  ISSN = "1530-0226",
  bibdate = "Wed Jun 3 16:13:52 MDT 2009",
  bibsource = "http://portal.acm.org/",
  acknowledgement = ack-nhfb,
  articleno = "5",
}

@Article{He:2009:IMH,
  author = "Xiaodong He and Mei Yang and Jianfeng Gao and Patrick Nguyen and Robert Moore",
  title = "Improved Monolingual Hypothesis Alignment for Machine Translation System Combination",
  journal = j-TALIP,
  volume = "8",
  number = "2",
  pages = "6:1--6:??",
  month = may,
  year = "2009",
  CODEN = "????",
  DOI =
"http://doi.acm.org/10.1145/1526252.1526254",
  ISSN = "1530-0226",
  bibdate = "Wed Jun 3 16:13:52 MDT 2009",
  bibsource = "http://portal.acm.org/",
  abstract = "This article presents a new hypothesis alignment method for combining outputs of multiple machine translation (MT) systems. An indirect hidden Markov model (IHMM) is proposed to address the synonym matching and word ordering issues in hypothesis alignment. Unlike traditional HMMs whose parameters are trained via maximum likelihood estimation (MLE), the parameters of the IHMM are estimated indirectly from a variety of sources including word semantic similarity, word surface similarity, and a distance-based distortion penalty. The IHMM-based method significantly outperforms the state-of-the-art, TER-based alignment model in our experiments on NIST benchmark datasets. Our combined SMT system using the proposed method achieved the best Chinese-to-English translation result in the constrained training track of the 2008 NIST Open MT Evaluation.",
  acknowledgement = ack-nhfb,
  articleno = "6",
  keywords = "hidden Markov model; statistical machine translation; system combination; word alignment",
}

@Article{Ma:2009:BMW,
  author = "Yanjun Ma and Andy Way",
  title = "Bilingually Motivated Word Segmentation for Statistical Machine Translation",
  journal = j-TALIP,
  volume = "8",
  number = "2",
  pages = "7:1--7:??",
  month = may,
  year = "2009",
  CODEN = "????",
  DOI = "http://doi.acm.org/10.1145/1526252.1526255",
  ISSN = "1530-0226",
  bibdate = "Wed Jun 3 16:13:52 MDT 2009",
  bibsource = "http://portal.acm.org/",
  abstract = "We introduce a bilingually motivated word segmentation approach to languages where word boundaries are not orthographically marked, with application to Phrase-Based Statistical Machine Translation (PB-SMT). Our approach is motivated from the insight that PB-SMT systems can be improved by optimizing the input representation to reduce the predictive power of translation models.
We firstly present an approach to optimize the existing segmentation of both source and target languages for PB-SMT and demonstrate the effectiveness of this approach using a Chinese--English MT task, that is, to measure the influence of the segmentation on the performance of PB-SMT systems. We report a 5.44\% relative increase in Bleu score and a consistent increase according to other metrics. We then generalize this method for Chinese word segmentation without relying on any segmenters and show that using our segmentation PB-SMT can achieve more consistent state-of-the-art performance across two domains. There are two main advantages of our approach. First of all, it is adapted to the specific translation task at hand by taking the corresponding source (target) language into account. Second, this approach does not rely on manually segmented training data so that it can be automatically adapted for different domains.", acknowledgement = ack-nhfb, articleno = "7", keywords = "alignment; bilingually motivated; phrase-based statistical machine translation; word segmentation", } @Article{Venkatapathy:2009:DMT, author = "Sriram Venkatapathy and Srinivas Bangalore", title = "Discriminative Machine Translation Using Global Lexical Selection", journal = j-TALIP, volume = "8", number = "2", pages = "8:1--8:??", month = may, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1526252.1526256", ISSN = "1530-0226", bibdate = "Wed Jun 3 16:13:52 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Statistical phrase-based machine translation models crucially rely on word alignments. The search for word-alignments assumes a model of word locality between source and target languages that is violated in starkly different word-order languages such as English-Hindi. In this article, we present models that decouple the steps of lexical selection and lexical reordering with the aim of minimizing the role of word-alignment in machine translation. 
Indian languages are morphologically rich and have relatively free-word order where the grammatical role of content words is largely determined by their case markers and not just by their positions in the sentence. Hence, lexical selection plays a far greater role than lexical reordering. For lexical selection, we investigate models that take the entire source sentence into account and evaluate their performance for English-Hindi translation in a tourism domain.", acknowledgement = ack-nhfb, articleno = "8", keywords = "global lexical selection; machine translation", } @Article{Tsunakawa:2009:CJL, author = "Takashi Tsunakawa and Naoaki Okazaki and Xiao Liu and Jun'ichi Tsujii", title = "A {Chinese--Japanese} Lexical Machine Translation through a Pivot Language", journal = j-TALIP, volume = "8", number = "2", pages = "9:1--9:??", month = may, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1526252.1526257", ISSN = "1530-0226", bibdate = "Wed Jun 3 16:13:52 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "The bilingual lexicon is an expensive but critical resource for multilingual applications in natural language processing. This article proposes an integrated framework for building a bilingual lexicon between the Chinese and Japanese languages. Since the language pair Chinese-Japanese does not include English, which is a central language of the world, few large-scale bilingual resources between Chinese and Japanese have been constructed. One solution to alleviate this problem is to build a Chinese-Japanese bilingual lexicon through English as the pivot language. In addition to the pivotal approach, we can make use of the characteristics of Chinese and Japanese languages that use Han characters. We incorporate a translation model obtained from a small Chinese-Japanese lexicon and use the similarity of the hanzi and kanji characters by using the log-linear model. 
Our experimental results show that the use of the pivotal approach can improve the translation performance over the translation model built from a small Chinese-Japanese lexicon. The results also demonstrate that the similarity between the hanzi and kanji characters provides a positive effect for translating technical terms.", acknowledgement = ack-nhfb, articleno = "9", keywords = "bilingual lexicon; Han characters; hanzi; kanji; pivot language; statistical machine translation", }