%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.17", %%% date = "11 August 2009", %%% time = "18:30:01 MDT", %%% filename = "tcbb.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "30271 8727 45125 408314", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "BibTeX; bibliography; IEEE/ACM Transactions %%% on Computational Biology and %%% Bioinformatics; TCBB", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% IEEE/ACM Transactions on Computational %%% Biology and Bioinformatics (CODEN ITCBCY, %%% ISSN 1545-5963), covering all journal issues %%% from 2004 to date. %%% %%% At version 1.17, the COMPLETE journal %%% coverage looked like this: %%% %%% 2004 ( 23) 2006 ( 41) 2008 ( 59) %%% 2005 ( 37) 2007 ( 69) 2009 ( 52) %%% %%% Article: 281 %%% %%% Total entries: 281 %%% %%% The journal Web pages can be found at: %%% %%% http://www.acm.org/pubs/tcbb/ %%% http://portal.acm.org/browse_dl.cfm?idx=J954 %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. %%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using ``bibsort -byvolume.'' %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility." %%% } %%% ==================================================================== @Preamble{"\input bibnames.sty"} %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Journal abbreviations: @String{j-TCBB = "IEEE/ACM Transactions on Computational Biology and Bioinformatics"} %%% ==================================================================== %%% Bibliography entries: @Article{Williams:2004:WM, author = "Michael R. Williams", title = "Welcome Message", journal = j-TCBB, volume = "1", number = "1", pages = "1--1", month = jan, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2004:IIA, author = "Dan Gusfield", title = "Introduction to the {IEEE\slash ACM Transactions on Computational Biology and Bioinformatics}", journal = j-TCBB, volume = "1", number = "1", pages = "2--3", month = jan, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Williams:2004:INA, author = "Michael R. Williams", title = "Introduction of New {Associate Editors}", journal = j-TCBB, volume = "1", number = "1", pages = "4--12", month = jan, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Moret:2004:PNM, author = "Bernard M. E. Moret and Luay Nakhleh and Tandy Warnow and C. Randal Linder and Anna Tholse and Anneke Padolina and Jerry Sun and Ruth Timme", title = "Phylogenetic Networks: Modeling, Reconstructibility, and Accuracy", journal = j-TCBB, volume = "1", number = "1", pages = "13--23", month = jan, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Madeira:2004:BAB, author = "Sara C. Madeira and Arlindo L. Oliveira", title = "Biclustering Algorithms for Biological Data Analysis: {A} Survey", journal = j-TCBB, volume = "1", number = "1", pages = "24--45", month = jan, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Preparata:2004:SHR, author = "Franco P. Preparata", title = "Sequencing-by-Hybridization Revisited: The Analog-Spectrum Proposal", journal = j-TCBB, volume = "1", number = "1", pages = "46--52", month = jan, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Hochsmann:2004:PMR, author = "Matthias H{\"o}chsmann and Bj{\"o}rn Voss and Robert Giegerich", title = "Pure Multiple {RNA} Secondary Structure Alignments: {A} Progressive Profile Approach", journal = j-TCBB, volume = "1", number = "1", pages = "53--62", month = jan, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2004:INA, author = "Anonymous", title = "Introduction of New {Associate Editor}", journal = j-TCBB, volume = "1", number = "2", pages = "65--65", month = apr, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Witwer:2004:PCR, author = "Christina Witwer and Ivo L. Hofacker and Peter F. Stadler", title = "Prediction of Consensus {RNA} Secondary Structures Including Pseudoknots", journal = j-TCBB, volume = "1", number = "2", pages = "66--77", month = apr, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bafna:2004:NRE, author = "Vineet Bafna and Vikas Bansal", title = "The Number of Recombination Events in a Sample History: Conflict Graph and Lower Bounds", journal = j-TCBB, volume = "1", number = "2", pages = "78--90", month = apr, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Raphael:2004:UPM, author = "Benjamin Raphael and Lung-Tien Liu and George Varghese", title = "A Uniform Projection Method for Motif Discovery in {DNA} Sequences", journal = j-TCBB, volume = "1", number = "2", pages = "91--94", month = apr, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Nov 22 06:42:56 MST 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2004:INA, author = "Dan Gusfield", title = "Introduction of New {Associate Editors}", journal = j-TCBB, volume = "1", number = "3", pages = "97--97", month = jul, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Scheid:2004:SDS, author = "Stefanie Scheid and Rainer Spang", title = "A Stochastic Downhill Search Algorithm for Estimating the Local False Discovery Rate", journal = j-TCBB, volume = "1", number = "3", pages = "98--108", month = jul, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Dress:2004:CSG, author = "Andreas W. M. Dress and Daniel H. Huson", title = "Constructing Splits Graphs", journal = j-TCBB, volume = "1", number = "3", pages = "109--115", month = jul, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Cameron:2004:IGA, author = "Michael Cameron and Hugh E. Williams and Adam Cannane", title = "Improved Gapped Alignment in {BLAST}", journal = j-TCBB, volume = "1", number = "3", pages = "116--129", month = jul, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Evans:2004:UDT, author = "Steven N. Evans and Tandy Warnow", title = "Unidentifiable Divergence Times in Rates-across-Sites Models", journal = j-TCBB, volume = "1", number = "3", pages = "130--134", month = jul, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kim:2004:GEW, author = "Junhyong Kim and Inge Jonassen", title = "Guest Editorial: {WABI} Special Section Part 1", journal = j-TCBB, volume = "1", number = "4", pages = "137--138", month = oct, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Csuros:2004:MSS, author = "Miklos Csuros", title = "Maximum-Scoring Segment Sets", journal = j-TCBB, volume = "1", number = "4", pages = "139--150", month = oct, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Huson:2004:PSN, author = "Daniel H. Huson and Tobias Dezulian and Tobias Klopper and Mike A. Steel", title = "Phylogenetic Super-Networks from Partial Trees", journal = j-TCBB, volume = "1", number = "4", pages = "151--158", month = oct, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bannai:2004:ADO, author = "Hideo Bannai and Heikki Hyyro and Ayumi Shinohara and Masayuki Takeda and Kenta Nakai and Satoru Miyano", title = "An {$O(N^2)$} Algorithm for Discovering Optimal {Boolean} Pattern Pairs", journal = j-TCBB, volume = "1", number = "4", pages = "159--170", month = oct, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gramm:2004:PTA, author = "Jens Gramm", title = "A Polynomial-Time Algorithm for the Matching of Crossing Contact-Map Patterns", journal = j-TCBB, volume = "1", number = "4", pages = "171--180", month = oct, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ye:2004:UUD, author = "Jieping Ye and Tao Li and Tao Xiong and Ravi Janardan", title = "Using Uncorrelated Discriminant Analysis for Tissue Classification with Gene Expression Data", journal = j-TCBB, volume = "1", number = "4", pages = "181--190", month = oct, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2004:AI, author = "Anonymous", title = "Annual Index", journal = j-TCBB, volume = "1", number = "4", pages = "191--192", month = oct, year = "2004", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Jan 24 14:15:55 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kim:2005:GEW, author = "Junhyong Kim and Inge Jonassen", title = "Guest Editorial: {WABI} Special Section Part {II}", journal = j-TCBB, volume = "2", number = "1", pages = "1--2", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Allali:2005:NDH, author = "Julien Allali and Marie-France Sagot", title = "A New Distance for High Level {RNA} Secondary Structure Comparison", journal = j-TCBB, volume = "2", number = "1", pages = "3--14", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bertrand:2005:TRL, author = "Denis Bertrand and Olivier Gascuel", title = "Topological Rearrangements and Local Search Method for Tandem Duplication Trees", journal = j-TCBB, volume = "2", number = "1", pages = "15--28", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Brown:2005:OMS, author = "Daniel G. Brown", title = "Optimizing Multiple Seeds for Protein Homology Search", journal = j-TCBB, volume = "2", number = "1", pages = "29--38", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2005:EST, author = "Dan Gusfield", title = "Editorial-State of the Transaction", journal = j-TCBB, volume = "2", number = "1", pages = "39--39", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Pisanti:2005:BMG, author = "Nadia Pisanti and Maxime Crochemore and Roberto Grossi and Marie-France Sagot", title = "Bases of Motifs for Generating Repeated Patterns with Wild Cards", journal = j-TCBB, volume = "2", number = "1", pages = "40--50", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kucherov:2005:MLF, author = "Gregory Kucherov and Laurent Noe and Mikhail Roytberg", title = "Multiseed Lossless Filtration", journal = j-TCBB, volume = "2", number = "1", pages = "51--61", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Liu:2005:TMB, author = "Ying Liu and Shamkant B. Navathe and Jorge Civera and Venu Dasigi and Ashwin Ram and Brian J. Ciliax and Ray Dingledine", title = "Text Mining Biomedical Literature for Discovering Gene-to-Gene Relationships: {A} Comparative Study of Algorithms", journal = j-TCBB, volume = "2", number = "1", pages = "62--76", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Staff:2005:RL, author = "{IEEE/ACM Transactions on Computational Biology and Bioinformatics staff}", title = "2004 Reviewers List", journal = j-TCBB, volume = "2", number = "1", pages = "77--77", month = jan, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Apr 12 07:11:54 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ling:2005:GEIa, author = "Charles X. Ling and William Stafford Noble and Qiang Yang", title = "{Guest Editors}' Introduction to the {Special Issue: Machine Learning for Bioinformatics---Part 1}", journal = j-TCBB, volume = "2", number = "2", pages = "81--82", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Au:2005:ACG, author = "Wai-Ho Au and Keith C. C. Chan and Andrew K. C. Wong and Yang Wang", title = "Attribute Clustering for Grouping, Selection, and Classification of Gene Expression Data", journal = j-TCBB, volume = "2", number = "2", pages = "83--101", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Biyani:2005:JCP, author = "Pravesh Biyani and Xiaolin Wu and Abhijit Sinha", title = "Joint Classification and Pairing of Human Chromosomes", journal = j-TCBB, volume = "2", number = "2", pages = "102--109", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Furlanello:2005:SLM, author = "Cesare Furlanello and Maria Serafini and Stefano Merler and Giuseppe Jurman", title = "Semisupervised Learning for Molecular Profiling", journal = j-TCBB, volume = "2", number = "2", pages = "110--118", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Mamitsuka:2005:ELK, author = "Hiroshi Mamitsuka", title = "Essential Latent Knowledge for Protein-Protein Interactions: Analysis by an Unsupervised Learning Approach", journal = j-TCBB, volume = "2", number = "2", pages = "119--130", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Rajapakse:2005:MED, author = "Jagath C. Rajapakse and Loi Sy Ho", title = "{Markov} Encoding for Detecting Signals in Genomic Sequences", journal = j-TCBB, volume = "2", number = "2", pages = "131--142", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Rogers:2005:LPD, author = "Simon Rogers and Mark Girolami and Colin Campbell and Rainer Breitling", title = "The Latent Process Decomposition of {cDNA} Microarray Data Sets", journal = j-TCBB, volume = "2", number = "2", pages = "143--156", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Xu:2005:FRP, author = "Jinbo Xu", title = "Fold Recognition by Predicted Alignment Accuracy", journal = j-TCBB, volume = "2", number = "2", pages = "157--165", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Shen:2005:DRB, author = "Li Shen and Eng Chong Tan", title = "Dimension Reduction-Based Penalized Logistic Regression for Cancer Classification Using Microarray Data", journal = j-TCBB, volume = "2", number = "2", pages = "166--175", month = apr, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Wed Jun 22 17:33:35 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ling:2005:GEIb, author = "C. X. Ling and W. S. Noble and Q. Yang", title = "{Guest Editor}'s Introduction to the {Special Issue: Machine Learning for Bioinformatics---Part 2}", journal = j-TCBB, volume = "2", number = "3", pages = "177--178", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Schliep:2005:AGE, author = "Alexander Schliep and Ivan G. Costa and Christine Steinhoff and Alexander Schonhuth", title = "Analyzing Gene Expression Time-Courses", journal = j-TCBB, volume = "2", number = "3", pages = "179--193", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kundaje:2005:CST, author = "Anshul Kundaje and Manuel Middendorf and Feng Gao and Chris Wiggins and Christina Leslie", title = "Combining Sequence and Time Series Expression Data to Learn Transcriptional Modules", journal = j-TCBB, volume = "2", number = "3", pages = "194--202", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kaski:2005:ACE, author = "Samuel Kaski and Janne Nikkila and Janne Sinkkonen and Leo Lahti and Juha E. A. Knuuttila and Christophe Roos", title = "Associative Clustering for Exploring Dependencies between Functional Genomics Data Sets", journal = j-TCBB, volume = "2", number = "3", pages = "203--216", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2005:PMF, author = "Jingfen Zhang and Wen Gao and Jinjin Cai and Simin He and Rong Zeng and Runsheng Chen", title = "Predicting Molecular Formulas of Fragment Ions with Isotope Patterns in Tandem Mass Spectra", journal = j-TCBB, volume = "2", number = "3", pages = "217--230", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Keedwell:2005:DGN, author = "Edward Keedwell and Ajit Narayanan", title = "Discovering Gene Networks with a Neural-Genetic Hybrid", journal = j-TCBB, volume = "2", number = "3", pages = "231--242", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Hawkins:2005:ARN, author = "John Hawkins and Mikael Boden", title = "The Applicability of Recurrent Neural Networks for Biological Sequence Analysis", journal = j-TCBB, volume = "2", number = "3", pages = "243--253", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gustafsson:2005:CAL, author = "Mika Gustafsson and Michael Hornquist and Anna Lombardi", title = "Constructing and Analyzing a Large-Scale Gene-to-Gene Regulatory Network-Lasso-Constrained Inference and Biological Validation", journal = j-TCBB, volume = "2", number = "3", pages = "254--261", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Demir:2005:LTP, author = "Cigdem Demir and S. Humayun Gultekin and Bulent Yener", title = "Learning the Topological Properties of Brain Tumors", journal = j-TCBB, volume = "2", number = "3", pages = "262--270", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2005:CPS, author = "Anonymous", title = "Call for Papers for {Special Issue on Computational Intelligence Approaches in Computational Biology and Bioinformatics}", journal = j-TCBB, volume = "2", number = "3", pages = "271--271", month = jul, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Tue Sep 20 06:11:25 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Cickovski:2005:FTD, author = "Trevor M. Cickovski and Chengbang Huang and Rajiv Chaturvedi and Tilmann Glimm and H. George E. Hentschel and Mark S. Alber and James A. Glazier and Stuart A. Newman and Jesus A. Izaguirre", title = "A Framework for Three-Dimensional Simulation of Morphogenesis", journal = j-TCBB, volume = "2", number = "4", pages = "273--288", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Boscolo:2005:GFN, author = "Riccardo Boscolo and Chiara Sabatti and James C. Liao and Vwani P. Roychowdhury", title = "A Generalized Framework for Network Component Analysis", journal = j-TCBB, volume = "2", number = "4", pages = "289--301", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Chen:2005:AOG, author = "Xin Chen and Jie Zheng and Zheng Fu and Peng Nan and Yang Zhong and Stefano Lonardi and Tao Jiang", title = "Assignment of Orthologous Genes via Genome Rearrangement", journal = j-TCBB, volume = "2", number = "4", pages = "302--315", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Turner:2005:BMS, author = "Heather L. Turner and Trevor C. Bailey and Wojtek J. Krzanowski and Cheryl A. Hemingway", title = "Biclustering Models for Structured Microarray Data", journal = j-TCBB, volume = "2", number = "4", pages = "316--329", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sevilla:2005:CBG, author = "Jose L. Sevilla and Victor Segura and Adam Podhorski and Elizabeth Guruceaga and Jose M. Mato and Luis A. Martinez-Cruz and Fernando J. Corrales and Angel Rubio", title = "Correlation between Gene Expression and {GO} Semantic Similarity", journal = j-TCBB, volume = "2", number = "4", pages = "330--338", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Yoon:2005:DCB, author = "Sungroh Yoon and Christine Nardini and Luca Benini and Giovanni De Micheli", title = "Discovering Coherent Biclusters from Gene Expression Data Using Zero-Suppressed Binary Decision Diagrams", journal = j-TCBB, volume = "2", number = "4", pages = "339--354", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Tseng:2005:EMG, author = "Vincent S. Tseng and Ching-Pin Kao", title = "Efficiently Mining Gene Expression Data via a Novel Parameterless Clustering Method", journal = j-TCBB, volume = "2", number = "4", pages = "355--365", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2005:SGN, author = "Shaojie Zhang and Brian Haas and Eleazar Eskin and Vineet Bafna", title = "Searching Genomes for Noncoding {RNA} Using {FastR}", journal = j-TCBB, volume = "2", number = "4", pages = "366--379", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2005:AI, author = "Anonymous", title = "2005 Annual Index", journal = j-TCBB, volume = "2", number = "4", pages = "380--384", month = oct, year = "2005", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Fri Nov 18 05:22:15 MST 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2006:SJ, author = "Dan Gusfield", title = "State of the Journal", journal = j-TCBB, volume = "3", number = "1", pages = "1--1", month = jan, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.12", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Berger:2006:JAG, author = "John A. Berger and Sampsa Hautaniemi and Sanjit K. Mitra and Jaakko Astola", title = "Jointly Analyzing Gene Expression and Copy Number Data in Breast Cancer Using Data Reduction Models", journal = j-TCBB, volume = "3", number = "1", pages = "2--16", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sebastian:2006:STA, author = "Rafael Sebastian and Maria-Elena Diaz and Guillermo Ayala and Kresimir Letinic and Jose Moncho-Bogani and Derek Toomre", title = "Spatio-Temporal Analysis of Constitutive Exocytosis in Epithelial Cells", journal = j-TCBB, volume = "3", number = "1", pages = "17--32", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Hershkovitz:2006:SAR, author = "Eli Hershkovitz and Guillermo Sapiro and Allen Tannenbaum and Loren Dean Williams", title = "Statistical Analysis of {RNA} Backbone", journal = j-TCBB, volume = "3", number = "1", pages = "33--46", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Dawy:2006:GMM, author = "Zaher Dawy and Bernhard Goebel and Joachim Hagenauer and Christophe Andreoli and Thomas Meitinger and Jakob C. Mueller", title = "Gene Mapping and Marker Clustering Using {Shannon}'s Mutual Information", journal = j-TCBB, volume = "3", number = "1", pages = "47--56", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Goutsias:2006:HMM, author = "John Goutsias", title = "A Hidden {Markov} Model for Transcriptional Regulation in Single Cells", journal = j-TCBB, volume = "3", number = "1", pages = "57--71", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Rueda:2006:HCA, author = "Luis Rueda and Vidya Vidyadharan", title = "A Hill-Climbing Approach for Automatic Gridding of {cDNA} Microarray Images", journal = j-TCBB, volume = "3", number = "1", pages = "72--83", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Semple:2006:UNC, author = "Charles Semple and Mike Steel", title = "Unicyclic Networks: Compatibility and Enumeration", journal = j-TCBB, volume = "3", number = "1", pages = "84--91", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Roch:2006:SPP, author = "Sebastien Roch", title = "A Short Proof that Phylogenetic Tree Reconstruction by Maximum Likelihood Is Hard", journal = j-TCBB, volume = "3", number = "1", pages = "92--94", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2006:RL, author = "Anonymous", title = "2005 Reviewers List", journal = j-TCBB, volume = "3", number = "1", pages = "95--96", month = jan, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Feb 16 11:06:15 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2006:INA, author = "Dan Gusfield", title = "Introduction of New {Associate Editors}", journal = j-TCBB, volume = "3", number = "2", pages = "97--97", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.25", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Chu:2006:BSM, author = "Wei Chu and Zoubin Ghahramani and Alexei Podtelezhnikov and David L. Wild", title = "{Bayesian} Segmental Models with Multiple Sequence Alignment Profiles for Protein Secondary Structure and Contact Map Prediction", journal = j-TCBB, volume = "3", number = "2", pages = "98--113", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.17", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Danziger:2006:FCM, author = "Samuel A. Danziger and S. Joshua Swamidass and Jue Zeng and Lawrence R. Dearth and Qiang Lu and Jonathan H. Chen and Jianlin Cheng and Vinh P. Hoang and Hiroto Saigo and Ray Luo and Pierre Baldi and Rainer K. Brachmann and Richard H. Lathrop", title = "Functional Census of Mutation Sequence Spaces: The Example of p53 Cancer Rescue Mutants", journal = j-TCBB, volume = "3", number = "2", pages = "114--125", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.22", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Carvalho:2006:EAI, author = "Alexandra M. Carvalho and Ana T. Freitas and Arlindo L. Oliveira and Marie-France Sagot", title = "An Efficient Algorithm for the Identification of Structured Motifs in {DNA} Promoter Sequences", journal = j-TCBB, volume = "3", number = "2", pages = "126--140", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.16", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Brown:2006:IPA, author = "Daniel G. Brown and Ian M. Harrower", title = "Integer Programming Approaches to Haplotype Inference by Pure Parsimony", journal = j-TCBB, volume = "3", number = "2", pages = "141--154", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.24", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Vass:2006:JMB, author = "Marc T. Vass and Clifford A. Shaffer and Naren Ramakrishnan and Layne T. Watson and John J. Tyson", title = "The {JigCell} Model Builder: {A} Spreadsheet Interface for Creating Biochemical Reaction Network Models", journal = j-TCBB, volume = "3", number = "2", pages = "155--164", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.27", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Chen:2006:MFS, author = "Duhong Chen and Oliver Eulenstein and David Fernandez-Baca and Michael Sanderson", title = "Minimum-Flip Supertrees: Complexity and Algorithms", journal = j-TCBB, volume = "3", number = "2", pages = "165--173", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.26", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sevon:2006:TTP, author = "Petteri Sevon and Hannu Toivonen and Vesa Ollikainen", title = "{TreeDT}: Tree Pattern Mining for Gene Mapping", journal = j-TCBB, volume = "3", number = "2", pages = "174--185", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.28", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Song:2006:CNS, author = "Yun S. Song", title = "A Concise Necessary and Sufficient Condition for the Existence of a Galled-Tree", journal = j-TCBB, volume = "3", number = "2", pages = "186--191", month = apr, year = "2006", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2006.15", ISSN = "1545-5963", bibdate = "Wed Jun 7 06:38:18 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "Galled-trees are a special class of graphical representation of evolutionary history that has proven amenable to efficient, polynomial-time algorithms. The goal of this paper is to construct a concise necessary and sufficient condition for the existence of a galled-tree for $M$, a set of binary sequences that purportedly have evolved in the presence of recombination. Both root-known and root-unknown cases are considered here.", acknowledgement = ack-nhfb, } @Article{Daras:2006:TDS, author = "Petros Daras and Dimitrios Zarpalas and Apostolos Axenopoulos and Dimitrios Tzovaras and Michael Gerassimos Strintzis", title = "Three-Dimensional Shape-Structure Comparison Method for Protein Classification", journal = j-TCBB, volume = "3", number = "3", pages = "193--207", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Yu:2006:MPA, author = "Weichuan Yu and Xiaoye Li and Junfeng Liu and Baolin Wu and Kenneth R. Williams and Hongyu Zhao", title = "Multiple Peak Alignment in Sequential Data Analysis: {A} Scale-Space-Based Approach", journal = j-TCBB, volume = "3", number = "3", pages = "208--219", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Abul:2006:PAE, author = "Osman Abul and Reda Alhajj and Faruk Polat", title = "A Powerful Approach for Effective Finding of Significantly Differentially Expressed Genes", journal = j-TCBB, volume = "3", number = "3", pages = "220--231", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Nagarajan:2006:CSC, author = "Radhakrishnan Nagarajan and Meenakshi Upreti", title = "Correlation Statistics for {cDNA} Microarray Image Analysis", journal = j-TCBB, volume = "3", number = "3", pages = "232--238", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Song:2006:CAP, author = "Yun S. Song and Rune Lyngso and Jotun Hein", title = "Counting All Possible Ancestral Configurations of Sample Sequences in Population Genetics", journal = j-TCBB, volume = "3", number = "3", pages = "239--251", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Pirinen:2006:FCG, author = "Matti Pirinen and Dario Gasbarra", title = "Finding Consistent Gene Transmission Patterns on Large and Complex Pedigrees", journal = j-TCBB, volume = "3", number = "3", pages = "252--262", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Popescu:2006:FMG, author = "Mihail Popescu and James M. Keller and Joyce A. Mitchell", title = "Fuzzy Measures on the Gene Ontology for Gene Product Similarity", journal = j-TCBB, volume = "3", number = "3", pages = "263--274", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bernt:2006:GRB, author = "Matthias Bernt and Daniel Merkle and Martin Middendorf", title = "Genome Rearrangement Based on Reversals that Preserve Conserved Intervals", journal = j-TCBB, volume = "3", number = "3", pages = "275--288", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Berry:2006:IPC, author = "Vincent Berry and Fran{\c{c}}ois Nicolas", title = "Improved Parameterized Complexity of the Maximum Agreement Subtree and Maximum Compatible Tree Problems", journal = j-TCBB, volume = "3", number = "3", pages = "289--302", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sharan:2006:ITP, author = "Roded Sharan and Bjarni V. Halldorsson and Sorin Istrail", title = "Islands of Tractability for Parsimony Haplotyping", journal = j-TCBB, volume = "3", number = "3", pages = "303--311", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2006:SGR, author = "Chaolin Zhang and Xuesong Lu and Xuegong Zhang", title = "Significance of Gene Ranking for Classification of Microarray Samples", journal = j-TCBB, volume = "3", number = "3", pages = "312--320", month = jul, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Sep 11 07:36:29 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Casadio:2006:GEI, author = "Rita Casadio", title = "{Guest Editor}'s Introduction to the Special Issue on Computational Biology and Bioinformatics -- Part 1", journal = j-TCBB, volume = "3", number = "4", pages = "321--322", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Snir:2006:UMC, author = "Sagi Snir and Satish Rao", title = "Using Max Cut to Enhance Rooted Trees Consistency", journal = j-TCBB, volume = "3", number = "4", pages = "323--333", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ganapathy:2006:PIB, author = "Ganeshkumar Ganapathy and Barbara Goodson and Robert Jansen and Hai-son Le and Vijaya Ramachandran and Tandy Warnow", title = "Pattern Identification in Biogeography", journal = j-TCBB, volume = "3", number = "4", pages = "334--346", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Wernicke:2006:EDN, author = "Sebastian Wernicke", title = "Efficient Detection of Network Motifs", journal = j-TCBB, volume = "3", number = "4", pages = "347--359", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Lacroix:2006:MSG, author = "Vincent Lacroix and Cristina G. Fernandes and Marie-France Sagot", title = "Motif Search in Graphs: Application to Metabolic Networks", journal = j-TCBB, volume = "3", number = "4", pages = "360--368", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Elias:2006:AAS, author = "Isaac Elias and Tzvika Hartman", title = "A $1.375$-Approximation Algorithm for Sorting by Transpositions", journal = j-TCBB, volume = "3", number = "4", pages = "369--379", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Labarre:2006:NBT, author = "Anthony Labarre", title = "New Bounds and Tractable Instances for the Transposition Distance", journal = j-TCBB, volume = "3", number = "4", pages = "380--394", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sammeth:2006:CTR, author = "Michael Sammeth and Jens Stoye", title = "Comparing Tandem Repeats with Duplications and Excisions of Variable Degree", journal = j-TCBB, volume = "3", number = "4", pages = "395--407", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bilu:2006:FAO, author = "Yonatan Bilu and Pankaj K. Agarwal and Rachel Kolodny", title = "Faster Algorithms for Optimal Multiple Sequence Alignment Based on Pairwise Comparisons", journal = j-TCBB, volume = "3", number = "4", pages = "408--422", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Song:2006:EPA, author = "Yinglei Song and Chunmei Liu and Xiuzhen Huang and Russell L. Malmberg and Ying Xu and Liming Cai", title = "Efficient Parameterized Algorithms for Biopolymer Structure-Sequence Alignment", journal = j-TCBB, volume = "3", number = "4", pages = "423--432", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2006:AI, author = "Anonymous", title = "Annual Index", journal = j-TCBB, volume = "3", number = "4", pages = "??--??", month = oct, year = "2006", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Nov 30 19:05:58 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2007:SJ, author = "Dan Gusfield", title = "State of the {Journal}", journal = j-TCBB, volume = "4", number = "1", pages = "1--1", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2007:AEAa, author = "Dan Gusfield", title = "{Associate Editor} Appreciation and Welcome", journal = j-TCBB, volume = "4", number = "1", pages = "2--2", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Casadio:2007:GEI, author = "Rita Casadio", title = "{Guest Editor}'s Introduction to the {Special Section on Computational Biology and Bioinformatics (WABI)} -- Part 2", journal = j-TCBB, volume = "4", number = "1", pages = "3--3", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Berard:2007:PSR, author = "Severine B{\'e}rard and Anne Bergeron and Cedric Chauve and Christophe Paul", title = "Perfect Sorting by Reversals Is Not Always Difficult", journal = j-TCBB, volume = "4", number = "1", pages = "4--16", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We propose new algorithms for computing pairwise rearrangement scenarios that conserve the combinatorial structure of genomes. More precisely, we investigate the problem of sorting signed permutations by reversals without breaking common intervals. We describe a combinatorial framework for this problem that allows us to characterize classes of signed permutations for which one can compute, in polynomial time, a shortest reversal scenario that conserves all common intervals. In particular, we define a class of permutations for which this computation can be done in linear time with a very simple algorithm that does not rely on the classical Hannenhalli-Pevzner theory for sorting by reversals. We apply these methods to the computation of rearrangement scenarios between permutations obtained from 16 synteny blocks of the X chromosomes of the human, mouse, and rat.", acknowledgement = ack-nhfb, keywords = "common intervals; evolution scenarios; reversals", } @Article{Vashist:2007:OCM, author = "Akshay Vashist and Casimir A. Kulikowski and Ilya Muchnik", title = "Ortholog Clustering on a Multipartite Graph", journal = j-TCBB, volume = "4", number = "1", pages = "17--27", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We present a method for automatically extracting groups of orthologous genes from a large set of genomes by a new clustering algorithm on a weighted multipartite graph. The method assigns a score to an arbitrary subset of genes from multiple genomes to assess the orthologous relationships between genes in the subset. This score is computed using sequence similarities between the member genes and the phylogenetic relationship between the corresponding genomes. An ortholog cluster is found as the subset with the highest score, so ortholog clustering is formulated as a combinatorial optimization problem. The algorithm for finding an ortholog cluster runs in time O(|E|+|V| log|V|), where V and E are the sets of vertices and edges, respectively, in the graph. However, if we discretize the similarity scores into a constant number of bins, the runtime improves to O(|E|+|V|). The proposed method was applied to seven complete eukaryote genomes on which the manually curated database of eukaryotic ortholog clusters, KOG, is constructed. A comparison of our results with the manually curated ortholog clusters shows that our clusters are well correlated with the existing clusters.", acknowledgement = ack-nhfb, keywords = "biology; clustering algorithms; genetics; Graph-theoretic methods", } @Article{Lasker:2007:EDH, author = "Keren Lasker and Oranit Dror and Maxim Shatsky and Ruth Nussinov and Haim J. Wolfson", title = "{EMatch}: Discovery of High Resolution Structural Homologues of Protein Domains in Intermediate Resolution Cryo-{EM} Maps", journal = j-TCBB, volume = "4", number = "1", pages = "28--39", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Cryo-EM has become an increasingly powerful technique for elucidating the structure, dynamics, and function of large flexible macromolecule assemblies that cannot be determined at atomic resolution. However, due to the relatively low resolution of cryo-EM data, a major challenge is to identify components of complexes appearing in cryo-EM maps. Here, we describe EMatch, a novel integrated approach for recognizing structural homologues of protein domains present in a 6-10{\AA} resolution cryo-EM map and constructing a quasi-atomic structural model of their assembly. The method is highly efficient and has been successfully validated on various simulated data. The strength of the method is demonstrated by a domain assembly of an experimental cryo-EM map of native GroEL at 6{\AA} resolution.", acknowledgement = ack-nhfb, keywords = "3D alignment of secondary structures; cyclic symmetry; intermediate resolution cryo-EM maps; macromolecular assemblies; structural bioinformatics", } @Article{Wang:2007:ACC, author = "Lipo Wang and Feng Chu and Wei Xie", title = "Accurate Cancer Classification Using Expressions of Very Few Genes", journal = j-TCBB, volume = "4", number = "1", pages = "40--53", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We aim at finding the smallest set of genes that can ensure highly accurate classification of cancers from microarray data by using supervised machine learning algorithms. The significance of finding the minimum gene subsets is three-fold: 1) It greatly reduces the computational burden and `noise' arising from irrelevant genes. In the examples studied in this paper, finding the minimum gene subsets even allows for extraction of simple diagnostic rules which lead to accurate diagnosis without the need for any classifiers. 2) It simplifies gene expression tests to include only a very small number of genes rather than thousands of genes, which can bring down the cost for cancer testing significantly. 3) It calls for further investigation into the possible biological relationship between these small numbers of genes and cancer development and treatment. Our simple yet very effective method involves two steps. In the first step, we choose some important genes using a feature importance ranking scheme. In the second step, we test the classification capability of all simple combinations of those important genes by using a good classifier. For three `small' and `simple' data sets with two, three, and four cancer (sub)types, our approach obtained very high accuracy with only two or three genes. For a `large' and `complex' data set with 14 cancer types, we divided the whole problem into a group of binary classification problems and applied the 2--step approach to each of these binary classification problems. Through this `divide-and-conquer' approach, we obtained accuracy comparable to previously reported results but with only 28 genes rather than 16,063 genes. In general, our method can significantly reduce the number of genes required for highly reliable diagnosis.", acknowledgement = ack-nhfb, keywords = "cancer classification; fuzzy; gene expression; neural networks; support vector machines.", } @Article{Zhi:2007:CBA, author = "Degui Zhi and Uri Keich and Pavel Pevzner and Steffen Heber and Haixu Tang", title = "Correcting Base-Assignment Errors in Repeat Regions of Shotgun Assembly", journal = j-TCBB, volume = "4", number = "1", pages = "54--64", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Accurate base-assignment in repeat regions of a whole genome shotgun assembly is an unsolved problem. Since reads in repeat regions cannot be easily attributed to a unique location in the genome, current assemblers may place these reads arbitrarily. As a result, the base-assignment error rate in repeats is likely to be much higher than that in the rest of the genome. We developed an iterative algorithm, EULER-AIR, that is able to correct base-assignment errors in finished genome sequences in public databases. The Wolbachia genome is among the best finished genomes. Using this genome project as an example, we demonstrated that EULER-AIR can 1) discover and correct base-assignment errors, 2) provide accurate read assignments, 3) utilize finishing reads for accurate base-assignment, and 4) provide guidance for designing finishing experiments. In the genome of Wolbachia, EULER-AIR found 16 positions with ambiguous base-assignment and two positions with erroneous bases. Besides Wolbachia, many other genome sequencing projects have significantly fewer finishing reads and, hence, are likely to contain more base-assignment errors in repeats. We demonstrate that EULER-AIR is a software tool that can be used to find and correct base-assignment errors in a genome assembly project.", acknowledgement = ack-nhfb, keywords = "expectation maximization; finishing; fragment assembly", } @Article{Xu:2007:MCC, author = "Rui Xu and Georgios C. Anagnostopoulos and Donald C. Wunsch", title = "Multiclass Cancer Classification Using Semisupervised Ellipsoid {ARTMAP} and Particle Swarm Optimization with Gene Expression Data", journal = j-TCBB, volume = "4", number = "1", pages = "65--77", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "It is crucial for cancer diagnosis and treatment to accurately identify the site of origin of a tumor. With the emergence and rapid advancement of DNA microarray technologies, constructing gene expression profiles for different cancer types has already become a promising means for cancer classification. In addition to research on binary classification such as normal versus tumor samples, which attracts numerous efforts from a variety of disciplines, the discrimination of multiple tumor types is also important. Meanwhile, the selection of genes which are relevant to a certain cancer type not only improves the performance of the classifiers, but also provides molecular insights for treatment and drug development. Here, we use Semisupervised Ellipsoid ARTMAP (ssEAM) for multiclass cancer discrimination and particle swarm optimization for informative gene selection. ssEAM is a neural network architecture rooted in Adaptive Resonance Theory and suitable for classification tasks. ssEAM features fast, stable, and finite learning and creates hyperellipsoidal clusters, inducing complex nonlinear decision boundaries. PSO is an evolutionary algorithm-based technique for global optimization. A discrete binary version of PSO is employed to indicate whether genes are chosen or not. The effectiveness of ssEAM/PSO for multiclass cancer diagnosis is demonstrated by testing it on three publicly available multiple-class cancer data sets. ssEAM/PSO achieves competitive performance on all these data sets, with results comparable to or better than those obtained by other classifiers.", acknowledgement = ack-nhfb, keywords = "cancer classification; gene expression profile; particle swarm optimization; semisupervised ellipsoid ARTMAP", } @Article{Huang:2007:PPP, author = "Chengbang Huang and Faruck Morcos and Simon P. Kanaan and Stefan Wuchty and Danny Z. Chen and Jesus A. Izaguirre", title = "Predicting Protein-Protein Interactions from Protein Domains Using a Set Cover Approach", journal = j-TCBB, volume = "4", number = "1", pages = "78--87", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "One goal of contemporary proteome research is the elucidation of cellular protein interactions. Based on currently available protein-protein interaction and domain data, we introduce a novel method, Maximum Specificity Set Cover (MSSC), for the prediction of protein-protein interactions. In our approach, we map the relationship between interactions of proteins and their corresponding domain architectures to a generalized weighted set cover problem. The application of a greedy algorithm provides sets of domain interactions which explain the presence of protein interactions to the largest degree of specificity. Utilizing domain and protein interaction data of S. cerevisiae, MSSC enables prediction of previously unknown protein interactions, links that are well supported by a high tendency of coexpression and functional homogeneity of the corresponding proteins. Focusing on concrete examples, we show that MSSC reliably predicts protein interactions in well-studied molecular systems, such as the 26S proteasome and RNA polymerase II of S. cerevisiae. We also show that the quality of the predictions is comparable to the Maximum Likelihood Estimation while MSSC is faster. This new algorithm and all data sets used are accessible through a Web portal at http://ppi.cse.nd.edu.", acknowledgement = ack-nhfb, keywords = "bioinformatics (genome or protein) databases; biology; Computations on discrete structures; genetics; graph algorithms", } @Article{Kim:2007:AAD, author = "Jong Hyun Kim and Michael S. Waterman and Lei M. Li", title = "Accuracy Assessment of Diploid Consensus Sequences", journal = j-TCBB, volume = "4", number = "1", pages = "88--97", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "If the origins of fragments are known in genome sequencing projects, it is straightforward to reconstruct diploid consensus sequences. In reality, however, this is not true. Although there are proposed methods to reconstruct haplotypes from genome sequencing projects, an accuracy assessment is required to evaluate the confidence of the estimated diploid consensus sequences. In this paper, we define the confidence score of diploid consensus sequences. It requires the calculation of the likelihood of an assembly. To calculate the likelihood, we propose a linear time algorithm with respect to the number of polymorphic sites. The likelihood calculation and confidence score are used for further improvements of haplotype estimation in two directions. One direction is that low-scored phases are disconnected. The other direction is that, instead of using nominal frequency 1/2, the haplotype frequency is estimated to reflect the actual contribution of each haplotype. Our method was evaluated on the simulated data whose polymorphism rate (1.2 percent) was based on Ciona intestinalis. As a result, the high accuracy of our algorithm was indicated: The true positive rate of the haplotype estimation was greater than 97 percent.", acknowledgement = ack-nhfb, keywords = "diploid; haplotype; polymorphism; shotgun sequencing", } @Article{Alekseyev:2007:CBG, author = "Max A. Alekseyev and Pavel A. Pevzner", title = "Colored de Bruijn Graphs and the Genome Halving Problem", journal = j-TCBB, volume = "4", number = "1", pages = "98--107", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Breakpoint graph analysis is a key algorithmic technique in studies of genome rearrangements. However, breakpoint graphs are defined only for genomes without duplicated genes, thus limiting their applications in rearrangement analysis. We discuss a connection between the breakpoint graphs and de Bruijn graphs that leads to a generalization of the notion of breakpoint graph for genomes with duplicated genes. We further use the generalized breakpoint graphs to study the Genome Halving Problem (first introduced and solved by Nadia El-Mabrouk and David Sankoff). The El-Mabrouk-Sankoff algorithm is rather complex, and, in this paper, we present an alternative approach that is based on generalized breakpoint graphs. The generalized breakpoint graphs make the El-Mabrouk-Sankoff result more transparent and promise to be useful in future studies of genome rearrangements.", acknowledgement = ack-nhfb, keywords = "breakpoint graph; de Bruijn graph; genome duplication; genome halving; genome rearrangement; reversal", } @Article{Mossel:2007:DMT, author = "Elchanan Mossel", title = "Distorted Metrics on Trees and Phylogenetic Forests", journal = j-TCBB, volume = "4", number = "1", pages = "108--116", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We study distorted metrics on binary trees in the context of phylogenetic reconstruction. Given a binary tree $T$ on $n$ leaves with a path metric $d$, consider the pairwise distances $d(u,v)$ between leaves. It is well known that these determine the tree and the $d$-length of all edges. Here, we consider distortions $\hat{d}$ of $d$ such that, for all leaves $u$ and $v$, it holds that $|d(u,v) - \hat{d}(u,v)| < f/2$ if either $d(u,v) < M + f/2$ or $\hat{d}(u,v) < M + f/2$, where $d$ satisfies $f \leq d(e) \leq g$ for all edges $e$. Given such distortions, we show how to reconstruct in polynomial time a forest $T_1, \ldots{}, T_\alpha$ such that the true tree $T$ may be obtained from that forest by adding $\alpha-1$ edges and $\alpha-1 \leq 2-\Omega(M/g) n$. Our distorted metric result implies a reconstruction algorithm of phylogenetic forests with a small number of trees from sequences of length logarithmic in the number of species. The reconstruction algorithm is applicable for the general Markov model. Both the distorted metric result and its applications to phylogeny are almost tight.", acknowledgement = ack-nhfb, keywords = "CFN; distortion; forest; Jukes--Cantor; metric; phylogenetics; tree", } @Article{Aeling:2007:DDE, author = "Kimberly A. Aeling and Nicholas R. Steffen and Matthew Johnson and G. Wesley Hatfield and Richard H. Lathrop and Donald F. Senear", title = "{DNA} Deformation Energy as an Indirect Recognition Mechanism in Protein-{DNA} Interactions", journal = j-TCBB, volume = "4", number = "1", pages = "117--125", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Proteins that bind to specific locations in genomic DNA control many basic cellular functions. Proteins detect their binding sites using both direct and indirect recognition mechanisms. Deformation energy, which models the energy required to bend DNA from its native shape to its shape when bound to a protein, has been shown to be an indirect recognition mechanism for one particular protein, Integration Host Factor (IHF). This work extends the analysis of deformation to two other DNA-binding proteins, CRP and SRF, and two endonucleases, I-CreI and I-PpoI. Known binding sites for all five proteins showed statistically significant differences in mean deformation energy as compared to random sequences. Binding sites for the three DNA-binding proteins and one of the endonucleases had mean deformation energies lower than random sequences. Binding sites for I-PpoI had mean deformation energy higher than random sequences. Classifiers that were trained using the deformation energy at each base pair step showed good cross-validated accuracy when classifying unseen sequences as binders or nonbinders. These results support DNA deformation energy as an indirect recognition mechanism across a wider range of DNA-binding proteins. Deformation energy may also have a predictive capacity for the underlying catalytic mechanism of DNA-binding enzymes.", acknowledgement = ack-nhfb, keywords = "deformation energy; DNA bending; DNA-protein binding; indirect readout; indirect recognition; perceptron learning", } @Article{Yang:2007:MFE, author = "Jing Yang and Sarawan Wongsa and Visakan Kadirkamanathan and Stephen A. Billings and Phillip C. Wright", title = "Metabolic Flux Estimation-{A} Self-Adaptive Evolutionary Algorithm with Singular Value Decomposition", journal = j-TCBB, volume = "4", number = "1", pages = "126--138", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Metabolic flux analysis is important for metabolic system regulation and intracellular pathway identification. A popular approach for intracellular flux estimation involves using ^{13}{\rm C} tracer experiments to label states that can be measured by nuclear magnetic resonance spectrometry or gas chromatography mass spectrometry. However, the bilinear balance equations derived from ^{13}{\rm C} tracer experiments and the noisy measurements require a nonlinear optimization approach to obtain the optimal solution. In this paper, the flux quantification problem is formulated as an error-minimization problem with equality and inequality constraints through the ^{13}{\rm C} balance and stoichiometric equations. The stoichiometric constraints are transformed to a null space by singular value decomposition. Self-adaptive evolutionary algorithms are then introduced for flux quantification. The performance of the evolutionary algorithm is compared with ordinary least squares estimation by the simulation of the central pentose phosphate pathway. The proposed algorithm is also applied to the central metabolism of Corynebacterium glutamicum under lysine-producing conditions. A comparison between the results from the proposed algorithm and data from the literature is given. The complexity of a metabolic system with bidirectional reactions is also investigated by analyzing the fluctuations in the flux estimates when available measurements are varied.", acknowledgement = ack-nhfb, keywords = "evolutionary computing; least squares method; metabolic flux analysis; singular value decomposition.", } @Article{Wu:2007:QBP, author = "Gang Wu and Jia-Huai You and Guohui Lin", title = "Quartet-Based Phylogeny Reconstruction with Answer Set Programming", journal = j-TCBB, volume = "4", number = "1", pages = "139--152", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In this paper, a new representation is presented for the Maximum Quartet Consistency (MQC) problem, where solving the MQC problem becomes searching for an ultrametric matrix that satisfies a maximum number of given quartet topologies. A number of structural properties of the MQC problem in this new representation are characterized through formulating into answer set programming, a recent powerful logic programming tool for modeling and solving search problems. Using these properties, a number of optimization techniques are proposed to speed up the search process. The experimental results on a number of simulated data sets suggest that the new representation, combined with answer set programming, presents a unique perspective to the MQC problem.", acknowledgement = ack-nhfb, keywords = "Answer Set Programming (ASP); Maximum Quartet Consistency (MQC); phylogeny; quartet; ultrametric matrix.", } @Article{Reinert:2007:LLE, author = "Gesine Reinert and Michael S. Waterman", title = "On the Length of the Longest Exact Position Match in a Random Sequence", journal = j-TCBB, volume = "4", number = "1", pages = "153--156", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A mixed Poisson approximation and a Poisson approximation for the length of the longest exact match of a random sequence across another sequence are provided, where the match is required to start at position 1 in the first sequence. This problem arises when looking for suitable anchors in whole genome alignments.", acknowledgement = ack-nhfb, keywords = "Chen-Stein method; length of longest match; mixed Poisson approximation; Poisson approximation", } @Article{Au:2007:CAC, author = "Wai-Ho Au and Keith C. C. Chan and Andrew K. C. Wong and Yang Wang", title = "Correction to {``Attribute Clustering for Grouping, Selection, and Classification of Gene Expression Data''}", journal = j-TCBB, volume = "4", number = "1", pages = "157--157", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This is a correction to a typographical error in (11) in [1] which present the calculation of the sum of the multiple significant interdependence redundancy measure. Equation (11) in [1] should be: $$k=\arg\max\nolimits_{k\in\{2,\ldots,p\}}\sum_{r=1}^k \sum_{A_i\in\{C_r-\eta_r\}}R(A_i:\eta_r).$$(11)We remark that the experimental results reported in [1] are based on (11) above not (11) in [1].", acknowledgement = ack-nhfb, } @Article{Biology:2007:RL, author = "IEEE/ACM Transactions on Computational Biology and Bioinformatics staff", title = "2006 Reviewers List", journal = j-TCBB, volume = "4", number = "1", pages = "158--160", month = jan, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:20 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Rajapakse:2007:GEI, author = "Jagath C. Rajapakse and Yan-Qing Zhang and Gary B. Fogel", title = "{Guest Editors}' Introduction to the {Special Section: Computational Intelligence Approaches in Computational Biology and Bioinformatics}", journal = j-TCBB, volume = "4", number = "2", pages = "161--162", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Wang:2007:PBS, author = "Haiying Wang and Huiru Zheng and Francisco Azuaje", title = "{Poisson}-Based Self-Organizing Feature Maps and Hierarchical Clustering for Serial Analysis of Gene Expression Data", journal = j-TCBB, volume = "4", number = "2", pages = "163--175", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Serial analysis of gene expression (SAGE) is a powerful technique for global gene expression profiling, allowing simultaneous analysis of thousands of transcripts without prior structural and functional knowledge. Pattern discovery and visualization have become fundamental approaches to analyzing such large-scale gene expression data. From the pattern discovery perspective, clustering techniques have received great attention. However, due to the statistical nature of SAGE data (i.e., underlying distribution), traditional clustering techniques may not be suitable for SAGE data analysis. Based on the adaptation and improvement of Self-Organizing Maps and hierarchical clustering techniques, this paper presents two new clustering algorithms, namely, PoissonS and PoissonHC, for SAGE data analysis. Tested on synthetic and experimental SAGE data, these algorithms demonstrate several advantages over traditional pattern discovery techniques. The results indicate that, by incorporating statistical properties of SAGE data, PoissonS and PoissonHC, as well as a hybrid approach (neuro-hierarchical approach) based on the combination of PoissonS and PoissonHC, offer significant improvements in pattern discovery and visualization for SAGE data. Moreover, a user-friendly platform, which may improve and accelerate SAGE data mining, was implemented. The system is freely available on request from the authors for nonprofit use.", acknowledgement = ack-nhfb, keywords = "hybrid machine learning; Pattern discovery and visualization; Poisson distribution; self-organizing maps; serial analysis of gene expression.", } @Article{Sjahputera:2007:RAC, author = "Ozy Sjahputera and James M. Keller and J. Wade Davis and Kristen H. Taylor and Farahnaz Rahmatpanah and Huidong Shi and Derek T. Anderson and Samuel N. Blisard and Robert H. Luke and Mihail Popescu and Gerald C. Arthur and Charles W. Caldwell", title = "Relational Analysis of {CpG} Islands Methylation and Gene Expression in Human Lymphomas Using Possibilistic {C}-Means Clustering and Modified Cluster Fuzzy Density", journal = j-TCBB, volume = "4", number = "2", pages = "176--189", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Heterogeneous genetic and epigenetic alterations are commonly found in human non-Hodgkin's lymphomas (NHL). One such epigenetic alteration is aberrant methylation of gene promoter-related CpG islands, where hypermethylation frequently results in transcriptional inactivation of target genes, while a decrease or loss of promoter methylation (hypomethylation) is frequently associated with transcriptional activation. Discovering genes with these relationships in NHL or other types of cancers could lead to a better understanding of the pathobiology of these diseases. The simultaneous analysis of promoter methylation using Differential Methylation Hybridization (DMH) and its associated gene expression using Expressed CpG Island Sequence Tag (ECIST) microarrays generates a large volume of methylation-expression relational data. To analyze this data, we propose a set of algorithms based on fuzzy sets theory, in particular Possibilistic c-Means (PCM) and cluster fuzzy density. For each gene, these algorithms calculate measures of confidence of various methylation-expression relationships in each NHL subclass. Thus, these tools can be used as a means of high volume data exploration to better guide biological confirmation using independent molecular biology methods.", acknowledgement = ack-nhfb, keywords = "cluster density; clustering; expression; fuzzy sets; Methylation; microarray", } @Article{Lu:2007:ISL, author = "Yijuan Lu and Qi Tian and Feng Liu and Maribel Sanchez and Yufeng Wang", title = "Interactive Semisupervised Learning for Microarray Analysis", journal = j-TCBB, volume = "4", number = "2", pages = "190--203", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Microarray technology has generated vast amounts of gene expression data with distinct patterns. Based on the premise that genes of correlated functions tend to exhibit similar expression patterns, various machine learning methods have been applied to capture these specific patterns in microarray data. However, the discrepancy between the rich expression profiles and the limited knowledge of gene functions has been a major hurdle to the understanding of cellular networks. To bridge this gap so as to properly comprehend and interpret expression data, we introduce Relevance Feedback to microarray analysis and propose an interactive learning framework to incorporate the expert knowledge into the decision module. In order to find a good learning method and solve two intrinsic problems in microarray data, high dimensionality and small sample size, we also propose a semisupervised learning algorithm: Kernel Discriminant-EM (KDEM). This algorithm efficiently utilizes a large set of unlabeled data to compensate for the insufficiency of a small set of labeled data and it extends the linear algorithm in Discriminant-EM (DEM) to a kernel algorithm to handle nonlinearly separable data in a lower dimensional space. The Relevance Feedback technique and KDEM together construct an efficient and effective interactive semisupervised learning framework for microarray analysis. Extensive experiments on the yeast cell cycle regulation data set and Plasmodium falciparum red blood cell cycle data set show the promise of this approach.", acknowledgement = ack-nhfb, keywords = "kernel DEM; microarray analysis; relevance feedback; semisupervised learning", } @Article{Lerner:2007:CSI, author = "Boaz Lerner and Josepha Yeshaya and Lev Koushnir", title = "On the Classification of a Small Imbalanced Cytogenetic Image Database", journal = j-TCBB, volume = "4", number = "2", pages = "204--215", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Solving a multiclass classification task using a small imbalanced database of patterns of high dimension is difficult due to the curse-of-dimensionality and the bias of the training toward the majority classes. Such a problem has arisen while diagnosing genetic abnormalities by classifying a small database of fluorescence in situ hybridization signals of types having different frequencies of occurrence. We propose and experimentally study using the cytogenetic domain two solutions to the problem. The first is hierarchical decomposition of the classification task, where each hierarchy level is designed to tackle a simpler problem which is represented by classes that are approximately balanced. The second solution is balancing the data by up-sampling the minority classes accompanied by dimensionality reduction. Implemented by the naive Bayesian classifier or the multilayer perceptron neural network, both solutions have diminished the problem and contributed to accuracy improvement. In addition, the experiments suggest that coping with the smallness of the data is more beneficial than dealing with its imbalance.", acknowledgement = ack-nhfb, keywords = "classification; dimensionality reduction; genetic diagnosis; imbalanced data; multilayer perceptron (MLP); naive Bayesian classifier (NBC); small sample size.", } @Article{Igel:2007:GBO, author = "Christian Igel and Tobias Glasmachers and Britta Mersch and Nico Pfeifer and Peter Meinicke", title = "Gradient-Based Optimization of Kernel-Target Alignment for Sequence Kernels Applied to Bacterial Gene Start Detection", journal = j-TCBB, volume = "4", number = "2", pages = "216--226", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Biological data mining using kernel methods can be improved by a task-specific choice of the kernel function. Oligo kernels for genomic sequence analysis have proven to have a high discriminative power and to provide interpretable results. Oligo kernels that consider subsequences of different lengths can be combined and parameterized to increase their flexibility. For adapting these parameters efficiently, gradient-based optimization of the kernel-target alignment is proposed. The power of this new, general model selection procedure and the benefits of fitting kernels to problem classes are demonstrated by adapting oligo kernels for bacterial gene start detection.", acknowledgement = ack-nhfb, keywords = "kernel target alignment; model selection; oligo kernel; sequence analysis; support vector machines; translation initiation sites", } @Article{Ogul:2007:SLP, author = "Hasan Ogul and Erkan U. Mumcuo{\u{g}}lu", title = "Subcellular Localization Prediction with New Protein Encoding Schemes", journal = j-TCBB, volume = "4", number = "2", pages = "227--232", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Subcellular localization is one of the key properties in functional annotation of proteins. Support vector machines (SVMs) have been widely used for automated prediction of subcellular localizations. Existing methods differ in the protein encoding schemes used. In this study, we present two methods for protein encoding to be used for SVM-based subcellular localization prediction: n{\hbox{-}}\rm peptide compositions with reduced amino acid alphabets for larger values of n and pairwise sequence similarity scores based on whole sequence and N-terminal sequence. We tested the methods on a common benchmarking data set that consists of 2,427 eukaryotic proteins with four localization sites. As a result of 5-fold cross-validation tests, the encoding with n{\hbox{-}}\rm peptide compositions provided the accuracies of 84.5, 88.9, 66.3, and 94.3 percent for cytoplasmic, extracellular, mitochondrial, and nuclear proteins, where the overall accuracy was 87.1 percent. The second method provided 83.6, 87.7, 87.9, and 90.5 percent accuracies for individual locations and 87.8 percent overall accuracy. A hybrid system, which we called PredLOC, makes a final decision based on the results of the two presented methods which achieved an overall accuracy of 91.3 percent, which is better than the achievements of many of the existing methods. The new system also outperformed the recent methods in the experiments conducted on a new-unique SWISSPROT test set.", acknowledgement = ack-nhfb, keywords = "n{\hbox{-}}\rm peptide composition; probabilistic suffix tree; subcellular localization; support vector machines.", } @Article{Li:2007:DSD, author = "Wenyuan Li and Ying Liu and Hung-Chung Huang and Yanxiong Peng and Yongjing Lin and Wee-Keong Ng and Kok-Leong Ong", title = "Dynamical Systems for Discovering Protein Complexes and Functional Modules from Biological Networks", journal = j-TCBB, volume = "4", number = "2", pages = "233--250", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Recent advances in high throughput experiments and annotations via published literature have provided a wealth of interaction maps of several biomolecular networks, including metabolic, protein-protein, and protein-DNA interaction networks. The architecture of these molecular networks reveals important principles of cellular organization and molecular functions. Analyzing such networks, i.e., discovering dense regions in the network, is an important way to identify protein complexes and functional modules. This task has been formulated as the problem of finding heavy subgraphs, the Heaviest k{\hbox{-}}\rm Subgraph Problem (k{\hbox{-}}\rm HSP), which itself is NP-hard. However, any method based on the k{\hbox{-}}\rm HSP requires the parameter k and an exact solution of k{\hbox{-}}\rm HSP may still end up as a `spurious' heavy subgraph, thus reducing its practicability in analyzing large scale biological networks. We proposed a new formulation, called the rank-HSP, and two dynamical systems to approximate its results. In addition, a novel metric, called the Standard deviation and Mean Ratio (SMR), is proposed for use in `spurious' heavy subgraphs to automate the discovery by setting a fixed threshold. Empirical results on both the simulated graphs and biological networks have demonstrated the efficiency and effectiveness of our proposal.", acknowledgement = ack-nhfb, keywords = "bioinformatics databases; evolutionary computing; Graph algorithms; neural nets", } @Article{Hu:2007:DMP, author = "Xiaohua Hu and Daniel D. Wu", title = "Data Mining and Predictive Modeling of Biomolecular Network from Biomedical Literature Databases", journal = j-TCBB, volume = "4", number = "2", pages = "251--263", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In this paper, we present a novel approach Bio-IEDM (Biomedical Information Extraction and Data Mining) to integrate text mining and predictive modeling to analyze biomolecular network from biomedical literature databases. Our method consists of two phases. In phase 1, we discuss a semisupervised efficient learning approach to automatically extract biological relationships such as protein-protein interaction, protein-gene interaction from the biomedical literature databases to construct the biomolecular network. Our method automatically learns the patterns based on a few user seed tuples and then extracts new tuples from the biomedical literature based on the discovered patterns. The derived biomolecular network forms a large scale-free network graph. In phase 2, we present a novel clustering algorithm to analyze the biomolecular network graph to identify biologically meaningful subnetworks (communities). The clustering algorithm considers the characteristics of the scale-free network graphs and is based on the local density of the vertex and its neighborhood functions that can be used to find more meaningful clusters with different density level. The experimental results indicate our approach is very effective in extracting biological knowledge from a huge collection of biomedical literature. The integration of data mining and information extraction provides a promising direction for analyzing the biomolecular network.", acknowledgement = ack-nhfb, keywords = "biological complexes (communities); biomolecular network; information extraction; scale-free network; semisupervised learning", } @Article{Neri:2007:AMA, author = "Ferrante Neri and Jari Toivanen and Giuseppe Leonardo Cascella and Yew-Soon Ong", title = "An Adaptive Multimeme Algorithm for Designing {HIV} Multidrug Therapies", journal = j-TCBB, volume = "4", number = "2", pages = "264--278", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This paper proposes a period representation for modeling the multidrug HIV therapies and an Adaptive Multimeme Algorithm (AMmA) for designing the optimal therapy. The period representation offers benefits in terms of flexibility and reduction in dimensionality compared to the binary representation. The AMmA is a memetic algorithm which employs a list of three local searchers adaptively activated by an evolutionary framework. These local searchers, having different features according to the exploration logic and the pivot rule, have the role of exploring the decision space from different and complementary perspectives and, thus, assisting the standard evolutionary operators in the optimization process. Furthermore, the AMmA makes use of an adaptation which dynamically sets the algorithmic parameters in order to prevent stagnation and premature convergence. The numerical results demonstrate that the application of the proposed algorithm leads to very efficient medication schedules which quickly stimulate a strong immune response to HIV. The earlier termination of the medication schedule leads to lesser unpleasant side effects for the patient due to strong antiretroviral therapy. A numerical comparison shows that the AMmA is more efficient than three popular metaheuristics. Finally, a statistical test based on the calculation of the tolerance interval confirms the superiority of the AMmA compared to the other methods for the problem under study.", acknowledgement = ack-nhfb, keywords = "adaptive algorithms; HIV therapy design; memetic algorithms; nonlinear integer programming.", } @Article{Handl:2007:MOB, author = "Julia Handl and Douglas B. Kell and Joshua Knowles", title = "Multiobjective Optimization in Bioinformatics and Computational Biology", journal = j-TCBB, volume = "4", number = "2", pages = "279--292", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This paper reviews the application of multiobjective optimization in the fields of bioinformatics and computational biology. A survey of existing work, organized by application area, forms the main body of the review, following an introduction to the key concepts in multiobjective optimization. An original contribution of the review is the identification of five distinct `contexts,' giving rise to multiple objectives: These are used to explain the reasons behind the use of multiobjective optimization in each application area and also to point the way to potential future uses of the technique.", acknowledgement = ack-nhfb, keywords = "bioinformatics (genome or protein) databases; classification and association rules; clustering; experimental design; global optimization; interactive data exploration and discovery; machine learning", } @Article{Bontempi:2007:BSI, author = "Gianluca Bontempi", title = "A Blocking Strategy to Improve Gene Selection for Classification of Gene Expression Data", journal = j-TCBB, volume = "4", number = "2", pages = "293--300", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Because of high dimensionality, machine learning algorithms typically rely on feature selection techniques in order to perform effective classification in microarray gene expression data sets. However, the large number of features compared to the number of samples makes the task of feature selection computationally hard and prone to errors. This paper interprets feature selection as a task of stochastic optimization, where the goal is to select among an exponential number of alternative gene subsets the one expected to return the highest generalization in classification. Blocking is an experimental design strategy which produces similar experimental conditions to compare alternative stochastic configurations in order to be confident that observed differences in accuracy are due to actual differences rather than to fluctuations and noise effects. We propose an original blocking strategy for improving feature selection which aggregates in a paired way the validation outcomes of several learning algorithms to assess a gene subset and compare it to others. This is a novelty with respect to conventional wrappers, which commonly adopt a sole learning algorithm to evaluate the relevance of a given set of variables. The rationale of the approach is that, by increasing the amount of experimental conditions under which we validate a feature subset, we can lessen the problems related to the scarcity of samples and consequently come up with a better selection. The paper shows that the blocking strategy significantly improves the performance of a conventional forward selection for a set of 16 publicly available cancer expression data sets. The experiments involve six different classifiers and show that improvements take place independent of the classification algorithm used after the selection step. Two further validations based on available biological annotation support the claim that blocking strategies in feature selection may improve the accuracy and the quality of the solution. The first validation is based on retrieving PubMEd abstracts associated to the selected genes and matching them to regular expressions describing the biological phenomenon underlying the expression data sets. The biological validation that follows is based on the use of the Bioconductor package GoStats in order to perform Gene Ontology statistical analysis.", acknowledgement = ack-nhfb, keywords = "bioinformatics (genome or protein) databases; data mining; feature evaluation and selection; machine learning", } @Article{Diekmann:2007:EUR, author = "Yoan Diekmann and Marie-France Sagot and Eric Tannier", title = "Evolution under Reversals: Parsimony and Conservation of Common Intervals", journal = j-TCBB, volume = "4", number = "2", pages = "301--309", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In comparative genomics, gene order data is often modeled as signed permutations. A classical problem for genome comparison is to detect common intervals in permutations, that is, genes that are colocalized in several species, indicating that they remained grouped during evolution. A second largely studied problem related to gene order is to compute a minimum scenario of reversals that transforms a signed permutation into another. Several studies began to mix the two problems and it was observed that their results are not always compatible: Often, parsimonious scenarios of reversals break common intervals. If a scenario does not break any common interval, it is called perfect. In two recent studies, B{\'e}rard et al. defined a class of permutations for which building a perfect scenario of reversals sorting a permutation was achieved in polynomial time and stated as an open question whether it is possible to decide, given a permutation, if there exists a minimum scenario of reversals that is perfect. In this paper, we give a solution to this problem and prove that this widens the class of permutations addressed by the aforementioned studies. We implemented and tested this algorithm on gene order data of chromosomes from several mammal species and we compared it to other methods. The algorithm helps to choose among several possible scenarios of reversals and indicates that the minimum scenario of reversals is not always the most plausible.", acknowledgement = ack-nhfb, keywords = "common intervals; computational biology; genome rearrangements; perfect sorting; signed permutations; sorting by reversals", } @Article{Weskamp:2007:MGA, author = "Nils Weskamp and Eyke Hullermeier and Daniel Kuhn and Gerhard Klebe", title = "Multiple Graph Alignment for the Structural Analysis of Protein Active Sites", journal = j-TCBB, volume = "4", number = "2", pages = "310--320", month = apr, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:57:55 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Graphs are frequently used to describe the geometry and also the physicochemical composition of protein active sites. Here, the concept of graph alignment as a novel method for the structural analysis of protein binding pockets is presented. Using inexact graph-matching techniques, one is able to identify both conserved areas and regions of difference among different binding pockets. Thus, using multiple graph alignments, it is possible to characterize functional protein families and to examine differences among related protein families independent of sequence or fold homology. Optimized algorithms are described for the efficient calculation of multiple graph alignments for the analysis of physicochemical descriptors representing protein binding pockets. Additionally, it is shown how the calculated graph alignments can be analyzed to identify structural features that are characteristic for a given protein family and also features that are discriminative among related families. The methods are applied to a substantial high-quality subset of the PDB database and their ability to successfully characterize and classify 10 highly populated functional protein families is shown. Additionally, two related protein families from the group of serine proteases are examined and important structural differences are detected automatically and efficiently.", acknowledgement = ack-nhfb, keywords = "drug design; fuzzy patterns; graph mining; knowledge discovery in databases; structural pattern discovery", } @Article{Gusfield:2007:AEAb, author = "Dan Gusfield", title = "{Associate Editor} Appreciation and Welcome", journal = j-TCBB, volume = "4", number = "3", pages = "321--321", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Fujarewicz:2007:ASM, author = "Krzysztof Fujarewicz and Marek Kimmel and Tomasz Lipniacki and Andrzej Swierniak", title = "Adjoint Systems for Models of Cell Signaling Pathways and their Application to Parameter Fitting", journal = j-TCBB, volume = "4", number = "3", pages = "322--335", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The paper concerns the problem of fitting mathematical models of cell signaling pathways. Such models frequently take the form of sets of nonlinear ordinary differential equations. While the model is continuous in time, the performance index used in the fitting procedure, involves measurements taken at discrete time moments. Adjoint sensitivity analysis is a tool, which can be used for finding the gradient of a performance index in the space of parameters of the model. In the paper a structural formulation of adjoint sensitivity analysis called the Generalized Backpropagation Through Time (GBPTT) is used. The method is especially suited for hybrid, continuous-discrete time systems. As an example we use the mathematical model of the NF-kB regulatory module, which plays a major role in the innate immune response in animals.", acknowledgement = ack-nhfb, keywords = "biology and genetics; modeling; ordinary differential equations; parameter learning", } @Article{Wan:2007:CCN, author = "Xiang Wan and Guohui Lin", title = "{CISA}: Combined {NMR} Resonance Connectivity Information Determination and Sequential Assignment", journal = j-TCBB, volume = "4", number = "3", pages = "336--348", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A nearly complete sequential resonance assignment is a key factor leading to successful protein structure determination via NMR spectroscopy. Assuming the availability of a set of NMR spectral peak lists, most of the existing assignment algorithms first use the differences between chemical shift values for common nuclei across multiple spectra to provide the evidence that some pairs of peaks should be assigned to sequentially adjacent amino acid residues in the target protein. They then use these connectivities as constraints to produce a sequential assignment. At various levels of success, these algorithms typically generate a large number of potential connectivity constraints, and it grows exponentially as the quality of spectral data decreases. A key observation used in our sequential assignment program, CISA, is that chemical shift residual signature information can be used to improve the connectivity determination, and thus to dramatically decrease the number of predicted connectivity constraints. Fewer connectivity constraints lead to less ambiguities in the sequential assignment. Extensive simulation studies on several large test datasets demonstrated that CISA is efficient and effective, compared to three most recently proposed sequential resonance assignment programs RANDOM, PACES, and MARS.", acknowledgement = ack-nhfb, keywords = "NMR sequential resonance assignment; spin system; spin system assignment; spin system residual signature; spin system sequential connectivity", } @Article{Cameron:2007:CCS, author = "Michael Cameron and Hugh Williams", title = "Comparing Compressed Sequences for Faster Nucleotide {BLAST} Searches", journal = j-TCBB, volume = "4", number = "3", pages = "349--364", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Molecular biologists, geneticists, and other life scientists use the BLAST homology search package as their first step for discovery of information about unknown or poorly annotated genomic sequences. There are two main variants of BLAST: BLASTP for searching protein collections and BLASTN for nucleotide collections. Surprisingly, BLASTN has had very little attention; for example, the algorithms it uses do not follow those described in the 1997 BLAST paper [1] and no exact description has been published. It is important that BLASTN is state-of-the-art: Nucleotide collections such as GenBank dwarf the protein collections in size, they double in size almost yearly, and they take many minutes to search on modern general purpose workstations. This paper proposes significant improvements to the BLASTN algorithms. Each of our schemes is based on compressed bytepacked formats that allow queries and collection sequences to be compared four bases at a time, permitting very fast query evaluation using lookup tables and numeric comparisons. Our most significant innovations are two new, fast gapped alignment schemes that allow accurate sequence alignment without decompression of the collection sequences. Overall, our innovations more than double the speed of BLASTN with no effect on accuracy and have been integrated into our new version of BLAST that is freely available for download from http://www.fsa-blast.org/.", acknowledgement = ack-nhfb, keywords = "BLAST; compression; Four Russians algorithm; homology search; sequence alignment", } @Article{Tang:2007:DTS, author = "Yuchun Tang and Yan-Qing Zhang and Zhen Huang", title = "Development of Two-Stage {SVM}-{RFE} Gene Selection Strategy for Microarray Expression Data Analysis", journal = j-TCBB, volume = "4", number = "3", pages = "365--381", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Extracting a subset of informative genes from microarray expression data is a critical data preparation step in cancer classification and other biological function analyses. Though many algorithms have been developed, the Support Vector Machine - Recursive Feature Elimination (SVM-RFE) algorithm is one of the best gene feature selection algorithms. It assumes that a smaller `filter-out' factor in the SVM-RFE, which results in a smaller number of gene features eliminated in each recursion, should lead to extraction of a better gene subset. Because the SVM-RFE is highly sensitive to the `filter-out' factor, our simulations have shown that this assumption is not always correct and that the SVM-RFE is an unstable algorithm. To select a set of key gene features for reliable prediction of cancer types or subtypes and other applications, a new two-stage SVM-RFE algorithm has been developed. It is designed to effectively eliminate most of the irrelevant, redundant and noisy genes while keeping information loss small at the first stage. A fine selection for the final gene subset is then performed at the second stage. The two-stage SVM-RFE overcomes the instability problem of the SVM-RFE to achieve better algorithm utility. We have demonstrated that the two-stage SVM-RFE is significantly more accurate and more reliable than the SVM-RFE and three correlation-based methods based on our analysis of three publicly available microarray expression datasets. Furthermore, the two-stage SVM-RFE is computationally efficient because its time complexity is $O(d * \log{_2d})$, where $d$ is the size of the original gene set.", acknowledgement = ack-nhfb, keywords = "bioinformatics; cancer classification; feature selection; gene selection; microarray gene expression data analysis; recursive feature elimination; support vector machines", } @Article{Ng:2007:NGW, author = "Lydia Ng and Sayan Pathak and Chihchau Kuan and Chris Lau and Hong-wei Dong and Andrew Sodt and Chinh Dang and Brian Avants and Paul Yushkevich and James Gee and David Haynor and Ed Lein and Allan Jones and Mike Hawrylycz", title = "Neuroinformatics for Genome-Wide {$3$-D} Gene Expression Mapping in the Mouse Brain", journal = j-TCBB, volume = "4", number = "3", pages = "382--393", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Large scale gene expression studies in the mammalian brain offer the promise of understanding the topology, networks and ultimately the function of its complex anatomy, opening previously unexplored avenues in neuroscience. High-throughput methods permit genome-wide searches to discover genes that are uniquely expressed in brain circuits and regions that control behavior. Previous gene expression mapping studies in model organisms have employed situ hybridization (ISH), a technique that uses labeled nucleic acid probes to bind to specific mRNA transcripts in tissue sections. A key requirement for this effort is the development of fast and robust algorithms for anatomically mapping and quantifying gene expression for ISH. We describe a neuroinformatics pipeline for automatically mapping expression profiles of ISH data and its use to produce the first genomic scale 3-D mapping of gene expression in a mammalian brain. The pipeline is fully automated and adaptable to other organisms and tissues. Our automated study of over 20,000 genes indicates that at least 78.8\% are expressed at some level in the adult C56BL/6J mouse brain. In addition to providing a platform for genomic scale search, high-resolution images and visualization tools for expression analysis are available at the Allen Brain Atlas web site (http://www.brain-map.org).", acknowledgement = ack-nhfb, keywords = "bioinformatics (genome or protein) databases; data mining; information visualization; registration; segmentation", } @Article{Nguyen:2007:RRN, author = "C. Thach Nguyen and Nguyen Bao Nguyen and Wing-Kin Sung and Louxin Zhang", title = "Reconstructing Recombination Network from Sequence Data: The Small Parsimony Problem", journal = j-TCBB, volume = "4", number = "3", pages = "394--402", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The small parsimony problem is studied for reconstructing recombination networks from sequence data. The small parsimony problem is polynomial-time solvable for phylogenetic trees. However, the problem is proved NP-hard even for galled recombination networks. A dynamic programming algorithm is also developed to solve the small parsimony problem. It takes $O(dn2^{3h})$ time on an input recombination network over length-$d$ sequences in which there are $h$ recombination and $n - h$ tree nodes.", acknowledgement = ack-nhfb, keywords = "approximability; combination network; dynamic programming; NP-hardness; parsimony method; phylogenetic network", } @Article{Lones:2007:RMD, author = "Michael Lones and Andy Tyrrell", title = "Regulatory Motif Discovery Using a Population Clustering Evolutionary Algorithm", journal = j-TCBB, volume = "4", number = "3", pages = "403--414", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This paper describes a novel evolutionary algorithm for regulatory motif discovery in DNA promoter sequences. The algorithm uses data clustering to logically distribute the evolving population across the search space. Mating then takes place within local regions of the population, promoting overall solution diversity and encouraging discovery of multiple solutions. Experiments using synthetic data sets have demonstrated the algorithm's capacity to find position frequency matrix models of known regulatory motifs in relatively long promoter sequences. These experiments have also shown the algorithm's ability to maintain diversity during search and discover multiple motifs within a single population. The utility of the algorithm for discovering motifs in real biological data is demonstrated by its ability to find meaningful motifs within muscle-specific regulatory sequences.", acknowledgement = ack-nhfb, keywords = "evolutionary computation; motif discovery; muscle-specific gene expression; population-based data clustering; transcription factor binding sites", } @Article{Yip:2007:SIS, author = "Andy M. Yip and Michael K. Ng and Edmond H. Wu and Tony F. Chan", title = "Strategies for Identifying Statistically Significant Dense Regions in Microarray Data", journal = j-TCBB, volume = "4", number = "3", pages = "415--429", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We propose and study the notion of dense regions for the analysis of categorized gene expression data and present some searching algorithms for discovering them. The algorithms can be applied to any categorical data matrices derived from gene expression level matrices. We demonstrate that dense regions are simple but useful and statistically significant patterns that can be used to 1) identify genes and/or samples of interest and 2) eliminate genes and/or samples corresponding to outliers, noise, or abnormalities. Some theoretical studies on the properties of the dense regions are presented which allow us to characterize dense regions into several classes and to derive tailor-made algorithms for different classes of regions. Moreover, an empirical simulation study on the distribution of the size of dense regions is carried out which is then used to assess the significance of dense regions and to derive effective pruning methods to speed up the searching algorithms. Real microarray data sets are employed to test our methods. Comparisons with six other well-known clustering algorithms using synthetic and real data are also conducted which confirm the superiority of our methods in discovering dense regions. The DRIFT code and a tutorial are available as supplemental material, which can be found on the Computer Society Digital Library at http://computer.org/tcbb/archives.htm.", acknowledgement = ack-nhfb, keywords = "bicluster; categorical data; clustering; coexpressed genes; dense region; gene expression; microarray", } @Article{Liang:2007:BBD, author = "Kuo-ching Liang and Xiaodong Wang and Dimitris Anastassiou", title = "Bayesian Basecalling for {DNA} Sequence Analysis Using Hidden Markov Models", journal = j-TCBB, volume = "4", number = "3", pages = "430--440", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "It has been shown that electropherograms of DNA sequences can be modeled with hidden Markov models. Basecalling, the procedure that determines the sequence of bases from the given eletropherogram, can then be performed using the Viterbi algorithm. A training step is required prior to basecalling in order to estimate the HMM parameters. In this paper, we propose a Bayesian approach which employs the Markov chain Monte Carlo (MCMC) method to perform basecalling. Such an approach not only allows one to naturally encode the prior biological knowledge into the basecalling algorithm, it also exploits both the training data and the basecalling data in estimating the HMM parameters, leading to more accurate estimates. Using the recently sequenced genome of the organism Legionella pneumophila we show that the MCMC basecaller outperforms the state-of-the-art basecalling algorithm in terms of total errors while requiring much less training than other proposed statistical basecallers.", acknowledgement = ack-nhfb, keywords = "basecalling; DNA sequencing; electropherogram; hidden Markov model (HMM); Markov chain Monte Carlo (MCMC)", } @Article{Thireou:2007:BLS, author = "Trias Thireou and Martin Reczko", title = "Bidirectional Long Short-Term Memory Networks for Predicting the Subcellular Localization of Eukaryotic Proteins", journal = j-TCBB, volume = "4", number = "3", pages = "441--446", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "An algorithm called Bidirectional Long Short-Term Memory Networks (BLSTM) for processing sequential data is introduced. This supervised learning method trains a special recurrent neural network to use very long ranged symmetric sequence context using a combination of nonlinear processing elements and linear feedback loops for storing long-range context. The algorithm is applied to the sequence-based prediction of protein localization and predicts 93.3\% novel non-plant proteins and 88.4\% novel plant proteins correctly, which is an improvement over feedforward and standard recurrent networks solving the same problem. The BLSTM system is available as a web-service (http://www.stepc.gr/~synaptic/blstm.html).", acknowledgement = ack-nhfb, keywords = "biological sequence analysis; long short-term memory; protein subcellular localization prediction; recurrent neural networks", } @Article{Korodi:2007:CAN, author = "Gergely Korodi and Ioan Tabus", title = "Compression of Annotated Nucleotide Sequences", journal = j-TCBB, volume = "4", number = "3", pages = "447--457", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This article introduces an algorithm for the lossless compression of DNA files, which contain annotation text besides the nucleotide sequence. First a grammar is specifically designed to capture the regularities of the annotation text. A revertible transformation uses the grammar rules in order to equivalently represent the original file as a collection of parsed segments and a sequence of decisions made by the grammar parser. This decomposition enables the efficient use of state-of-the-art encoders for processing the parsed segments. The output size of the decision-making process of the grammar is optimized by extending the states to account for high-order Markovian dependencies. The practical implementation of the algorithm achieves a significant improvement when compared to the general-purpose methods currently used for DNA files.", acknowledgement = ack-nhfb, keywords = "4 [Data]: Coding and Information Theory | Data compaction and compression; Annotation; Compression; F.4 [Theory of Computation]: Mathematical Logic and Formal Languages | Formal languages; Formal Grammars; G.3 [Mathematics of Computing]: Probability and Statistics | Markov processes; J.3 [Computer Applications]: Life and Medical Sciences | Biology and genetics; nucleotide sequences", } @Article{Bordewich:2007:CHN, author = "Magnus Bordewich and Charles Semple", title = "Computing the Hybridization Number of Two Phylogenetic Trees Is Fixed-Parameter Tractable", journal = j-TCBB, volume = "4", number = "3", pages = "458--466", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Reticulation processes in evolution mean that the ancestral history of certain groups of present-day species is non-tree-like. These processes include hybridization, lateral gene transfer, and recombination. Despite the existence of reticulation, such events are relatively rare and so a fundamental problem for biologists is the following: given a collection of rooted binary phylogenetic trees on sets of species that correctly represent the tree-like evolution of different parts of their genomes, what is the smallest number of `reticulation' vertices in any network that explains the evolution of the species under consideration. It has been previously shown that this problem is NP-hard even when the collection consists of only two rooted binary phylogenetic trees. However, in this paper, we show that the problem is fixed-parameter tractable in the two-tree instance, when parameterized by this smallest number of reticulation vertices.", acknowledgement = ack-nhfb, keywords = "agreement forest; hybridization network; reticulate evolution; rooted phylogenetic tree; subtree prune and regraft", } @Article{Huang:2007:EGS, author = "D. Huang and Tommy Chow", title = "Effective Gene Selection Method With Small Sample Sets Using Gradient-Based and Point Injection Techniques", journal = j-TCBB, volume = "4", number = "3", pages = "467--475", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Microarray gene expression data usually consist of a large amount of genes. Among these genes, only a small fraction is informative for performing cancer diagnostic test. This paper focuses on effective identification of informative genes. We analyze gene selection models from the perspective of optimization theory. As a result, a new strategy is designed to modify conventional search engines. Also, as overfitting is likely to occur in microarray data because of their small sample set, a point injection technique is developed to address the problem of overfitting. The proposed strategies have been evaluated on three kinds of cancer diagnosis. Our results show that the proposed strategies can improve the performance of gene selection substantially. The experimental results also indicate that the proposed methods are very robust under all the investigated cases.", acknowledgement = ack-nhfb, keywords = "gene selection; gradient based learning; optimization theory; point injection", } @Article{Hecht:2007:HTL, author = "David Hecht and Gary Fogel", title = "High-Throughput Ligand Screening via Preclustering and Evolved Neural Networks", journal = j-TCBB, volume = "4", number = "3", pages = "476--484", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The pathway for novel lead drug discovery has many major deficiencies, the most significant of which is the immense size of small molecule diversity space. Methods that increase the search efficiency and/or reduce the size of the search space, increase the rate at which useful lead compounds are identified. Artificial neural networks optimized via evolutionary computation provide a cost and time-effective solution to this problem. Here, we present results that suggest preclustering of small molecules prior to neural network optimization is useful for generating models of quantitative structure-activity relationships for a set of HIV inhibitors. Using these methods, it is possible to prescreen compounds to separate active from inactive compounds or even actives and mildly active compounds from inactive compounds with high predictive accuracy while simultaneously reducing the feature space. It is also possible to identify `human interpretable' features from the best models that can be used for proposal and synthesis of new compounds in order to optimize potency and specificity.", acknowledgement = ack-nhfb, keywords = "artificial neural networks; computational intelligence; evolutionary computation; medicine and science", } @Article{Zhang:2007:MCU, author = "Runxuan Zhang and Guang-Bin Huang and N. Sundararajan and P. Saratchandran", title = "Multicategory Classification Using An Extreme Learning Machine for Microarray Gene Expression Cancer Diagnosis", journal = j-TCBB, volume = "4", number = "3", pages = "485--495", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In this paper, the recently developed Extreme Learning Machine (ELM) is used for direct multicategory classification problems in the cancer diagnosis area. ELM avoids problems like local minima, improper learning rate and overfitting commonly faced by iterative learning methods and completes the training very fast. We have evaluated the multi-category classification performance of ELM on three benchmark microarray datasets for cancer diagnosis, namely, the GCM dataset, the Lung dataset and the Lymphoma dataset. The results indicate that ELM produces comparable or better classification accuracies with reduced training time and implementation complexity compared to artificial neural networks methods like conventional back-propagation ANN, Linder's SANN, and Support Vector Machine methods like SVM-OVO and Ramaswamy's SVM-OVA. ELM also achieves better accuracies for classification of individual categories.", acknowledgement = ack-nhfb, keywords = "extreme learning machine; gene expression; microarray; multi-category classification; SVM", } @Article{Zhang:2007:SSS, author = "Louxin Zhang", title = "Superiority of Spaced Seeds for Homology Search", journal = j-TCBB, volume = "4", number = "3", pages = "496--505", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In homology search, good spaced seeds have higher sensitivity for the same cost (weight). However, elucidating the mechanism that confers power to spaced seeds and characterizing optimal spaced seeds still remain unsolved. This paper investigates these two important open questions by formally analyzing the average number of non-overlapping hits and the hit probability of a spaced seed in the Bernoulli sequence model. We prove that when the length of a non-uniformly spaced seed is bounded above by an exponential function of the seed weight, the seed outperforms strictly the traditional consecutive seed of the same weight in both (i) the average number of non-overlapping hits and (ii) the asymptotic hit probability. This clearly answers the first problem mentioned above in the Bernoulli sequence model. The theoretical study in this paper also gives a new solution to finding long optimal seeds.", acknowledgement = ack-nhfb, keywords = "homology search; pattern matching; renewal theory; run statistics; sequence alignment; spaced seeds", } @Article{Matsen:2007:OCT, author = "Frederick Matsen", title = "Optimization Over a Class of Tree Shape Statistics", journal = j-TCBB, volume = "4", number = "3", pages = "506--512", month = jul, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:24 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Tree shape statistics quantify some aspect of the shape of a phylogenetic tree. They are commonly used to compare reconstructed trees to evolutionary models and to find evidence of tree reconstruction bias. Historically, to find a useful tree shape statistic, formulas have been invented by hand and then evaluated for utility. This article presents the first method which is capable of optimizing over a class of tree shape statistics, called Binary Recursive Tree Shape Statistics (BRTSS). After defining the BRTSS class, a set of algebraic expressions is defined which can be used in the recursions. The tree shape statistics definable using these expressions in the BRTSS is very general, and includes many of the statistics with which phylogenetic researchers are already familiar. We then present a practical genetic algorithm which is capable of performing optimization over BRTSS given any objective function. The chapter concludes with a successful application of the methods to find a new statistic which indicates a significant difference between two distributions on trees which were previously postulated to have similar properties.", acknowledgement = ack-nhfb, keywords = "biology and genetics; evolutionary computing and genetic algorithms", } @Article{Mandoiu:2007:GEI, author = "Ion I. M{\~a}ndoiu and Yi Pan and Alexander Zelikovsky", title = "{Guest Editors}' Introduction to the {Special Section on Bioinformatics Research and Applications}", journal = j-TCBB, volume = "4", number = "4", pages = "513--514", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zheng:2007:RNA, author = "Chunfang Zheng and Qian Zhu and David Sankoff", title = "Removing Noise and Ambiguities from Comparative Maps in Rearrangement Analysis", journal = j-TCBB, volume = "4", number = "4", pages = "515--522", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Comparison of genomic maps is hampered by errors and ambiguities introduced by mapping technology, incorrectly resolved paralogy, small samples of markers and extensive genome rearrangement. We design an analysis to remove or resolve most of these problems and to extract corrected data where markers occur in consecutive strips in both genomes. To do this we introduce the notion of pre-strip, an efficient way of generating these, and a compatibility analysis culminating in a Maximum Weighted Clique (MWC) search. The output can be directly analyzed with genome rearrangement algorithms, allowing the restoration of some of the data not incorporated into the clique solution. We investigate the trade-off between criteria for discarding excessive pre-strips to make MWC feasible, in terms of retaining as many markers as possible in the solution and producing an economical rearrangement analysis. We explore these questions through simulation and through comparison of the rice and sorghum genomes.", acknowledgement = ack-nhfb, keywords = "genome rearrangements; Maximum Weight Clique; rice; sorghum; synteny blocks", } @Article{Blin:2007:CGD, author = "Guillaume Blin and Cedric Chauve and Guillaume Fertin and Romeo Rizzi and Stephane Vialette", title = "Comparing Genomes with Duplications: {A} Computational Complexity Point of View", journal = j-TCBB, volume = "4", number = "4", pages = "523--534", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In this paper, we are interested in the computational complexity of computing (dis)similarity measures between two genomes when they contain duplicated genes or genomic markers, a problem that happens frequently when comparing whole nuclear genomes. Recently, several methods ( [1], [2]) have been proposed that are based on two steps to compute a given (dis)similarity measure M between two genomes G_1 and G_2: first, one establishes a one-to-one correspondence between genes of G_1 and genes of G_2 ; second, once this correspondence is established, it defines explicitly a permutation and it is then possible to quantify their similarity using classical measures defined for permutations, like the number of breakpoints. Hence these methods rely on two elements: a way to establish a one-to-one correspondence between genes of a pair of genomes, and a (dis)similarity measure for permutations. The problem is then, given a (dis)similarity measure for permutations, to compute a correspondence that defines an optimal permutation for this measure. We are interested here in two models to compute a one-to-one correspondence: the exemplar model, where all but one copy are deleted in both genomes for each gene family, and the matching model, that computes a maximal correspondence for each gene family. We show that for these two models, and for three (dis)similarity measures on permutations, namely the number of common intervals, the maximum adjacency disruption (MAD) number and the summed adjacency disruption (SAD) number, the problem of computing an optimal correspondence is NP-complete, and even APXhard for the MAD number and SAD number.", acknowledgement = ack-nhfb, keywords = "common intervals; comparative genomics; computational complexity; maximum adjacency disruption number; summed adjacency disruption number", } @Article{Bonizzoni:2007:ELC, author = "Paola Bonizzoni and Gianluca Della Vedova and Riccardo Dondi and Guillaume Fertin and Raffaella Rizzi and Stephane Vialette", title = "Exemplar Longest Common Subsequence", journal = j-TCBB, volume = "4", number = "4", pages = "535--543", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In this paper, we investigate the computational and approximation complexity of the Exemplar Longest Common Subsequence of a set of sequences (ELCS problem), a generalization of the Longest Common Subsequence problem, where the input sequences are over the union of two disjoint sets of symbols, a set of mandatory symbols and a set of optional symbols. We show that different versions of the problem are APX-hard even for instances with two sequences. Moreover, we show that the related problem of determining the existence of a feasible solution of the Exemplar Longest Common Subsequence of two sequences is NP-hard. On the positive side, we first present an efficient algorithm for the ELCS problem over instances of two sequences where each mandatory symbol can appear in total at most three times in the sequences. Furthermore, we present two fixed-parameter algorithms for the ELCS problem over instances of two sequences where the parameter is the number of mandatory symbols.", acknowledgement = ack-nhfb, keywords = "algorithm design and analysis; analysis of algorithms and problem complexity; combinatorial algorithms; comparative genomics; longest common subsequence", } @Article{Davila:2007:FPA, author = "Jaime Davila and Sudha Balla and Sanguthevar Rajasekaran", title = "Fast and Practical Algorithms for Planted (l, d) Motif Search", journal = j-TCBB, volume = "4", number = "4", pages = "544--552", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We consider the planted (l, d) motif search problem, which consists of finding a substring of length l that occurs in a set of input sequences {s1, . . . , sn} with up to d errors, a problem that arises from the need to find transcription factor-binding sites in genomic information. We propose a sequence of practical algorithms, which start based on the ideas considered in PMS1. These algorithms are exact, have little space requirements, and are able to tackle challenging instances with bigger d, taking less time in the instances reported solved by exact algorithms. In particular, one of the proposed algorithms, PMSprune, is able to solve the challenging instances, such as (17, 6) and (19, 7), which were not previously reported as solved in the literature.", acknowledgement = ack-nhfb, keywords = "branch and bound algorithms; challenging instances; exact algorithms; planted motif search problem", } @Article{Schneider:2007:SDM, author = "Adrian Schneider and Gaston Gonnet and Gina Cannarozzi", title = "{SynPAM---A} Distance Measure Based on Synonymous Codon Substitutions", journal = j-TCBB, volume = "4", number = "4", pages = "553--560", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Measuring evolutionary distances between DNA or protein sequences forms the basis of many applications in computational biology and evolutionary studies. Of particular interest are distances based on synonymous substitutions, since these substitutions are considered to be under very little selection pressure and therefore assumed to accumulate in an almost clock-like manner. SynPAM, the method presented here, allows the estimation of distances between coding DNA sequences based on synonymous codon substitutions. The problem of estimating an accurate distance from the observed substitution pattern is solved by maximum-likelihood with empirical codon substitution matrices employed for the underlying Markov model. Comparisons with established measures of synonymous distance indicate that SynPAM has less variance and yields useful results over a longer time range.", acknowledgement = ack-nhfb, keywords = "dS; evolutionary distance; molecular evolution; synonymous substitutions; SynPAM", } @Article{Sridhar:2007:AEN, author = "Srinath Sridhar and Kedar Dhamdhere and Guy Blelloch and Eran Halperin and R. Ravi and Russell Schwartz", title = "Algorithms for Efficient Near-Perfect Phylogenetic Tree Reconstruction in Theory and Practice", journal = j-TCBB, volume = "4", number = "4", pages = "561--571", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We consider the problem of reconstructing near-perfect phylogenetic trees using binary character states (referred to as BNPP). A perfect phylogeny assumes that every character mutates at most once in the evolutionary tree, yielding an algorithm for binary character states that is computationally efficient but not robust to imperfections in real data. A near-perfect phylogeny relaxes the perfect phylogeny assumption by allowing at most a constant number of additional mutations. We develop two algorithms for constructing optimal near-perfect phylogenies and provide empirical evidence of their performance. The first simple algorithm is fixed parameter tractable when the number of additional mutations and the number of characters that share four gametes with some other character are constants. The second, more involved algorithm for the problem is fixed parameter tractable when only the number of additional mutations is fixed. We have implemented both algorithms and shown them to be extremely efficient in practice on biologically significant data sets. This work proves the BNPP problem fixed parameter tractable and provides the first practical phylogenetic tree reconstruction algorithms that find guaranteed optimal solutions while being easily implemented and computationally feasible for data sets of biologically meaningful size and complexity.", acknowledgement = ack-nhfb, keywords = "biology and genetics; computations on discrete structures; trees", } @Article{Chen:2007:CBR, author = "Jinmiao Chen and Narendra Chaudhari", title = "Cascaded Bidirectional Recurrent Neural Networks for Protein Secondary Structure Prediction", journal = j-TCBB, volume = "4", number = "4", pages = "572--582", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Protein secondary structure (PSS) prediction is an important topic in bioinformatics. Our study on a large set of non-homologous proteins shows that long-range interactions commonly exist and negatively affect PSS prediction. Besides, we also reveal strong correlations between secondary structure (SS) elements. In order to take into account the long-range interactions and SS-SS correlations, we propose a novel prediction system based on cascaded bidirectional recurrent neural network (BRNN). We compare the cascaded BRNN against another two BRNN architectures, namely the original BRNN architecture used for speech recognition as well as Pollastri's BRNN that was proposed for PSS prediction. Our cascaded BRNN achieves an overall three state accuracy Q3 of 74.38\%, and reaches a high Segment OVerlap (SOV) of 66.0455. It outperforms the original BRNN and Pollastri's BRNN in both Q3 and SOV. Specifically, it improves the SOV score by 4-6\%.", acknowledgement = ack-nhfb, } @Article{Xiong:2007:DDK, author = "Huilin Xiong and Ya Zhang and Xue-Wen Chen", title = "Data-Dependent Kernel Machines for Microarray Data Classification", journal = j-TCBB, volume = "4", number = "4", pages = "583--595", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "One important application of gene expression analysis is to classify tissue samples according to their gene expression levels. Gene expression data are typically characterized by high dimensionality and small sample size, which makes the classification task quite challenging. In this paper, we present a data-dependent kernel for microarray data classification. This kernel function is engineered so that the class separability of the training data is maximized. A bootstrapping-based resampling scheme is introduced to reduce the possible training bias. The effectiveness of this adaptive kernel for microarray data classification is illustrated with a k-Nearest Neighbor (KNN) classifier. Our experimental study shows that the data-dependent kernel leads to a significant improvement in the accuracy of KNN classifiers. Furthermore, this kernel-based KNN scheme has been demonstrated to be competitive to, if not better than, more sophisticated classifiers such as Support Vector Machines (SVMs) and the Uncorrelated Linear Discriminant Analysis (ULDA) for classifying gene expression data.", acknowledgement = ack-nhfb, keywords = "bootstrapping resampling; cancer classification; kernel machines; kernel optimization; microarray data analysis", } @Article{Michal:2007:FCM, author = "Shahar Michal and Tor Ivry and Omer Cohen and Moshe Sipper and Danny Barash", title = "Finding a Common Motif of {RNA} Sequences Using Genetic Programming: The {GeRNAMo} System", journal = j-TCBB, volume = "4", number = "4", pages = "596--610", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We focus on finding a consensus motif of a set of homologous or functionally related RNA molecules. Recent approaches to this problem have been limited to simple motifs, require sequence alignment, and make prior assumptions concerning the data set. We use genetic programming to predict RNA consensus motifs based solely on the data set. Our system -- dubbed GeRNAMo (Genetic programming of RNA Motifs) -- predicts the most common motifs without sequence alignment and is capable of dealing with any motif size. Our program only requires the maximum number of stems in the motif, and if prior knowledge is available the user can specify other attributes of the motif (e.g., the range of the motif's minimum and maximum sizes), thereby increasing both sensitivity and speed. We describe several experiments using either ferritin iron response element (IRE); signal recognition particle (SRP); or microRNA sequences, showing that the most common motif is found repeatedly, and that our system offers substantial advantages over previous methods.", acknowledgement = ack-nhfb, } @Article{McIntosh:2007:HCR, author = "Tara McIntosh and Sanjay Chawla", title = "High Confidence Rule Mining for Microarray Analysis", journal = j-TCBB, volume = "4", number = "4", pages = "611--623", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We present an association rule mining method for mining high confidence rules, which describe interesting gene relationships from microarray datasets. Microarray datasets typically contain an order of magnitude more genes than experiments, rendering many data mining methods impractical as they are optimised for sparse datasets. A new family of row-enumeration rule mining algorithms have emerged to facilitate mining in dense datasets. These algorithms rely on pruning infrequent relationships to reduce the search space by using the support measure. This major shortcoming results in the pruning of many potentially interesting rules with low support but high confidence. We propose a new row-enumeration rule mining method, MaxConf, to mine high confidence rules from microarray data. MaxConf is a support-free algorithm which directly uses the confidence measure to effectively prune the search space. Experiments on three microarray datasets show that MaxConf outperforms support-based rule mining with respect to scalability and rule extraction. Furthermore, detailed biological analyses demonstrate the effectiveness of our approach -- the rules discovered by MaxConf are substantially more interesting and meaningful compared with support-based methods.", acknowledgement = ack-nhfb, keywords = "association rules; data mining; high confidence rule mining; microarray analysis", } @Article{Ponzoni:2007:IAR, author = "Ignacio Ponzoni and Francisco Azuaje and Juan Augusto and David Glass", title = "Inferring Adaptive Regulation Thresholds and Association Rules from Gene Expression Data through Combinatorial Optimization Learning", journal = j-TCBB, volume = "4", number = "4", pages = "624--634", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "There is a need to design computational methods to support the prediction of gene regulatory networks. Such models should offer both biologically-meaningful and computationally-accurate predictions, which in combination with other techniques may improve large-scale, integrative studies. This paper presents a new machine learning method for the prediction of putative regulatory associations from expression data, which exhibit properties never or only partially addressed by other techniques recently published. The method was tested on a Saccharomyces cerevisiae gene expression dataset. The results were statistically validated and compared with the relationships inferred by two machine learning approaches to gene regulatory network prediction. Furthermore, the resulting predictions were assessed using domain knowledge. The proposed algorithm may be able to accurately predict relevant biological associations between genes. One of the most relevant features of this new method is the prediction of adaptive regulation thresholds for the discretization of gene expression values, which is required prior to the rule association learning process. Moreover, an important advantage consists of its low computational cost to infer association rules. The proposed system may significantly support exploratory, large-scale studies of automated identification of potentially-relevant gene expression associations.", acknowledgement = ack-nhfb, keywords = "combinatorial optimization; decision trees; gene expression data; genetic regulatory networks; machine-learning", } @Article{Noman:2007:IGR, author = "Nasimul Noman and Hitoshi Iba", title = "Inferring Gene Regulatory Networks using Differential Evolution with Local Search Heuristics", journal = j-TCBB, volume = "4", number = "4", pages = "634--647", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We present a memetic algorithm for evolving the structure of biomolecular interactions and inferring the effective kinetic parameters from the time series data of gene expression using the decoupled system formalism. We propose an Information Criteria based fitness evaluation for gene network model selection instead of the conventional Mean Squared Error (MSE) based fitness evaluation. A hill-climbing local-search method has been incorporated in our evolutionary algorithm for efficiently attaining the skeletal architecture which is most frequently observed in biological networks. The suitability of the method is tested in gene circuit reconstruction experiments, varying the network dimension and/or characteristics, the amount of gene expression data used for inference and the noise level present in expression profiles. The reconstruction method inferred the network topology and the regulatory parameters with high accuracy. Nevertheless, the performance is limited to the amount of expression data used and the noise level present in the data. The proposed fitness function has been found more suitable for identifying correct network topology and for estimating the accurate parameter values compared to the existing ones. Finally, we applied the methodology for analyzing the cell-cycle gene expression data of budding yeast and reconstructed the network of some key regulators.", acknowledgement = ack-nhfb, keywords = "biology and genetics; gene regulatory system; global optimization; inverse problems; medicine and science; memetic algorithm; microarray data; transcriptional regulation", } @Article{Ho:2007:ITS, author = "Shinn-Ying Ho and Chih-Hung Hsieh and Fu-Chieh Yu and Hui-Ling Huang", title = "An Intelligent Two-Stage Evolutionary Algorithm for Dynamic Pathway Identification From Gene Expression Profiles", journal = j-TCBB, volume = "4", number = "4", pages = "648--704", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "From gene expression profiles, it is desirable to rebuild cellular dynamic regulation networks to discover more delicate and substantial functions in molecular biology, biochemistry, bioengineering and pharmaceutics. S-system model is suitable to characterize biochemical network systems and capable to analyze the regulatory system dynamics. However, inference of an S-system model of N-gene genetic networks has 2N(N+1) parameters in a set of non-linear differential equations to be optimized. This paper proposes an intelligent two-stage evolutionary algorithm (iTEA) to efficiently infer the S-system models of genetic networks from time-series data of gene expression. To cope with curse of dimensionality, the proposed algorithm consists of two stages where each uses a divide-and-conquer strategy. The optimization problem is first decomposed into N subproblems having 2(N+1) parameters each. At the first stage, each subproblem is solved using a novel intelligent genetic algorithm (IGA) with intelligent crossover based on orthogonal experimental design (OED). At the second stage, the obtained N solutions to the N subproblems are combined and refined using an OED-based simulated annealing algorithm for handling noisy gene expression profiles. The effectiveness of iTEA is evaluated using simulated expression patterns with and without noise running on a single-processor PC. It is shown that 1) IGA is efficient enough to solve subproblems; 2) IGA is significantly superior to the existing method SPXGA; and 3) iTEA performs well in inferring S-system models for dynamic pathway identification.", acknowledgement = ack-nhfb, keywords = "divide-and-conquer; evolutionary algorithm; genetic network; orthogonal experimental design; pathway identification; S-system model", } @Article{Bereg:2007:PNB, author = "Sergey Bereg and Yuanyi Zhang", title = "Phylogenetic Networks Based on the Molecular Clock Hypothesis", journal = j-TCBB, volume = "4", number = "4", pages = "661--667", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A classical result in phylogenetic trees is that a binary phylogenetic tree adhering to the molecular clock hypothesis exists if and only if the matrix of distances between taxa is ultrametric. The ultrametric condition is very restrictive. In this paper we study phylogenetic networks that can be constructed assuming the molecular clock hypothesis. We characterize distance matrices that admit such networks for 3 and 4 taxa. We also design two algorithms for constructing networks optimizing the least-squares fit.", acknowledgement = ack-nhfb, keywords = "least-squares fit; molecular clock hypothesis; Phylogenetic Networks", } @Article{Blazewicz:2007:SPD, author = "Jacek Blazewicz and Edmund Burke and Marta Kasprzak and Alexandr Kovalev and Mikhail Kovalyov", title = "Simplified Partial Digest Problem: Enumerative and Dynamic Programming Algorithms", journal = j-TCBB, volume = "4", number = "4", pages = "668--680", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We study the Simplified Partial Digest Problem (SPDP), which is a mathematical model for a new simplified partial digest method of genome mapping. This method is easy for laboratory implementation and robust with respect to the experimental errors. SPDP is NP-hard in the strong sense. We present an $O(n2^n)$ time enumerative algorithm and an $O(n^{2q})$ time dynamic programming algorithm for the error-free SPDP, where $n$ is the number of restriction sites and $q$ is the number of distinct intersite distances. We also give examples of the problem, in which there are $2^{\frac{n+2}{3}-1}$ non-congruent solutions. These examples partially answer a question recently posed in the literature about the number of solutions of SPDP. We adapt our enumerative algorithm for handling SPDP with imprecise input data. Finally, we describe and discuss the results of the computer experiments with our algorithms.", acknowledgement = ack-nhfb, keywords = "algorithm design and analysis; dynamic programming; genome mapping; imprecise information; restriction site analysis", } @Article{Xu:2007:IGR, author = "Rui Xu and Donald {Wunsch II} and Ronald Frank", title = "Inference of Genetic Regulatory Networks with Recurrent Neural Network Models Using Particle Swarm Optimization", journal = j-TCBB, volume = "4", number = "4", pages = "681--692", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Genetic regulatory network inference is critically important for revealing fundamental cellular processes, investigating gene functions, and understanding their relations. The availability of time series gene expression data makes it possible to investigate the gene activities of whole genomes, rather than those of only a pair of genes or among several genes. However, current computational methods do not sufficiently consider the temporal behavior of this type of data and lack the capability to capture the complex nonlinear system dynamics. We propose a recurrent neural network (RNN) and particle swarm optimization (PSO) approach to infer genetic regulatory networks from time series gene expression data. Under this framework, gene interaction is explained through a connection weight matrix. Based on the fact that the measured time points are limited and the assumption that the genetic networks are usually sparsely connected, we present a PSO-based search algorithm to unveil potential genetic network constructions that fit well with the time series data and explore possible gene interactions. Furthermore, PSO is used to train the RNN and determine the network parameters. Our approach has been applied to both synthetic and real data sets. The results demonstrate that the RNN/PSO can provide meaningful insights in understanding the nonlinear dynamics of the gene expression time series and revealing potential regulatory interactions between genes.", acknowledgement = ack-nhfb, keywords = "genetic regulatory networks; particle swarm optimization; recurrent neural networks; time series gene expression data", } @Article{Agius:2007:TSA, author = "Phaedra Agius and Barry Kreiswirth and Steve Naidich and Kristin Bennett", title = "Typing Staphylococcus aureus Using the spa Gene and Novel Distance Measures", journal = j-TCBB, volume = "4", number = "4", pages = "693--704", month = oct, year = "2007", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:58:47 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We developed an approach for identifying groups or families of Staphylococcus aureus bacteria based on genotype data. With the emergence of drug resistant strains, S. aureus represents a significant human health threat. Identifying the family types efficiently and quickly is crucial in community settings. Here, we develop a hybrid sequence algorithm approach to type this bacterium using only its spa gene. Two of the sequence algorithms we used are well established, while the third, the Best Common Gap-Weighted Sequence (BCGS), is novel. We combined the sequence algorithms with a weighted match/mismatch algorithm for the spa sequence ends. Normalized similarity scores and distances between the sequences were derived and used within unsupervised clustering methods. The resulting spa groupings correlated strongly with the groups defined by the well-established Multi locus sequence typing (MLST) method. Spa typing is preferable to MLST typing which types seven genes instead of just one. Furthermore, our spa clustering methods can be fine-tuned to be more discriminative than MLST, identifying new strains that the MLST method may not. Finally, we performed a multidimensional scaling of our distance matrices to visualize the relationship between isolates. The proposed methodology provides a promising new approach to molecular epidemiology.", acknowledgement = ack-nhfb, keywords = "clustering; genotyping; molecular epidemiology; sequence algorithms; staphylococcus aureus", } @Article{Congdon:2008:EIC, author = "Clare Bates Congdon and Joseph C. Aman and Gerardo M. Nava and H. Rex Gaskins and Carolyn J. Mattingly", title = "An Evaluation of Information Content as a Metric for the Inference of Putative Conserved Noncoding Regions in {DNA} Sequences Using a Genetic Algorithms Approach", journal = j-TCBB, volume = "5", number = "1", pages = "1--14", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In previous work, we presented GAMI [1], an approach to motif inference that uses a genetic algorithms search. GAMI is designed specifically to find putative conserved regulatory motifs in noncoding regions of divergent species, and is designed to allow for analysis of long nucleotide sequences. In this work, we compare GAMI's performance when run with its original fitness function (a simple count of the number of matches) and when run with information content, as well as several variations on these metrics. Results indicate that information content does not identify highly conserved regions, and thus is not the appropriate metric for this task, while variations on information content as well as the original metric succeed in identifying putative conserved regions.", acknowledgement = ack-nhfb, keywords = "biology and genetics; evolutionary computing and genetic algorithms", } @Article{Boscolo:2008:ITE, author = "Riccardo Boscolo and James C. Liao and Vwani P. Roychowdhury", title = "An Information Theoretic Exploratory Method for Learning Patterns of Conditional Gene Coexpression from Microarray Data", journal = j-TCBB, volume = "5", number = "1", pages = "15--24", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In this article, we introduce an exploratory framework for learning patterns of conditional co-expression in gene expression data. The main idea behind the proposed approach consists of estimating how the information content shared by a set of M nodes in a network (where each node is associated to an expression profile) varies upon conditioning on a set of L conditioning variables (in the simplest case represented by a separate set of expression profiles). The method is non-parametric and it is based on the concept of statistical co-information, which, unlike conventional correlation based techniques, is not restricted in scope to linear conditional dependency patterns. Moreover, such conditional co-expression relationships can potentially indicate regulatory interactions that do not manifest themselves when only pair-wise relationships are considered. A moment based approximation of the co-information measure is derived that efficiently gets around the problem of estimating high-dimensional multi-variate probability density functions from the data, a task usually not viable due to the intrinsic sample size limitations that characterize expression level measurements. By applying the proposed exploratory method, we analyzed a whole genome microarray assay of the eukaryote Saccharomices cerevisiae and were able to learn statistically significant patterns of conditional co-expression. A selection of such interactions that carry a meaningful biological interpretation are discussed.", acknowledgement = ack-nhfb, keywords = "co-information; entropy; gene expression data; information theory; statistical analysis", } @Article{Wiese:2008:REA, author = "Kay C. Wiese and Alain A. Deschenes and Andrew G. Hendriks", title = "{RnaPredict---An} Evolutionary Algorithm for {RNA} Secondary Structure Prediction", journal = j-TCBB, volume = "5", number = "1", pages = "25--41", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "This paper presents two in-depth studies on RnaPredict, an evolutionary algorithm for RNA secondary structure prediction. The first study is an analysis of the performance of two thermodynamic models, INN and INN-HB. The correlation between the free energy of predicted structures and the sensitivity is analyzed for 19 RNA sequences. Although some variance is shown, there is a clear trend between a lower free energy and an increase in true positive base pairs. With increasing sequence length, this correlation generally decreases. In the second experiment, the accuracy of the predicted structures for these 19 sequences are compared against the accuracy of the structures generated by the mfold dynamic programming algorithm (DPA) and also to known structures. RnaPredict is shown to outperform the minimum free energy structures produced by mfold and has comparable performance when compared to sub-optimal structures produced by mfold.", acknowledgement = ack-nhfb, keywords = "evolutionary computation; RnaPredict; RNA secondary structure prediction", } @Article{Rother:2008:SCP, author = "Diego Rother and Guillermo Sapiro and Vijay Pande", title = "Statistical Characterization of Protein Ensembles", journal = j-TCBB, volume = "5", number = "1", pages = "42--55", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "When accounting for structural fluctuations or measurement errors, a single rigid structure may not be sufficient to represent a protein. One approach to solve this problem is to represent the possible conformations as a discrete set of observed conformations, an ensemble. In this work, we follow a different richer approach, and introduce a framework for estimating probability density functions in very high dimensions, and then apply it to represent ensembles of folded proteins. This proposed approach combines techniques such as kernel density estimation, maximum likelihood, cross-validation, and bootstrapping. We present the underlying theoretical and computational framework and apply it to artificial data and protein ensembles obtained from molecular dynamics simulations. We compare the results with those obtained experimentally, illustrating the potential and advantages of this representation.", acknowledgement = ack-nhfb, keywords = "Bayesian networks; bootstrapping; cross-validation; density estimation; graphical models; maximum likelihood; protein ensembles", } @Article{Cui:2008:EAA, author = "Yun Cui and Lusheng Wang and Daming Zhu and Xiaowen Liu", title = "A $(1.5 + {\epsilon})$-Approximation Algorithm for Unsigned Translocation Distance", journal = j-TCBB, volume = "5", number = "1", pages = "56--66", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Genome rearrangement is an important area in computational biology and bioinformatics. The translocation operation is one of the popular operations for genome rearrangement. It was proved that computing the unsigned translocation distance is NP-hard. In this paper, we present a $(1.5 + \epsilon)$-approximation algorithm for computing unsigned translocation distance which improves upon the best known 1.75-ratio. The running time of our algorithm is $O(n^2 + (4/\epsilon)^1.5 \surd \log(4/\epsilon)2 4^\epsilon)$, where $n$ is the total number of genes in the genome.", acknowledgement = ack-nhfb, keywords = "and approximation algorithms; genome rearrangement; unsigned translocation", } @Article{Tan:2008:NBP, author = "Tuan Zea Tan and Geok See Ng and Chai Quek", title = "A Novel Biologically and Psychologically Inspired Fuzzy Decision Support System: Hierarchical Complementary Learning", journal = j-TCBB, volume = "5", number = "1", pages = "67--79", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A computational intelligent system that models the human cognitive abilities may promise significant performance in problem learning because human is effective in learning and problem solving. Functionally modeling the human cognitive abilities not only avoids the details of the underlying neural mechanisms performing the tasks, but also reduces the complexity of the system. The complementary learning mechanism is responsible for human pattern recognition, i.e. human attends to positive and negative samples when making decision. Furthermore, human concept learning is organized in a hierarchical fashion. Such hierarchical organization allows the divide-and-conquer approach to the problem. Thus, integrating the functional models of hierarchical organization and complementary learning can potentially improve the performance in pattern recognition. Hierarchical complementary learning exhibits many of the desirable features of pattern recognition. It is further supported by the experimental results that verify the rationale of the integration and that the hierarchical complementary learning system is a promising pattern recognition tool.", acknowledgement = ack-nhfb, keywords = "cognitive learning; complementary learning; decision support; fuzzy neural network; hierarchical model", } @Article{Ciocchetta:2008:ATS, author = "Federica Ciocchetta and Corrado Priami and Paola Quaglia", title = "An Automatic Translation of {SBML} into Beta-Binders", journal = j-TCBB, volume = "5", number = "1", pages = "80--90", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A translation of SBML (Systems Biology Markup Language) into a process algebra is proposed in order to allow the formal specification, the simulation and the formal analysis of biological models. Beta-binders, a language with a quantitative stochastic extension, is chosen for the translation. The proposed translation focuses on the main components of SBML models, as species and reactions. Furthermore, it satisfies the compositional property, i.e. the translation of the whole model is obtained by composing the translation of the subcomponents. An automatic translator tool of SBML models into Beta-binders has been implemented as well. Finally, the translation of a simple model is reported.", acknowledgement = ack-nhfb, keywords = "biological systems; modeling; Process algebras; systems biology; Systems Biology Markup Language (SBML); translation tool", } @Article{Bocker:2008:CAM, author = "Sebastian Bocker and Veli Makinen", title = "Combinatorial Approaches for Mass Spectra Recalibration", journal = j-TCBB, volume = "5", number = "1", pages = "91--100", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Mass spectrometry has become one of the most popular analysis techniques in Proteomics and Systems Biology. With the creation of larger datasets, the automated recalibration of mass spectra becomes important to ensure that every peak in the sample spectrum is correctly assigned to some peptide and protein. Algorithms for recalibrating mass spectra have to be robust with respect to wrongly assigned peaks, as well as efficient due to the amount of mass spectrometry data. The recalibration of mass spectra leads us to the problem of finding an optimal matching between mass spectra under measurement errors. We have developed two deterministic methods that allow robust computation of such a matching: The first approach uses a computational geometry interpretation of the problem, and tries to find two parallel lines with constant distance that stab a maximal number of points in the plane. The second approach is based on finding a maximal common approximate subsequence, and improves existing algorithms by one order of magnitude exploiting the sequential nature of the matching problem. We compare our results to a computational geometry algorithm using a topological line-sweep.", acknowledgement = ack-nhfb, keywords = "biotechnology; combinatorial pattern matching; computational geometry; mass spectrometry", } @Article{Barzuza:2008:CPP, author = "Tamar Barzuza and Jacques S. Beckmann and Ron Shamir and Itsik Pe'er", title = "Computational Problems in Perfect Phylogeny Haplotyping: Typing without Calling the Allele", journal = j-TCBB, volume = "5", number = "1", pages = "101--109", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A haplotype is an m-long binary vector. The xor-genotype of two haplotypes is the m-vector of their coordinate-wise xor. We study the following problem: Given a set of xor-genotypes, reconstruct their haplotypes so that the set of resulting haplotypes can be mapped onto a perfect phylogeny tree. The question is motivated by studying population evolution in human genetics, and is a variant of the perfect phylogeny haplotyping problem that has received intensive attention recently. Unlike the latter problem, in which the input is `full' genotypes, here we assume less informative input, and so may be more economical to obtain experimentally. Building on ideas of Gusfield, we show how to solve the problem in polynomial time, by a reduction to the graph realization problem. The actual haplotypes are not uniquely determined by that tree they map onto, and the tree itself may or may not be unique. We show that tree uniqueness implies uniquely determined haplotypes, up to inherent degrees of freedom, and give a sufficient condition for the uniqueness. To actually determine the haplotypes given the tree, additional information is necessary. We show that two or three full genotypes suffice to reconstruct all the haplotypes, and present a linear algorithm for identifying those genotypes.", acknowledgement = ack-nhfb, keywords = "graph realization; haplotypes; perfect phylogeny; XOR-genotypes", } @Article{Chin:2008:DMR, author = "Francis Chin and Henry C. M. Leung", title = "{DNA} Motif Representation with Nucleotide Dependency", journal = j-TCBB, volume = "5", number = "1", pages = "110--119", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The problem of discovering novel motifs of binding sites is important to the understanding of gene regulatory networks. Motifs are generally represented by matrices (PWM or PSSM) or strings. However, these representations cannot model biological binding sites well because they fail to capture nucleotide interdependence. It has been pointed out by many researchers that the nucleotides of the DNA binding site cannot be treated independently, e.g. the binding sites of zinc finger in proteins. In this paper, a new representation called Scored PositionSpecific Pattern (SPSP), which is a generalization of the matrix and string representations, is introduced which takes into consideration the dependent occurrences of neighboring nucleotides. Even though the problem of discovering the optimal motif in SPSP representation is proved to beNP-hard, we introduce a heuristic algorithm called SPSP-Finder, which can effectively find optimal motifs in most simulated cases and some real cases for which existing popular motif finding software, such as Weeder, MEME and AlignACE, fail.", acknowledgement = ack-nhfb, keywords = "Computing methodologies; design methodology; pattern analysis; pattern recognition", } @Article{Yin:2008:NAC, author = "Zong-Xian Yin and Jung-Hsien Chiang", title = "Novel Algorithm for Coexpression Detection in Time-Varying Microarray Data Sets", journal = j-TCBB, volume = "5", number = "1", pages = "120--135", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "When analyzing the results of microarray experiments, biologists generally use unsupervised categorization tools. However, such tools regard each time point as an independent dimension and utilize the Euclidean distance to compute the similarities between expressions. Furthermore, some of these methods require the number of clusters to be determined in advance, which is clearly impossible in the case of a new dataset. Therefore, this study proposes a novel scheme, designated as the Variation-based Co-expression Detection (VCD) algorithm, to analyze the trends of expressions based on their variation over time. The proposed algorithm has two advantages. First, it is unnecessary to determine the number of clusters in advance since the algorithm automatically detects those genes whose profiles are grouped together and creates patterns for these groups. Second, the algorithm features a new measurement criterion for calculating the degree of change of the expressions between adjacent time points and evaluating their trend similarities. Three real-world microarray datasets are employed to evaluate the performance of the proposed algorithm.", acknowledgement = ack-nhfb, keywords = "bioinformatics; clustering; data mining; gene expression; pattern analysis; time series analysis", } @Article{Goeffon:2008:PTN, author = "Adrien Goeffon and Jean-Michel Richer and Jin-Kao Hao", title = "Progressive Tree Neighborhood Applied to the Maximum Parsimony Problem", journal = j-TCBB, volume = "5", number = "1", pages = "136--145", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The Maximum Parsimony problem aims at reconstructing a phylogenetic tree from DNA sequences while minimizing the number of genetic transformations. To solve this NP-complete problem, heuristic methods have been developed, often based on local search. In this article, we focus on the influence of the neighborhood relations. After analyzing the advantages and drawbacks of the well-known NNI, SPR and TBR neighborhoods, we introduce the concept of Progressive Neighborhood which consists in constraining progressively the size of the neighborhood as the search advances. We empirically show that applied to the Maximum Parsimony problem, this progressive neighborhood turns out to be more efficient and robust than the classic neighborhoods using a descent algorithm. Indeed, it allows to find better solutions with a smaller number of iterations or trees evaluated.", acknowledgement = ack-nhfb, keywords = "combinatorial algorithms; maximum parsimony; optimization; phylogeny reconstruction", } @Article{Anonymous:2008:RL, author = "Anonymous", title = "2007 Reviewers List", journal = j-TCBB, volume = "5", number = "1", pages = "146--147", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2008:AI, author = "Anonymous", title = "2007 Annual Index", journal = j-TCBB, volume = "5", number = "1", pages = "148--158", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2008:CAE, author = "Anonymous", title = "Call for Applications for {Editor-in-Chief}", journal = j-TCBB, volume = "5", number = "1", pages = "159--159", month = jan, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:11 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Jackson:2008:CGM, author = "Benjamin N. Jackson and Patrick S. Schnable and Srinivas Aluru", title = "Consensus Genetic Maps as Median Orders from Inconsistent Sources", journal = j-TCBB, volume = "5", number = "2", pages = "161--171", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A genetic map is an ordering of genetic markers calculated from a population of known lineage. While traditionally a map has been generated from a single population for each species, recently researchers have created maps from multiple populations. In the face of these new data, we address the need to find a consensus map --- a map that combines the information from multiple partial and possibly inconsistent input maps. We model each input map as a partial order and formulate the consensus problem as finding a median partial order. Finding the median of multiple total orders (preferences or rankings)is a well studied problem in social choice. We choose to find the median using the weighted symmetric difference distance, a more general version of both the symmetric difference distance and the Kemeny distance. Finding a median order using this distance is NP-hard. We show that for our chosen weight assignment, a median order satisfies the positive responsiveness, extended Condorcet,and unanimity criteria. Our solution involves finding the maximum acyclic subgraph of a weighted directed graph. We present a method that dynamically switches between an exact branch and bound algorithm and a heuristic algorithm, and show that for real data from closely related organisms, an exact median can often be found. We present experimental results using seven populations of the crop plant Zea mays.", acknowledgement = ack-nhfb, keywords = "genetic map; Kemeny distance; median order; path and circuit problems; symmetric difference distance.", } @Article{Gupta:2008:EDS, author = "Anupam Gupta and Ziv Bar-Joseph", title = "Extracting Dynamics from Static Cancer Expression Data", journal = j-TCBB, volume = "5", number = "2", pages = "172--182", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Static expression experiments analyze samples from many individuals. These samples are often snapshots of the progression of a certain disease such as cancer. This raises an intriguing question: Can we determine a temporal order for these samples? Such an ordering can lead to better understanding of the dynamics of the disease and to the identification of genes associated with its progression. In this paper we formally prove, for the first time, that under a model for the dynamics of the expression levels of a single gene, it is indeed possible to recover the correct ordering of the static expression datasets by solving an instance of the traveling salesman problem (TSP). In addition, we devise an algorithm that combines a TSP heuristic and probabilistic modeling for inferring the underlying temporal order of the microarray experiments. This algorithm constructs probabilistic continuous curves to represent expression profiles leading to accurate temporal reconstruction for human data. Applying our method to cancer expression data we show that the ordering derived agrees well with survival duration. A classifier that utilizes this ordering improves upon other classifiers suggested for this task. The set of genes displaying consistent behavior for the determined ordering are enriched for genes associated with cancer progression.", acknowledgement = ack-nhfb, keywords = "EM; glioma; microarrays; traveling salesman", } @Article{Thomas:2008:GMR, author = "John Thomas and Naren Ramakrishnan and Chris Bailey-Kellogg", title = "Graphical Models of Residue Coupling in Protein Families", journal = j-TCBB, volume = "5", number = "2", pages = "183--197", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Many statistical measures and algorithmic techniques have been proposed for studying residue coupling in protein families. Generally speaking, two residue positions are considered coupled if, in the sequence record, some of their amino acid type combinations are significantly more common than others. While the proposed approaches have proven useful in finding and describing coupling, a significant missing component is a formal probabilistic model that explicates and compactly represents the coupling, integrates information about sequence,structure, and function, and supports inferential procedures for analysis, diagnosis, and prediction. We present an approach to learning and using probabilistic graphical models of residue coupling. These models capture significant conservation and coupling constraints observable in a multiply-aligned set of sequences. Our approach can place a structural prior on considered couplings, so that all identified relationships have direct mechanistic explanations. It can also incorporate information about functional classes, and thereby learn a differential graphical model that distinguishes constraints common to all classes from those unique to individual classes. Such differential models separately account for class-specific conservation and family-wide coupling, two different sources of sequence covariation. They are then able to perform interpretable functional classification of new sequences, explaining classification decisions in terms of the underlying conservation and coupling constraints. We apply our approach in studies of both G protein-coupled receptors and PDZ domains, identifying and analyzing family-wide and class-specific constraints, and performing functional classification. The results demonstrate that graphical models of residue coupling provide a powerful tool for uncovering, representing, and utilizing significant sequence structure-function relationships in protein families.", acknowledgement = ack-nhfb, keywords = "correlated mutations; evolutionary covariation; functional classification; graphical models; sequence-structure-function relationships", } @Article{Mena-Chalco:2008:IPC, author = "Jesus Mena-Chalco and Helaine Carrer and Yossi Zana and Roberto M. {Cesar Jr.}", title = "Identification of Protein Coding Regions Using the Modified Gabor-Wavelet Transform", journal = j-TCBB, volume = "5", number = "2", pages = "198--207", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "An important topic in genomic sequence analysis is the identification of protein coding regions. In this context, several coding DNA model-independent methods, based on the occurrence of specific patterns of nucleotides at coding regions, have been proposed. Nonetheless, these methods have not been completely suitable due to their dependence on an empirically pre-defined window length required for a local analysis of a DNA region. We introduce a method, based on a modified Gabor-wavelet transform (MGWT), for the identification of protein coding regions. This novel transform is tuned to analyze periodic signal components and presents the advantage of being independent of the window length. We compared the performance of the MGWT with other methods using eukaryote datasets. The results show that the MGWT outperforms all assessed model-independent methods with respect to identification accuracy. These results indicate that the source of at least part of the identification errors produced by the previous methods is the fixed working scale. The new method not only avoids this source of errors, but also makes available a tool for detailed exploration of the nucleotide occurrence.", acknowledgement = ack-nhfb, keywords = "biology and genetics; pattern recognition; signal processing", } @Article{deJong:2008:SSS, author = "Hidde de Jong and Michel Page", title = "Search for Steady States of Piecewise-Linear Differential Equation Models of Genetic Regulatory Networks", journal = j-TCBB, volume = "5", number = "2", pages = "208--222", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Analysis of the attractors of a genetic regulatory network gives a good indication of the possible functional modes of the system. In this paper we are concerned with the problem of finding all steady states of genetic regulatory networks described by piecewise-linear differential equation (PLDE) models. We show that the problem is NP-hard and translate it into a propositional satisfiability (SAT) problem. This allows the use of existing, efficient SAT solvers and has enabled the development of a steady state search module of the computer tool Genetic Network Analyzer (GNA). The practical use of this module is demonstrated by means of the analysis of a number of relatively small bacterial regulatory networks as well as randomly generated networks of several hundreds of genes.", acknowledgement = ack-nhfb, keywords = "genetic regulatory networks; large-scale systems; piecewise-linear differential equations; SAT problem; steady states", } @Article{Sadot:2008:TVB, author = "Avital Sadot and Jasmin Fisher and Dan Barak and Yishai Admanit and Michael J. Stern and E. Jane Albert Hubbard and David Harel", title = "Toward Verified Biological Models", journal = j-TCBB, volume = "5", number = "2", pages = "223--234", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The last several decades have witnessed a vast accumulation of biological data and data analysis. Many of these data sets represent only a small fraction of the system's behavior, making the visualization of full system behavior difficult. A more complete understanding of a biological system is gained when different types of data (and/or conclusions drawn from the data) are integrated into a larger-scale representation or model of the system. Ideally, this type of model is consistent with all available data about the system, and it is then used to generate additional hypotheses to be tested. Computer-based methods intended to formulate models that integrate various events and to test the consistency of these models with respect to the laboratory-based observations on which they are based are potentially very useful. In addition, in contrast to informal models, the consistency of such formal computer-based models with laboratory data can be tested rigorously by methods of formal verification. We combined two formal modeling approaches in computer science that were originally developed for non-biological system design. One is the inter-object approach using the language of live sequence charts (LSCs) with the Play-Engine tool, and the other is the intra-object approach using the language of statecharts and Rhapsody as the tool. Integration is carried out using InterPlay, a simulation engine coordinator. Using these tools, we constructed a combined model comprising three modules. One module represents the early lineage of the somatic gonad of C. elegans in LSCs, while a second more detailed module in statecharts represents an interaction between two cells within this lineage that determine their developmental outcome. Using the advantages of the tools, we created a third module representing a set of key experimental data using LSCs. We tested the combined statechart-LSC model by showing that the simulations were consistent with the set of experimental LSCs. This small-scale modular example demonstrates the potential for using similar approaches for verification by exhaustive testing of models by LSCs. It also shows the advantages of these approaches for modeling biology.", acknowledgement = ack-nhfb, keywords = "C. elegans; modeling; statecharts; verification", } @Article{Spillner:2008:CPD, author = "Andreas Spillner and Binh T. Nguyen and Vincent Moulton", title = "Computing Phylogenetic Diversity for Split Systems", journal = j-TCBB, volume = "5", number = "2", pages = "235--244", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In conservation biology it is a central problem to measure, predict, and preserve biodiversity as species face extinction. In 1992 Faith proposed measuring the diversity of a collection of species in terms of their relationships on a phylogenetic tree, and to use this information to identify collections of species with high diversity. Here we are interested in some variants of the resulting optimization problem that arise when considering species whose evolution is better represented by a network rather than a tree. More specifically, we consider the problem of computing phylogenetic diversity relative to a split system on a collection of species of size $n$. We show that for general split systems this problem is NP-hard. In addition we provide some efficient algorithms for some special classes of split systems, in particular presenting an optimal $O(n)$ time algorithm for phylogenetic trees and an $O(n\log n + n k)$ time algorithm for choosing an optimal subset of size $k$ relative to a circular split system.", acknowledgement = ack-nhfb, keywords = "biology and genetics; life and medical sciences", } @Article{Lancia:2008:HDA, author = "Giuseppe Lancia and R. Ravi and Romeo Rizzi", title = "Haplotyping for Disease Association: {A} Combinatorial Approach", journal = j-TCBB, volume = "5", number = "2", pages = "245--251", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We consider a combinatorial problem derived from haplotyping a population with respect to a genetic disease, either recessive or dominant. Given a set of individuals, partitioned into healthy and diseased, and the corresponding sets of genotypes, we want to infer ``bad'' and ``good'' haplotypes to account for these genotypes and for the disease. Assume e.g. the disease is recessive. Then, the resolving haplotypes must consist of \emph{bad} and \emph{good} haplotypes, so that (i) each genotype belonging to a diseased individual is explained by a pair of bad haplotypes and (ii) each genotype belonging to a healthy individual is explained by a pair of haplotypes of which at least one is good. We prove that the associated decision problem is NP-complete. However, we also prove that there is a simple solution, provided the data satisfy a very weak requirement.", acknowledgement = ack-nhfb, keywords = "biology and genetics; combinatorics; discrete mathematics", } @Article{Gusev:2008:HSG, author = "Alexander Gusev and Ion I. M{\~a}ndoiu and Bogdan Pa{\c{s}}aniuc", title = "Highly Scalable Genotype Phasing by Entropy Minimization", journal = j-TCBB, volume = "5", number = "2", pages = "252--261", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A Single Nucleotide Polymorphism (SNP) is a position in the genome at which two or more of the possible four nucleotides occur in a large percentage of the population. SNPsaccount for most of the genetic variability between individuals,and mapping SNPs in the human population has become the next high-priority in genomics after the completion of the HumanGenome project. In diploid organisms such as humans, there are two non-identical copies of each autosomal chromosome. A description of the SNPs in a chromosome is called a haplotype. At present, it is prohibitively expensive to directly determine the haplotypes of an individual, but it is possible to obtain rather easily the conflated SNP information in the so called genotype. Computational methods for genotype phasing, i.e., inferring haplotypes from genotype data, have received much attention in recent years as haplotype information leads to increased statistical power of disease association tests. However, many of the existing algorithms have impractical running time for phasing large genotype datasets such as those generated by the international HapMap project. In this paper we propose a highly scalable algorithm based on entropy minimization. Our algorithm is capable of phasing both unrelated and related genotypes coming from complex pedigrees. Experimental results on both real and simulated datasets show that our algorithm achieves a phasing accuracy worse but close to that of best existing methods while being several orders of magnitude faster. The open source code implementation of the algorithm and a web interface are publicly available at http://dna.engr.uconn.edu/~software/ent/.", acknowledgement = ack-nhfb, keywords = "algorithm; genotype phasing; haplotype; Single Nucleotide Polymorphism", } @Article{Zhao:2008:ICG, author = "Wentao Zhao and Erchin Serpedin and Edward R. Dougherty", title = "Inferring Connectivity of Genetic Regulatory Networks Using Information-Theoretic Criteria", journal = j-TCBB, volume = "5", number = "2", pages = "262--274", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Recently, the concept of mutual information has been proposed for infering the structure of genetic regulatory networks from gene expression profiling. After analyzing the limitations of mutual information in inferring the gene-to-gene interactions, this paper introduces the concept of conditional mutual information and based on it proposes two novel algorithms to infer the connectivity structure of genetic regulatory networks. One of the proposed algorithms exhibits a better accuracy while the other algorithm excels in simplicity and flexibility. By exploiting the mutual information and conditional mutual information, a practical metric is also proposed to assess the likeliness of direct connectivity between genes. This novel metric resolves a common limitation associated with the current inference algorithms, namely the situations where the gene connectivity is established in terms of the dichotomy of being either connected or disconnected. Based on the data sets generated by synthetic networks, the performance of the proposed algorithms is compared favorably relative to existing state-of-the-art schemes. The proposed algorithms are also applied on realistic biological measurements, such as the cutaneous melanoma data set, and biological meaningful results are inferred.", acknowledgement = ack-nhfb, keywords = "biology and genetics; DNA microarray; genetic regulatory network; information theory", } @Article{Bordewich:2008:NRS, author = "Magnus Bordewich and Charles Semple", title = "Nature Reserve Selection Problem: {A} Tight Approximation Algorithm", journal = j-TCBB, volume = "5", number = "2", pages = "275--280", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The Nature Reserve Selection Problem is a problem that arises in the context of studying biodiversity conservation. Subject to budgetary constraints, the problem is to select a set of regions to conserve so that the phylogenetic diversity of the set of species contained within those regions is maximized. Recently, it was shown in a paper by Moulton {\\em et al.} that this problem is NP-hard. In this paper, we establish a tight polynomial-time approximation algorithm for the Nature Reserve Section Problem. Furthermore, we resolve a question on the computational complexity of a related problem left open in Moulton {\\em et al.}", acknowledgement = ack-nhfb, keywords = "combinatorial algorithms; trees", } @Article{Hsieh:2008:OAI, author = "Yong-Hsiang Hsieh and Chih-Chiang Yu and Biing-Feng Wang", title = "Optimal Algorithms for the Interval Location Problem with Range Constraints on Length and Average", journal = j-TCBB, volume = "5", number = "2", pages = "281--290", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Let $A$ be a sequence of $n$ real numbers, $L_1$ and $L_2$ be two integers such that $L_1 \leq L_2$, and $R_1$ and $R_2$ be two real numbers such that $R_1 \leq R_2$. An interval of $A$ is feasible if its length is between $L_1$ and $L_2$ and its average is between $R_1$ and $R_2$. In this paper, we study the following problems: finding all feasible intervals of $A$, counting all feasible intervals of $A$, finding a maximum cardinality set of non-overlapping feasible intervals of $A$, locating a longest feasible interval of $A$, and locating a shortest feasible interval of $A$. The problems are motivated from the problem of locating CpG islands in biomolecular sequences. In this paper, we firstly show that all the problems have an $\Omega(n \log n)$-time lower bound in the comparison model. Then, we use geometric approaches to design optimal algorithms for the problems. All the presented algorithms run in an on-line manner and use $O(n)$ space.", acknowledgement = ack-nhfb, keywords = "algorithms; analysis of algorithms; data structures; geometrical problems and computations", } @Article{Lamers:2008:PRX, author = "Susanna L. Lamers and Marco Salemi and Michael S. McGrath and Gary B. Fogel", title = "Prediction of {R5}, {X4}, and {R5X4} {HIV}-1 Coreceptor Usage with Evolved Neural Networks", journal = j-TCBB, volume = "5", number = "2", pages = "291--300", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The HIV-1 genome is highly heterogeneous. This variation affords the virus a wide range of molecular properties, including the ability to infect cell types, such as macrophages and lymphocytes, expressing different chemokine receptors on the cell surface. In particular, R5 HIV-1 viruses use CCR5 as co-receptor for viral entry, X4 viruses use CXCR4, whereas some viral strains, known as R5X4 or D-tropic, have the ability to utilize both co-receptors. X4 and R5X4 viruses are associated with rapid disease progression to AIDS. R5X4 viruses differ in that they have yet to be characterized by the examination of the genetic sequence of HIV-1 alone. In this study, a series of experiments was performed to evaluate different strategies of feature selection and neural network optimization. We demonstrate the use of artificial neural networks trained via evolutionary computation to predict viral co-receptor usage. The results indicate identification of R5X4 viruses with predictive accuracy of 75.5\%.", acknowledgement = ack-nhfb, keywords = "AIDS; artificial neural networks; Computational intelligence; dual-tropic viruses; evolutionary computation; HIV; phenotype prediction; tropism", } @Article{vanIersel:2008:SIT, author = "Leo van Iersel and Judith Keijsper and Steven Kelk and Leen Stougie", title = "Shorelines of Islands of Tractability: Algorithms for Parsimony and Minimum Perfect Phylogeny Haplotyping Problems", journal = j-TCBB, volume = "5", number = "2", pages = "301--312", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The problem Parsimony Haplotyping (PH) asks for the smallest set of haplotypes which can explain a given set of genotypes, and the problem Minimum Perfect Phylogeny Haplotyping (MPPH) asks for the smallest such set which also allows the haplotypes to be embedded in a perfect phylogeny, an evolutionary tree with biologically-motivated restrictions. For PH, we extend recent work by further mapping the interface between ``easy'' and ``hard'' instances, within the framework of $(k,l)$-bounded instances where the number of 2's per column and row of the input matrix is restricted. By exploring, in the same way, the tractability frontier of MPPH we provide the first concrete, positive results for this problem. In addition, we construct for both PH and MPPH polynomial time approximation algorithms, based on properties of the columns of the input matrix.", acknowledgement = ack-nhfb, keywords = "biology and genetics; combinatorial algorithms; complexity hierarchies", } @Article{Brinza:2008:SPM, author = "Dumitru Brinza and Alexander Zelikovsky", title = "{2SNP}: Scalable Phasing Method for Trios and Unrelated Individuals", journal = j-TCBB, volume = "5", number = "2", pages = "313--318", month = apr, year = "2008", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Thu Jun 12 16:59:29 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Emerging microarray technologies allow affordable typing of very long genome sequences. A key challenge in analyzing of such huge amount of data is scalable and accurate computational inferring of haplotypes (i.e., splitting of each genotype into a pair of corresponding haplotypes). In this paper, we first phase genotypes consisting only of two SNPs using genotypes frequencies adjusted to the random mating model and then extend phasing of two-SNP genotypes to phasing of complete genotypes using maximum spanning trees. Runtime of the proposed 2SNP algorithm is $O(nm (n + \log m)$, where n and m are the numbers of genotypes and SNPs, respectively, and it can handle genotypes spanning entire chromosomes in a matter of hours. On datasets across 23 chromosomal regions from HapMap[11], 2SNP is several orders of magnitude faster than GERBIL and PHASE while matching them in quality measured by the number of correctly phased genotypes, single-site and switching errors. For example the 2SNP software phases entire chromosome ($10^5$ SNPs from HapMap) for 30 individuals in 2 hours with average switching error 7.7\%. We have also enhanced 2SNP algorithm to phase family trio data and compared it with four other well-known phasing methods on simulated data from [15]. 2SNP is much faster than all of them while losing in quality only to PHASE. 2SNP software is publicly available at \texttt{http://alla.cs.gsu.edu/~software/2SNP}.", acknowledgement = ack-nhfb, keywords = "algorithm; genotype; haplotype; phasing; SNP", } @Article{Mandoiu:2008:GEI, author = "Ion I. Mandoiu and Yi Pan and Alexander Zelikovsky", title = "{Guest Editors}' Introduction to the Special Section on Bioinformatics Research and Applications", journal = j-TCBB, volume = "5", number = "3", pages = "321--322", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.85", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sridhar:2008:MIL, author = "Srinath Sridhar and Fumei Lam and Guy E. Blelloch and R. Ravi and Russell Schwartz", title = "Mixed Integer Linear Programming for Maximum-Parsimony Phylogeny Inference", journal = j-TCBB, volume = "5", number = "3", pages = "323--331", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.26", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Reconstruction of phylogenetic trees is a fundamental problem in computational biology. While excellent heuristic methods are available for many variants of this problem, new advances in phylogeny inference will be required if we are to be able to continue to make effective use of the rapidly growing stores of variation data now being gathered. In this paper, we present two integer linear programming (ILP) formulations to find the most parsimonious phylogenetic tree from a set of binary variation data. One method uses a flow-based formulation that can produce exponential numbers of variables and constraints in the worst case. The method has, however, proven extremely efficient in practice on datasets that are well beyond the reach of the available provably efficient methods, solving several large mtDNA and Y-chromosome instances within a few seconds and giving provably optimal results in times competitive with fast heuristics than cannot guarantee optimality. An alternative formulation establishes that the problem can be solved with a polynomial-sized ILP. We further present a web server developed based on the exponential-sized ILP that performs fast maximum parsimony inferences and serves as a front end to a database of precomputed phylogenies spanning the human genome.", acknowledgement = ack-nhfb, keywords = "algorithms; computational biology; integer linear programming; maximum parsimony; phylogenetic tree reconstruction; Steiner tree problem", } @Article{Bernt:2008:SPR, author = "Matthias Bernt and Daniel Merkle and Martin Middendorf", title = "Solving the Preserving Reversal Median Problem", journal = j-TCBB, volume = "5", number = "3", pages = "332--347", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.39", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Genomic rearrangement operations can be very useful to infer the phylogenetic relationship of gene orders representing species. We study the problem of finding potential ancestral gene orders for the gene orders of given taxa, such that the corresponding rearrangement scenario has a minimal number of reversals, and where each of the reversals has to preserve the common intervals of the given input gene orders. Common intervals identify sets of genes that occur consecutively in all input gene orders. The problem of finding such an ancestral gene order is called the preserving reversal median problem (pRMP). A tree-based data structure for the representation of the common intervals of all input gene orders is used in our exact algorithm TCIP for solving the pRMP. It is known that the minimum number of reversals to transform one gene order into another can be computed in polynomial time, whereas the corresponding problem with the restriction that common intervals should not be destroyed is already NP-hard. It is shown theoretically that TCIP can solve a large class of pRMP instances in polynomial time. Empirically we show the good performance of TCIP on biological and artificial data.", acknowledgement = ack-nhfb, keywords = "biology and genetics; permutations and combinations", } @Article{Braga:2008:ESS, author = "Mar{\'\i}lia D. V. Braga and Marie-France Sagot and Celine Scornavacca and Eric Tannier", title = "Exploring the Solution Space of Sorting by Reversals, with Experiments and an Application to Evolution", journal = j-TCBB, volume = "5", number = "3", pages = "348--356", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.16", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "In comparative genomics, algorithms that sort permutations by reversals are often used to propose evolutionary scenarios of rearrangements between species. One of the main problems of such methods is that they give one solution while the number of optimal solutions is huge, with no criteria to discriminate among them. Bergeron et al. started to give some structure to the set of optimal solutions, in order to be able to deliver more presentable results than only one solution or a complete list of all solutions. However, no algorithm exists so far to compute this structure except through the enumeration of all solutions, which takes too much time even for small permutations. Bergeron et al. state as an open problem the design of such an algorithm. We propose in this paper an answer to this problem, that is, an algorithm which gives all the classes of solutions and counts the number of solutions in each class, with a better theoretical and practical complexity than the complete enumeration method. We give an example of how to reduce the number of classes obtained, using further constraints. Finally, we apply our algorithm to analyse the possible scenarios of rearrangement between mammalian sex chromosomes.", acknowledgement = ack-nhfb, keywords = "common intervals; evolution; genome rearrangements; perfect sorting; sex chromosomes; signed permutations; sorting by reversals", } @Article{Vassura:2008:RSP, author = "Marco Vassura and Luciano Margara and Pietro Di Lena and Filippo Medri and Piero Fariselli and Rita Casadio", title = "Reconstruction of {$3$D} Structures From Protein Contact Maps", journal = j-TCBB, volume = "5", number = "3", pages = "357--367", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.27", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The prediction of the protein tertiary structure from solely its residue sequence (the so called Protein Folding Problem) is one of the most challenging problems in Structural Bioinformatics. We focus on the protein residue contact map. When this map is assigned it is possible to reconstruct the 3D structure of the protein backbone. The general problem of recovering a set of 3D coordinates consistent with some given contact map is known as a unit-disk-graph realization problem and it has been recently proven to be NP-Hard. In this paper we describe a heuristic method (COMAR) that is able to reconstruct with an unprecedented rate (3-15 seconds) a 3D model that exactly matches the target contact map of a protein. Working with a non-redundant set of 1760 proteins, we find that the scoring efficiency of finding a 3D model very close to the protein native structure depends on the threshold value adopted to compute the protein residue contact map. Contact maps whose threshold values range from 10 to 18 {\AA}ngstroms allow reconstructing 3D models that are very similar to the proteins native structure.", acknowledgement = ack-nhfb, keywords = "combinatorial algorithms; contact map; molecular modeling; protein structure prediction", } @Article{Lee:2008:IEN, author = "George Lee and Carlos Rodriguez and Anant Madabhushi", title = "Investigating the Efficacy of Nonlinear Dimensionality Reduction Schemes in Classifying Gene and Protein Expression Studies", journal = j-TCBB, volume = "5", number = "3", pages = "368--384", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.36", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The recent explosion in procurement and availability of high-dimensional gene- and protein-expression profile datasets for cancer diagnostics has necessitated the development of sophisticated machine learning tools with which to analyze them. A major limitation in the ability to accurate classify these high-dimensional datasets stems from the 'curse of dimensionality', occurring in situations where the number of genes or peptides significantly exceeds the total number of patient samples. Previous attempts at dealing with this issue have mostly centered on the use of a dimensionality reduction (DR) scheme, Principal Component Analysis (PCA), to obtain a low-dimensional projection of the high-dimensional data. However, linear PCA and other linear DR methods, which rely on Euclidean distances to estimate object similarity, do not account for the inherent underlying nonlinear structure associated with most biomedical data. The motivation behind this work is to identify the appropriate DR methods for analysis of high-dimensional gene- and protein-expression studies. Towards this end, we empirically and rigorously compare three nonlinear (Isomap, Locally Linear Embedding, Laplacian Eigenmaps) and three linear DR schemes (PCA, Linear Discriminant Analysis, Multidimensional Scaling) with the intent of determining a reduced subspace representation in which the individual object classes are more easily discriminable.", acknowledgement = ack-nhfb, keywords = "and association rules; Bioinformatics (genome or protein) databases; classification; clustering; data and knowledge visualization; data mining; feature extraction or construction", } @Article{Cho:2008:CHC, author = "Hyuk Cho and Inderjit S. Dhillon", title = "Coclustering of Human Cancer Microarrays Using Minimum Sum-Squared Residue Coclustering", journal = j-TCBB, volume = "5", number = "3", pages = "385--400", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70268", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "It is a consensus in microarray analysis that identifying potential local patterns, characterized by coherent groups of genes and conditions, may shed light on the discovery of previously undetectable biological cellular processes of genes as well as macroscopic phenotypes of related samples. In order to simultaneously cluster genes and conditions, we have previously developed a fast co-clustering algorithm, Minimum Sum-Squared Residue Co-clustering (MSSRCC), which employs an alternating minimization scheme and generates what we call co-clusters in a checkerboard structure. In this paper, we propose specific strategies that enable MSSRCC to escape poor local minima and resolve the degeneracy problem in partitional clustering algorithms. The strategies include binormalization, deterministic spectral initialization, and incremental local search. We assess the effects of various strategies on both synthetic gene expression datasets and real human cancer microarrays and provide empirical evidence that MSSRCC with the proposed strategies performs better than existing co-clustering and clustering algorithms. In particular, the combination of all the three strategies leads to the best performance. Furthermore, we illustrate coherence of the resulting co-clusters in a checkerboard structure, where genes in a co-cluster manifest the phenotype structure of corresponding specific samples, and evaluate the enrichment of functional annotations in Gene Ontology (GO).", acknowledgement = ack-nhfb, keywords = "binormalization; co-clustering; deterministic spectral initialization; gene ontology; local search; microarray analysis", } @Article{Wei:2008:IGF, author = "Peng Wei and Wei Pan", title = "Incorporating Gene Functions into Regression Analysis of {DNA}-Protein Binding Data and Gene Expression Data to Construct Transcriptional Networks", journal = j-TCBB, volume = "5", number = "3", pages = "401--415", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.1062", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Useful information on transcriptional networks has been extracted by regression analyses of gene expression data and DNA-protein binding data. However, a potential limitation of these approaches is their assumption on the common and constant activity level of a transcription factor (TF) on all the genes in any given experimental condition; for example, any TF is assumed to be either an activator or a repressor, but not both, while it is known that some TFs can be dual regulators. Rather than assuming a common linear regression model for all the genes, we propose using separate regression models for various gene groups; the genes can be grouped based on their functions or some clustering results. Furthermore, to take advantage of the hierarchical structure of many existing gene function annotation systems, such as Gene Ontology (GO), we propose a shrinkage method that borrows information from relevant gene groups. Applications to a yeast dataset and simulations lend support for our proposed methods. In particular, we find that the shrinkage method consistently works well under various scenarios. We recommend the use of the shrinkage method as a useful alternative to the existing methods.", acknowledgement = ack-nhfb, keywords = "LASSO; microarray; shrinkage estimator; stratified analysis; transcription factor", } @Article{Mak:2008:PPS, author = "Man-Wai Mak and Jian Guo and Sun-Yuan Kung", title = "{PairProSVM}: Protein Subcellular Localization Based on Local Pairwise Profile Alignment and {SVM}", journal = j-TCBB, volume = "5", number = "3", pages = "416--422", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70256", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The subcellular locations of proteins are important functional annotations. An effective and reliable subcellular localization method is necessary for proteomics research. This paper introduces a new method---PairProSVM---to automatically predict the subcellular locations of proteins. The profiles of all protein sequences in the training set are constructed by PSI-BLAST and the pairwise profile-alignment scores are used to form feature vectors for training a support vector machine (SVM) classifier. It was found that PairProSVM outperforms the methods that are based on sequence alignment and amino-acid compositions even if most of the homologous sequences have been removed. This paper also demonstrates that the performance of PairProSVM is sensitive (and somewhat proportional) to the degree of its kernel matrix meeting the Mercer's condition. PairProSVM was evaluated on Reinhardt and Hubbard's, Huang and Li's, and Gardy et al.'s protein datasets. The overall accuracies on these three datasets reach 99.3\\\%, 76.5\\\%, and 91.9\\\%, respectively, which are higher than or comparable to those obtained by sequence alignment and by the methods compared in this paper.", acknowledgement = ack-nhfb, keywords = "kernel methods; Mercer condition; profile alignment; subcellular localization; support vector machines", } @Article{Elo:2008:ROT, author = "Laura L. Elo and Sanna Filen and Riitta Lahesmaa and Tero Aittokallio", title = "Reproducibility-Optimized Test Statistic for Ranking Genes in Microarray Studies", journal = j-TCBB, volume = "5", number = "3", pages = "423--431", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/tcbb.2007.1078", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "A principal goal of microarray studies is to identify the genes showing differential expression under distinct conditions. In such studies, the selection of an optimal test statistic is a crucial challenge, which depends on the type and amount of data under analysis. While previous studies on simulated or spike-in datasets do not provide practical guidance on how to choose the best method for a given real dataset, we introduce an enhanced reproducibility-optimization procedure, which enables the selection of a suitable gene- anking statistic directly from the data. In comparison with existing ranking methods, the reproducibility-optimized statistic shows good performance consistently under various simulated conditions and on Affymetrix spike-in dataset. Further, the feasibility of the novel statistic is confirmed in a practical research setting using data from an in-house cDNA microarray study of asthma-related gene expression changes. These results suggest that the procedure facilitates the selection of an appropriate test statistic for a given dataset without relying on a priori assumptions, which may bias the findings and their interpretation. Moreover, the general reproducibility-optimization procedure is not limited to detecting differential expression only but could be extended to a wide range of other applications as well.", acknowledgement = ack-nhfb, keywords = "bootstrap; differential expression; gene expression; gene ranking; microarray; reproducibility", } @Article{Parker:2008:SPT, author = "Douglass Stott Parker and Ruey-Lung Hsiao and Yi Xing and Alissa M. Resch and Christopher J. Lee", title = "Solving the Problem of Trans-Genomic Query with Alignment Tables", journal = j-TCBB, volume = "5", number = "3", pages = "432--447", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.1073", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The trans-genomic query (TGQ) problem -- enabling the free query of biological information, even across genomes -- is a central challenge facing bioinformatics. Solutions to this problem can alter the nature of the field, moving it beyond the jungle of data integration and expanding the number and scope of questions that can be answered. An alignment table is a binary relationship on locations (sequence segments). An important special case of alignment tables are hit tables ? tables of pairs of highly similar segments produced by alignment tools like BLAST. However, alignment tables also include general binary relationships, and can represent any useful connection between sequence locations. They can be curated, and provide a high-quality queryable backbone of connections between biological information. Alignment tables thus can be a natural foundation for TGQ, as they permit a central part of the TGQ problem to be reduced to purely technical problems involving tables of locations. Key challenges in implementing alignment tables include efficient representation and indexing of sequence locations. We define a location datatype that can be incorporated naturally into common off-the-shelf database systems. We also describe an implementation of alignment tables in BLASTGRES, an extension of the open-source POSTGRESQL database system that provides indexing and operators on locations required for querying alignment tables. This paper also reviews several successful large-scale applications of alignment tables for Trans-Genomic Query. Tables with millions of alignments have been used in queries about alternative splicing, an area of genomic analysis concerning the way in which a single gene can yield multiple transcripts. Comparative genomics is a large potential application area for TGQ and alignment tables.", acknowledgement = ack-nhfb, } @Article{Dawy:2008:FSG, author = "Zaher Dawy and Michel Sarkis and Joachim Hagenauer and Jakob C. Mueller", title = "Fine-Scale Genetic Mapping Using Independent Component Analysis", journal = j-TCBB, volume = "5", number = "3", pages = "448--460", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.1072", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The aim of genetic mapping is to locate the loci responsible for specific traits such as complex diseases. These traits are normally caused by mutations at multiple loci of unknown locations and interactions. In this work, we model the biological system that relates DNA polymorphisms with complex traits as a linear mixing process. Given this model, we propose a new fine-scale genetic mapping method based on independent component analysis. The proposed method outputs both independent associated groups of SNPs in addition to specific associated SNPs with the phenotype. It is applied to a clinical data set for the Schizophrenia disease with 368 individuals and 42 SNPs. It is also applied to a simulation study to investigate in more depth its performance. The obtained results demonstrate the novel characteristics of the proposed method compared to other genetic mapping methods. Finally, we study the robustness of the proposed method with missing genotype values and limited sample sizes.", acknowledgement = ack-nhfb, keywords = "association mapping; complex diseases; independent component analysis (ICA); linkage disequilibrium; principal component analysis (PCA); single nucleotide polymorphisms (SNPs)", } @Article{Hendy:2008:HCK, author = "Michael D. Hendy and Sagi Snir", title = "{Hadamard} Conjugation for the {Kimura} {3ST} Model: Combinatorial Proof Using Path Sets", journal = j-TCBB, volume = "5", number = "3", pages = "461--471", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70227", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Under a stochastic model of molecular sequence evolution the probability of each possible pattern of a characters is well defined. The Kimura's three-substitution-types (K3ST) model of evolution, allows analytical expression for these probabilities of by means of the Hadamard conjugation as a function of the phylogeny T and the substitution probabilities on each edge of TM . In this paper we produce a direct combinatorial proof of these results, using pathset distances which generalise pairwise distances between sequences. This interpretation provides us with tools that were proved useful in related problems in the mathematical analysis of sequence evolution.", acknowledgement = ack-nhfb, keywords = "Hadamard conjugation; K3ST model; path-sets; phylogenetic invariants; phylogenetic trees", } @Article{Gambette:2008:ILP, author = "Philippe Gambette and Daniel H. Huson", title = "Improved Layout of Phylogenetic Networks", journal = j-TCBB, volume = "5", number = "3", pages = "472--479", month = jul, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/tcbb.2007.1046", ISSN = "1545-5963", bibdate = "Fri Oct 10 12:59:44 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Split networks are increasingly being used in phylogenetic analysis. Usually, a simple equal angle algorithm is used to draw such networks, producing layouts that leave much room for improvement. Addressing the problem of producing better layouts of split networks, this paper presents an algorithm for maximizing the area covered by the network, describes an extension of the equal-daylight algorithm to networks, looks into using a spring embedder and discusses how to construct rooted split networks.", acknowledgement = ack-nhfb, keywords = "algorithms; graph drawing; phylogenetic networks; phylogenetics", } @Article{Gusfield:2008:EE, author = "Dan Gusfield", title = "{EIC} Editorial", journal = j-TCBB, volume = "5", number = "4", pages = "481--481", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.115", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Giancarlo:2008:GEI, author = "Raffaele Giancarlo and Sridhar Hannenhalli", title = "{Guest Editors}' Introduction to the Special Section on Algorithms in Bioinformatics", journal = j-TCBB, volume = "5", number = "4", pages = "482--483", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.116", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Jeong:2008:ISP, author = "Jieun Jeong and Piotr Berman and Teresa M. Przytycka", title = "Improving Strand Pairing Prediction through Exploring Folding Cooperativity", journal = j-TCBB, volume = "5", number = "4", pages = "484--491", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.88", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "The topology of $\beta$-sheets is defined by the pattern of hydrogen-bonded strand pairing. Therefore, predicting hydrogen bonded strand partners is a fundamental step towards predicting $\beta$-sheet topology. At the same time, finding the correct partners is very difficult due to long range interactions involved in strand pairing. Additionally, patterns of aminoacids involved, in $\beta$-sheet formations are very general and therefore difficult to use for computational recognition of specific contacts between strands. In this work, we report a new strand pairing algorithm. To address above mentioned difficulties, our algorithm attempts to mimic elements of the folding process. Namely, in addition to ensuring that the predicted hydrogen bonded strand pairs satisfy basic global consistency constraints, it takes into account hypothetical folding pathways. Consistently with this view, introducing hydrogen bonds between a pair of strands changes the probabilities of forming hydrogen bonds between other pairs of strand. We demonstrate that this approach provides an improvement over previously proposed algorithms. We also compare the performance of this method to that of a global optimization algorithm that poses the problem as integer linear programming optimization problem and solves it using ILOG CPLEX\texttrademark\ package.", acknowledgement = ack-nhfb, keywords = "Biology and genetics; Combinatorial algorithms", } @Article{Genovese:2008:SAH, author = "Loredana M. Genovese and Filippo Geraci and Marco Pellegrini", title = "{SpeedHap}: An Accurate Heuristic for the Single Individual {SNP} Haplotyping Problem with Many Gaps, High Reading Error Rate and Low Coverage", journal = j-TCBB, volume = "5", number = "4", pages = "492--502", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.67", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "Single nucleotide polymorphism (SNP) is the most frequent form of DNA variation. The set of SNP's present in a chromosome (called the em haplotype) is of interest in a wide area of applications in molecular biology and biomedicine, including diagnostic and medical therapy. In this paper we propose a new heuristic method for the problem of haplotype reconstruction for (portions of) a pair of homologous human chromosomes from a single individual (SIH). The problem is well known in literature and exact algorithms have been proposed for the case when no (or few) gaps are allowed in the input fragments. These algorithms, though exact and of polynomial complexity, are slow in practice. When gaps are considered no exact method of polynomial complexity is known. The problem is also hard to approximate with guarantees. Therefore fast heuristics have been proposed. In this paper we describe SpeedHap, a new heuristic method that is able to tackle the case of many gapped fragments and retains its effectiveness even when the input fragments have high rate of reading errors (up to 20\%) and low coverage (as low as 3). We test SpeedHap on real data from the HapMap Project.", acknowledgement = ack-nhfb, keywords = "Algorithms; Biology and genetics", } @Article{Lozano:2008:STA, author = "Antoni Lozano and Ron Y. Pinter and Oleg Rokhlenko and Gabriel Valiente and Michal Ziv-Ukelson", title = "Seeded Tree Alignment", journal = j-TCBB, volume = "5", number = "4", pages = "503--513", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.59", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "The optimal transformation of one tree into another by means of elementary edit operations is an important algorithmic problem that has several interesting applications to computational biology. Here we introduce a constrained form of this problem in which a partial mapping of a set of nodes (the 'seeds') in one tree to a corresponding set of nodes in the other tree is given, and present efficient algorithms for both ordered and unordered trees. Whereas ordered tree matching based on seeded nodes has applications in pattern matching of RNA structures, unordered tree matching based on seeded nodes has applications in co-speciation and phylogeny reconciliation. The latter involves the solution of the planar tanglegram layout problem, for which a polynomial-time algorithm is given here.", acknowledgement = ack-nhfb, keywords = "Biology and genetics; Computer Applications; Discrete Mathematics; Graph algorithms; Graph Theory; Life and Medical Sciences; Mathematics of Computing; Trees", } @Article{Bansal:2008:NLS, author = "Mukul S. Bansal and Oliver Eulenstein", title = "An {$\Omega(n^2/ \log n)$} Speed-Up of {TBR} Heuristics for the Gene-Duplication Problem", journal = j-TCBB, volume = "5", number = "4", pages = "514--524", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.69", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "The gene-duplication problem is to infer a species supertree from gene trees that are confounded by complex histories of gene duplications. This problem is NP-hard and thus requires efficient and effective heuristics. Existing heuristics perform a stepwise search of the tree space, where each step is guided by an exact solution to an instance of a local search problem. We improve on the time complexity of the local search problem by a factor of n2= log n, where n is the size of the resulting species supertree. Typically, several thousand instances of the local search problem are solved throughout a stepwise heuristic search. Hence, our improvement makes the gene-duplication problem much more tractable for large-scale phylogenetic analyses.", acknowledgement = ack-nhfb, keywords = "Algorithms; Computational Biology; Gene Duplication; Phylogenetics; Supertrees", } @Article{Wang:2008:DCO, author = "Xueyi Wang and Jack Snoeyink", title = "Defining and Computing Optimum {RMSD} for Gapped and Weighted Multiple-Structure Alignment", journal = j-TCBB, volume = "5", number = "4", pages = "525--533", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.92", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "Pairwise structure alignment commonly uses root mean square deviation (RMSD) to measure the structural similarity, and methods for optimizing RMSD are well established. We extend RMSD to weighted RMSD for multiple structures. By using multiplicative weights, we show that weighted RMSD for all pairs is the same as weighted RMSD to an average of the structures. Thus, using RMSD or weighted RMSD implies that the average is a consensus structure. Although we show that in general, the two tasks of finding the optimal translations and rotations for minimizing weighted RMSD cannot be separated for multiple structures like they can for pairs, an inherent difficulty and a fact ignored by previous work, we develop a near-linear iterative algorithm to converge weighted RMSD to a local minimum. 10,000 experiments of gapped alignment done on each of 23 protein families from HOMSTRAD (where each structure starts with a random translation and rotation) converge rapidly to the same minimum. Finally we propose a heuristic method to iteratively remove the effect of outliers and find well-aligned positions that determine the structural conserved region by modeling B-factors and deviations from the average positions as weights and iteratively assigning higher weights to better aligned atoms.", acknowledgement = ack-nhfb, keywords = "multiple structure alignment; optimization methods; structural conserved region; weighted RMSD", } @Article{Yao:2008:EAE, author = "Peggy Yao and Ankur Dhanik and Nathan Marz and Ryan Propper and Charles Kou and Guanfeng Liu and Henry van den Bedem and Jean-Claude Latombe and Inbal Halperin-Landsberg and Russ B. Altman", title = "Efficient Algorithms to Explore Conformation Spaces of Flexible Protein Loops", journal = j-TCBB, volume = "5", number = "4", pages = "534--545", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.96", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "Several applications in biology - e.g., incorporation of protein flexibility in ligand docking algorithms, interpretation of fuzzy X-ray crystallographic data, and homology modeling - require computing the internal parameters of a flexible fragment (usually, a loop) of a protein in order to connect its termini to the rest of the protein without causing any steric clash. One must often sample many such conformations in order to explore and adequately represent the conformational range of the studied loop. While sampling must be fast, it is made difficult by the fact that two conflicting constraints - kinematic closure and clash avoidance - must be satisfied concurrently. This paper describes two efficient and complementary sampling algorithms to explore the space of closed clash-free conformations of a flexible protein loop. The 'seed sampling' algorithm samples broadly from this space, while the 'deformation sampling' algorithm uses seed conformations as starting points to explore the conformation space around them at a finer grain. Computational results are presented for various loops ranging from 5 to 25 residues. More specific results also show that the combination of the sampling algorithms with a functional site prediction software (FEATURE) makes it possible to compute and recognize calcium-binding loop conformations. The sampling algorithms are implemented in a toolkit (LoopTK), which is available at https://simtk.org/home/looptk.", acknowledgement = ack-nhfb, keywords = "Biology and genetics; Robotics", } @Article{Kim:2008:LSS, author = "Eagu Kim and John Kececioglu", title = "Learning Scoring Schemes for Sequence Alignment from Partial Examples", journal = j-TCBB, volume = "5", number = "4", pages = "546--556", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.57", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "When aligning biological sequences, the choice of parameter values for the alignment scoring function is critical. Small changes in gap penalties, for example, can yield radically different alignments. A rigorous way to compute parameter values that are appropriate for aligning biological sequences is through inverse parametric sequence alignment. Given a collection of examples of biologically correct alignments, this is the problem of finding parameter values that make the scores of the example alignments close to those of optimal alignments for their sequences. We extend prior work on inverse parametric alignment to partial examples, which contain regions where the alignment is left unspecified, and to an improved formulation based on minimizing the average error between the score of an example and the score of an optimal alignment. Experiments on benchmark biological alignments show we can find parameters that generalize across protein families and that boost the accuracy of multiple sequence alignment by as much as 25\%.", acknowledgement = ack-nhfb, keywords = "Analysis of Algorithms and Problem Complexity; Biology and genetics; Linear programming; Pattern matching", } @Article{Schliep:2008:EAC, author = "Alexander Schliep and Roland Krause", title = "Efficient Algorithms for the Computational Design of Optimal Tiling Arrays", journal = j-TCBB, volume = "5", number = "4", pages = "557--567", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.50", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "The representation of a genome by oligonucleotide probes is a prerequisite for the analysis of many of its basic properties, such as transcription factor binding sites, chromosomal breakpoints, gene expression of known genes and detection of novel genes, in particular those coding for small RNAs. An ideal representation would consist of a high density set of oligonucleotides with similar melting temperatures that do not cross-hybridize with other regions of the genome and are equidistantly spaced. The implementation of such design is typically called a tiling array or genome array. We formulate the minimal cost tiling path problem for the selection of oligonucleotides from a set of candidates. Computing the selection of probes requires multi-criterion optimization, which we cast into a shortest path problem. Standard algorithms running in linear time allow us to compute globally optimal tiling paths from millions of candidate oligonucleotides on a standard desktop computer for most problem variants. The solutions to this multi-criterion optimization are spatially adaptive to the problem instance. Our formulation incorporates experimental constraints with respect to specific regions of interest and trade offs between hybridization parameters, probe quality and tiling density easily.", acknowledgement = ack-nhfb, keywords = "Biology and genetics; Graph Theory", } @Article{Yu:2008:CAA, author = "Zeyun Yu and Chandrajit Bajaj", title = "Computational Approaches for Automatic Structural Analysis of Large Biomolecular Complexes", journal = j-TCBB, volume = "5", number = "4", pages = "568--582", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70226", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "We present computational solutions to two problems of macromolecular structure interpretation from reconstructed three-dimensional electron microscopy (3D-EM) maps of large bio-molecular complexes at intermediate resolution (5A-15A). Thetwo problems addressed are: (a) 3D structural alignment (matching)between identified and segmented 3D maps of structure units(e.g. trimeric configuration of proteins), and (b) the secondary structure identification of a segmented protein 3D map (i.e.locations of a-helices, b -sheets). For problem (a), we present an efficient algorithm to correlate spatially (and structurally)two 3D maps of structure units. Besides providing a similarity score between structure units, the algorithm yields an effective technique for resolution refinement of repeated structure units,by 3D alignment and averaging. For problem (b), we present an efficient algorithm to compute eigenvalues and link eigenvectors of a Gaussian convoluted structure tensor derived from the protein 3D Map, thereby identifying and locating secondary structural motifs of proteins. The efficiency and performance of our approach is demonstrated on several experimentally reconstructed 3D maps of virus capsid shells from single-particle cryo-EM, as well as computationally simulated protein structure density 3D maps generated from protein model entries in the Protein Data Bank.", acknowledgement = ack-nhfb, keywords = "3D Reconstruction; Alignment; Cryo-EM Maps; Secondary Structure Detection; Segmentation; Similarity Measure; Skeletonization; Structure Analysis", } @Article{Christinat:2008:GED, author = "Yann Christinat and Bernd Wachmann and Lei Zhang", title = "Gene Expression Data Analysis Using a Novel Approach to Biclustering Combining Discrete and Continuous Data", journal = j-TCBB, volume = "5", number = "4", pages = "583--593", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70251", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "Many different methods exist for pattern detection in gene expression data. In contrast to classical methods, biclustering has the ability to cluster a group of genes together with a group of conditions (replicates, set of patients or drug compounds). However, since the problem is NP-complex, most algorithms use heuristic search functions and therefore might converge towards local maxima. By using the results of biclustering on discrete data as a starting point for a local search function on continuous data, our algorithm avoids the problem of heuristic initialization. Similar to OPSM, our algorithm aims to detect biclusters whose rows and columns can be ordered such that row values are growing across the bicluster's columns and vice-versa. Results have been generated on the yeast genome (Saccharomyces cerevisiae), a human cancer dataset and random data. Results on the yeast genome showed that 89\% of the one hundred biggest non-overlapping biclusters were enriched with Gene Ontology annotations. A comparison with OPSM and ISA demonstrated a better efficiency when using gene and condition orders. We present results on random and real datasets that show the ability of our algorithm to capture statistically significant and biologically relevant biclusters.", acknowledgement = ack-nhfb, keywords = "Bioinformatics (genome or protein) databases; Data and knowledge visualization; Data mining; Graph and tree search strategies; Machine learning", } @Article{Lacroix:2008:IMN, author = "Vincent Lacroix and Ludovic Cottret and Patricia Th{\'e}bault and Marie-France Sagot", title = "An Introduction to Metabolic Networks and Their Structural Analysis", journal = j-TCBB, volume = "5", number = "4", pages = "594--617", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.79", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "There has been a renewed interest for metabolism in the computational biology community, leading to an avalanche of papers coming from methodological network analysis as well as experimental and theoretical biology. This paper is meant to serve as an initial guide for both the biologists interested in formal approaches and the mathematicians or computer scientists wishing to inject more realism into their models. The paper is focused on the structural aspects of metabolism only. The literature is vast enough already, and the thread through it difficult to follow even for the more experienced worker in the field. We explain methods for acquiring data and reconstructing metabolic networks, and review the various models that have been used for their structural analysis. Several concepts such as modularity are introduced, as are the controversies that have beset the field these past few years, for instance, on whether metabolic networks are small-world or scale-free, and on which model better explains the evolution of metabolism. Clarifying the work that has been done also helps in identifying open questions and in proposing relevant future directions in the field, which we do along the paper and in the conclusion.", acknowledgement = ack-nhfb, keywords = "Biology and genetics; evolution; Graph Theory; Introductory and Survey; metabolic networks; modelling; modularity; reconstruction", } @Article{Satya:2008:UIP, author = "Ravi Vijaya Satya and Amar Mukherjee", title = "The Undirected Incomplete Perfect Phylogeny Problem", journal = j-TCBB, volume = "5", number = "4", pages = "618--629", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70218", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "The incomplete perfect phylogeny (IPP) problem and the incomplete perfect phylogeny haplotyping (IPPH) problem deal with constructing a phylogeny for a given set of haplotypes or genotypes with missing entries. The earlier approaches for both of these problems dealt with restricted versions of the problems, where the root is either available or can be trivially re-constructed from the data, or certain assumptions were made about the data. In this paper, we deal with the unrestricted versions of the problems, where the root of the phylogeny is neither available nor trivially recoverable from the data. Both IPP and IPPH problems have previously been proven to be NP complete. Here, we present efficient enumerative algorithms that can handle practical instances of the problem. Empirical analysis on simulated data shows that the algorithms perform very well both in terms of speed and in terms accuracy of the recovered data.", acknowledgement = ack-nhfb, keywords = "Haplotype Inference; Incomplete Perfect Phylogeny; Perfect Phylogeny; Phylogenetics", } @Article{Gondro:2008:OCM, author = "Cedric Gondro and Brian P. Kinghorn", title = "Optimization of {cDNA} Microarray Experimental Designs Using an Evolutionary Algorithm", journal = j-TCBB, volume = "5", number = "4", pages = "630--638", month = oct, year = "2008", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70222", ISSN = "1545-5963", bibdate = "Wed Jan 14 12:51:33 MST 2009", bibsource = "http://portal.acm.org/", abstract = "The cDNA microarray is an important tool for generating large datasets of gene expression measurements. An efficient design is critical to ensure that the experiment will be able to address relevant biological questions. Microarray experimental design can be treated as a multicriterion optimization problem. For this class of problems evolutionary algorithms (EAs) are well suited, as they can search the solution space and evolve a design that optimizes the parameters of interest based on their relative value to the researcher under a given set of constraints. This paper introduces the use of EAs for optimization of experimental designs of spotted microarrays using a weighted objective function. The EA and the various criteria relevant to design optimization are discussed. Evolved designs are compared with designs obtained through exhaustive search with results suggesting that the EA can find just as efficient optimal or near-optimal designs within a tractable timeframe.", acknowledgement = ack-nhfb, keywords = "Evolutionary computing and genetic algorithms; experimental design; global optimization; microarrays", } @Article{Gusfield:2009:FFY, author = "Dan Gusfield", title = "Final, Five-Year End, Editorial", journal = j-TCBB, volume = "6", number = "1", pages = "1--2", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sagot:2009:NEE, author = "Marie-France Sagot", title = "New {EIC} Editorial", journal = j-TCBB, volume = "6", number = "1", pages = "3--3", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Huson:2009:SSP, author = "Daniel H. Huson and Vincent Moulton and Mike Steel", title = "Special Section: Phylogenetics", journal = j-TCBB, volume = "6", number = "1", pages = "4--6", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Liu:2009:BWT, author = "Kevin Liu and Serita Nelesen and Sindhu Raghavan and C. Randal Linder and Tandy Warnow", title = "Barking Up The Wrong Treelength: The Impact of Gap Penalty on Alignment and Tree Accuracy", journal = j-TCBB, volume = "6", number = "1", pages = "7--21", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Minh:2009:BPD, author = "Bui Quang Minh and Fabio Pardi and Steffen Klaere and Arndt von Haeseler", title = "Budgeted Phylogenetic Diversity on Circular Split Systems", journal = j-TCBB, volume = "6", number = "1", pages = "22--29", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Linz:2009:HNT, author = "Simone Linz and Charles Semple", title = "Hybridization in Nonbinary Trees", journal = j-TCBB, volume = "6", number = "1", pages = "30--45", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Cardona:2009:MPN, author = "Gabriel Cardona and Merc{\`e} Llabr{\'e}s and Francesc Rossell{\'o} and Gabriel Valiente", title = "Metrics for Phylogenetic Networks {I}: Generalizations of the {Robinson--Foulds} Metric", journal = j-TCBB, volume = "6", number = "1", pages = "46--61", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Willson:2009:RTS, author = "Stephen J. Willson", title = "Robustness of Topological Supertree Methods for Reconciling Dense Incompatible Data", journal = j-TCBB, volume = "6", number = "1", pages = "62--75", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Allman:2009:ICM, author = "Elizabeth S. Allman and John A. Rhodes", title = "The Identifiability of Covarion Models in Phylogenetics", journal = j-TCBB, volume = "6", number = "1", pages = "76--88", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Matsen:2009:FTI, author = "Frederick A. Matsen", title = "{Fourier} Transform Inequalities for Phylogenetic Trees", journal = j-TCBB, volume = "6", number = "1", pages = "89--95", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Gusfield:2009:OEE, author = "Dan Gusfield", title = "Outgoing {EIC} Editorial for this Special Section of {TCBB} with the Theme of Phylogenetics", journal = j-TCBB, volume = "6", number = "1", pages = "96--96", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Grunewald:2009:MPT, author = "Stefan Gr{\"u}newald and Vincent Moulton", title = "Maximum Parsimony for Tree Mixtures", journal = j-TCBB, volume = "6", number = "1", pages = "97--102", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Huson:2009:DRP, author = "Daniel H. Huson", title = "Drawing Rooted Phylogenetic Networks", journal = j-TCBB, volume = "6", number = "1", pages = "103--109", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bordewich:2009:CTM, author = "Magnus Bordewich and Olivier Gascuel and Katharina T. Huber and Vincent Moulton", title = "Consistency of Topological Moves Based on the Balanced Minimum Evolution Principle of Phylogenetic Inference", journal = j-TCBB, volume = "6", number = "1", pages = "110--117", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Wu:2009:RPT, author = "Taoyang Wu and Vincent Moulton and Mike Steel", title = "Refining Phylogenetic Trees Given Additional Data: An Algorithm Based on Parsimony", journal = j-TCBB, volume = "6", number = "1", pages = "118--125", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Mossel:2009:SEA, author = "Elchanan Mossel and Sebastien Roch and Mike Steel", title = "Shrinkage Effect in Ancestral Maximum Likelihood", journal = j-TCBB, volume = "6", number = "1", pages = "126--133", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ma:2009:GCU, author = "Jianmin Ma and Minh N. Nguyen and Jagath C. Rajapakse", title = "Gene Classification Using Codon Usage and Support Vector Machines", journal = j-TCBB, volume = "6", number = "1", pages = "134--143", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Maitra:2009:IPO, author = "Ranjan Maitra", title = "Initializing Partition-Optimization Algorithms", journal = j-TCBB, volume = "6", number = "1", pages = "144--157", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Narasimhan:2009:SPG, author = "Sridharakumar Narasimhan and Raghunathan Rengaswamy and Rajanikanth Vadigepalli", title = "Structural Properties of Gene Regulatory Networks: Definitions and Connections", journal = j-TCBB, volume = "6", number = "1", pages = "158--170", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Anonymous:2009:RL, author = "Anonymous", title = "2008 Reviewers List", journal = j-TCBB, volume = "6", number = "1", pages = "171--173", month = jan, year = "2009", CODEN = "ITCBCY", ISSN = "1545-5963", bibdate = "Mon Mar 2 18:46:49 MST 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sagot:2009:EE, author = "Marie-France Sagot", title = "{EIC} Editorial", journal = j-TCBB, volume = "6", number = "2", pages = "177--177", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.44", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Mandoiu:2009:GEI, author = "Ion Mandoiu and Yi Pan and Raj Sunderraman and Alexander Zelikovsky", title = "{Guest Editors}' Introduction to the Special Section on Bioinformatics Research and Applications", journal = j-TCBB, volume = "6", number = "2", pages = "178--179", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.45", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Treangen:2009:NHL, author = "Todd J. Treangen and Aaron E. Darling and Guillaume Achaz and Mark A. Ragan and Xavier Messeguer and Eduardo P. C. Rocha", title = "A Novel Heuristic for Local Multiple Alignment of Interspersed {DNA} Repeats", journal = j-TCBB, volume = "6", number = "2", pages = "180--189", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.9", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Pairwise local sequence alignment methods have been the prevailing technique to identify homologous nucleotides between related species. However, existing methods that identify and align all homologous nucleotides in one or more genomes have suffered from poor scalability and limited accuracy. We propose a novel method that couples a gapped extension heuristic with an efficient filtration method for identifying interspersed repeats in genome sequences. During gapped extension, we use the MUSCLE implementation of progressive global multiple alignment with iterative refinement. The resulting gapped extensions potentially contain alignments of unrelated sequence. We detect and remove such undesirable alignments using a hidden Markov model (HMM) to predict the posterior probability of homology. The HMM emission frequencies for nucleotide substitutions can be derived from any time-reversible nucleotide substitution matrix. We evaluate the performance of our method and previous approaches on a hybrid data set of real genomic DNA with simulated interspersed repeats. Our method outperforms a related method in terms of sensitivity, positive predictive value, and localizing boundaries of homology. The described methods have been implemented in freely available software, Repeatoire, available from: http://wwwabi.snv.jussieu.fr/public/Repeatoire.", acknowledgement = ack-nhfb, keywords = "DNA repeats; gapped extension.; genome comparison; hidden Markov model; local multiple alignment; Sequence alignment", } @Article{Qiu:2009:FMK, author = "Shibin Qiu and Terran Lane", title = "A Framework for Multiple Kernel Support Vector Regression and Its Applications to {siRNA} Efficacy Prediction", journal = j-TCBB, volume = "6", number = "2", pages = "190--199", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.139", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "The cell defense mechanism of RNA interference has applications in gene function analysis and promising potentials in human disease therapy. To effectively silence a target gene, it is desirable to select appropriate initiator siRNA molecules having satisfactory silencing capabilities. Computational prediction for silencing efficacy of siRNAs can assist this screening process before using them in biological experiments. String kernel functions, which operate directly on the string objects representing siRNAs and target mRNAs, have been applied to support vector regression for the prediction and improved accuracy over numerical kernels in multidimensional vector spaces constructed from descriptors of siRNA design rules. To fully utilize information provided by string and numerical data, we propose to unify the two in a kernel feature space by devising a multiple kernel regression framework where a linear combination of the kernels is used. We formulate the multiple kernel learning into a quadratically constrained quadratic programming (QCQP) problem, which although yields global optimal solution, is computationally demanding and requires a commercial solver package. We further propose three heuristics based on the principle of kernel-target alignment and predictive accuracy. Empirical results demonstrate that multiple kernel regression can improve accuracy, decrease model complexity by reducing the number of support vectors, and speed up computational performance dramatically. In addition, multiple kernel regression evaluates the importance of constituent kernels, which for the siRNA efficacy prediction problem, compares the relative significance of the design rules. Finally, we give insights into the multiple kernel regression mechanism and point out possible extensions.", acknowledgement = ack-nhfb, keywords = "multiple kernel heuristics; Multiple kernel learning; QCQP optimization; RNA interference; siRNA efficacy.; support vector regression", } @Article{Park:2009:NBI, author = "Yongjin Park and Stanley Shackney and Russell Schwartz", title = "Network-Based Inference of Cancer Progression from Microarray Data", journal = j-TCBB, volume = "6", number = "2", pages = "200--212", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.126", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Cancer cells exhibit a common phenotype of uncontrolled cell growth, but this phenotype may arise from many different combinations of mutations. By inferring how cells evolve in individual tumors, a process called cancer progression, we may be able to identify important mutational events for different tumor types, potentially leading to new therapeutics and diagnostics. Prior work has shown that it is possible to infer frequent progression pathways by using gene expression profiles to estimate &\#8220;distances'' between tumors. Here, we apply gene network models to improve these estimates of evolutionary distance by controlling for correlations among coregulated genes. We test three variants of this approach: one using an optimized best-fit network, another using sampling to infer a high-confidence subnetwork, and one using a modular network inferred from clusters of similarly expressed genes. Application to lung cancer and breast cancer microarray data sets shows small improvements in phylogenies when correcting from the optimized network and more substantial improvements when correcting from the sampled or modular networks. Our results suggest that a network correction approach improves estimates of tumor similarity, but sophisticated network models are needed to control for the large hypothesis space and sparse data currently available.", acknowledgement = ack-nhfb, keywords = "Biology and genetics; graphs and networks; machine learning.; trees", } @Article{Zhu:2009:GGA, author = "Qian Zhu and Zaky Adam and Vicky Choi and David Sankoff", title = "Generalized Gene Adjacencies, Graph Bandwidth, and Clusters in Yeast Evolution", journal = j-TCBB, volume = "6", number = "2", pages = "213--220", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.121", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "We present a parameterized definition of gene clusters that allows us to control the emphasis placed on conserved order within a cluster. Though motivated by biological rather than mathematical considerations, this parameter turns out to be closely related to the bandwidth parameter of a graph. Our focus will be on how this parameter affects the characteristics of clusters: how numerous they are, how large they are, how rearranged they are, and to what extent they are preserved from ancestor to descendant in a phylogenetic tree. We infer the latter property by dynamic programming optimization of the presence of individual edges at the ancestral nodes of the phylogeny. We apply our analysis to a set of genomes drawn from the Yeast Gene Order Browser.", acknowledgement = ack-nhfb, keywords = "Ashbya gossypii; Candida glabrata; Comparative genomics; dynamic programming; evolution; gene clusters; genome rearrangements; graph bandwidth; Kluyveromyces lactis.; Kluyveromyces waltii; phylogeny; Saccharomyces cerevisiae; yeast", } @Article{Bansal:2009:GDP, author = "Mukul S. Bansal and Oliver Eulenstein and Andr{\'e} Wehe", title = "The Gene-Duplication Problem: Near-Linear Time Algorithms for {NNI}-Based Local Searches", journal = j-TCBB, volume = "6", number = "2", pages = "221--231", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.7", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "The gene-duplication problem is to infer a species supertree from a collection of gene trees that are confounded by complex histories of gene-duplication events. This problem is NP-complete and thus requires efficient and effective heuristics. Existing heuristics perform a stepwise search of the tree space, where each step is guided by an exact solution to an instance of a local search problem. A classical local search problem is the {\tt NNI} search problem, which is based on the nearest neighbor interchange operation. In this work, we 1) provide a novel near-linear time algorithm for the {\tt NNI} search problem, 2) introduce extensions that significantly enlarge the search space of the {\tt NNI} search problem, and 3) present algorithms for these extended versions that are asymptotically just as efficient as our algorithm for the {\tt NNI} search problem. The exceptional speedup achieved in the extended {\tt NNI} search problems makes the gene-duplication problem more tractable for large-scale phylogenetic analyses. We verify the performance of our algorithms in a comparison study using sets of large randomly generated gene trees.", acknowledgement = ack-nhfb, keywords = "Computational phylogenetics; gene-duplication; local search; supertrees; {\tt NNI}.", } @Article{Sun:2009:DPP, author = "Yanni Sun and Jeremy Buhler", title = "Designing Patterns and Profiles for Faster {HMM} Search", journal = j-TCBB, volume = "6", number = "2", pages = "232--243", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.14", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Profile HMMs are powerful tools for modeling conserved motifs in proteins. They are widely used by search tools to classify new protein sequences into families based on domain architecture. However, the proliferation of known motifs and new proteomic sequence data poses a computational challenge for search, requiring days of CPU time to annotate an organism's proteome. It is highly desirable to speed up HMM search in large databases. We design PROSITE-like patterns and short profiles that are used as filters to rapidly eliminate protein-motif pairs for which a full profile HMM comparison does not yield a significant match. The design of the pattern-based filters is formulated as a multichoice knapsack problem. Profile-based filters with high sensitivity are extracted from a profile HMM based on their theoretical sensitivity and false positive rate. Experiments show that our profile-based filters achieve high sensitivity (near 100 percent) while keeping around 20\times speedup with respect to the unfiltered search program. Pattern-based filters typically retain at least 90 percent of the sensitivity of the source HMM with 30-40\times speedup. The profile-based filters have sensitivity comparable to the multistage filtering strategy HMMERHEAD [15] and are faster in most of our experiments.", acknowledgement = ack-nhfb, keywords = "bioinformatics databases; Biology and genetics; hidden Markov models.; sequence similarity search", } @Article{Shaik:2009:FAS, author = "Jahangheer Shaik and Mohammed Yeasin", title = "Fuzzy-Adaptive-Subspace-Iteration-Based Two-Way Clustering of Microarray Data", journal = j-TCBB, volume = "6", number = "2", pages = "244--259", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.15", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "This paper presents Fuzzy-Adaptive-Subspace-Iteration-based Two-way Clustering (FASIC) of microarray data for finding differentially expressed genes (DEGs) from two-sample microarray experiments. The concept of fuzzy membership is introduced to transform the hard adaptive subspace iteration (ASI) algorithm into a fuzzy-ASI algorithm to perform two-way clustering. The proposed approach follows a progressive framework to assign a relevance value to genes associated with each cluster. Subsequently, each gene cluster is scored and ranked based on its potential to provide a correct classification of the sample classes. These ranks are converted into P values using the R-test, and the significance of each gene is determined. A fivefold validation is performed on the DEGs selected using the proposed approach. Empirical analyses on a number of simulated microarray data sets are conducted to quantify the results obtained using the proposed approach. To exemplify the efficacy of the proposed approach, further analyses on different real microarray data sets are also performed.", acknowledgement = ack-nhfb, keywords = "classification and association rules; Clustering; data and knowledge visualization; data mining; feature extraction or construction.", } @Article{Vignes:2009:GCI, author = "Matthieu Vignes and Florence Forbes", title = "Gene Clustering via Integrated {Markov} Models Combining Individual and Pairwise Features", journal = j-TCBB, volume = "6", number = "2", pages = "260--270", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70248", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Clustering of genes into groups sharing common characteristics is a useful exploratory technique for a number of subsequent computational analysis. A wide range of clustering algorithms have been proposed in particular to analyze gene expression data, but most of them consider genes as independent entities or include relevant information on gene interactions in a suboptimal way. We propose a probabilistic model that has the advantage to account for individual data (e.g., expression) and pairwise data (e.g., interaction information coming from biological networks) simultaneously. Our model is based on hidden Markov random field models in which parametric probability distributions account for the distribution of individual data. Data on pairs, possibly reflecting distance or similarity measures between genes, are then included through a graph, where the nodes represent the genes, and the edges are weighted according to the available interaction information. As a probabilistic model, this model has many interesting theoretical features. In addition, preliminary experiments on simulated and real data show promising results and points out the gain in using such an approach. Availability: The software used in this work is written in C++ and is available with other supplementary material at http://mistis.inrialpes.fr/people/forbes/transparentia/supplementary.html.", acknowledgement = ack-nhfb, keywords = "gene expression.; Markov random fields; metabolic networks; model-based clustering", } @Article{Heath:2009:SMN, author = "Lenwood S. Heath and Allan A. Sioson", title = "Semantics of Multimodal Network Models", journal = j-TCBB, volume = "6", number = "2", pages = "271--280", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70242", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "A multimodal network (MMN) is a novel graph-theoretic formalism designed to capture the structure of biological networks and to represent relationships derived from multiple biological databases. MMNs generalize the standard notions of graphs and hypergraphs, which are the bases of current diagrammatic representations of biological phenomena, and incorporate the concept of mode. Each vertex of an MMN is a biological entity, a biot, while each modal hyperedge is a typed relationship, where the type is given by the mode of the hyperedge. The semantics of each modal hyperedge e is given through denotational semantics, where a valuation function f\_{e} defines the relationship among the values of the vertices incident on e. The meaning of an MMN is denoted in terms of the semantics of a hyperedge sequence. A companion paper defines MMNs and concentrates on the structural aspects of MMNs. This paper develops MMN denotational semantics when used as a representation of the semantics of biological networks and discusses applications of MMNs in managing complex biological data.", acknowledgement = ack-nhfb, keywords = "biological model; biological networks; biot; denotational semantics.; graph; hypergraph; mode; Multimodal network", } @Article{Arribas-Gil:2009:SAS, author = "Ana Arribas-Gil and Dirk Metzler and Jean-Louis Plouhinec", title = "Statistical Alignment with a Sequence Evolution Model Allowing Rate Heterogeneity along the Sequence", journal = j-TCBB, volume = "6", number = "2", pages = "281--295", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70246", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "We present a stochastic sequence evolution model to obtain alignments and estimate mutation rates between two homologous sequences. The model allows two possible evolutionary behaviors along a DNA sequence in order to determine conserved regions and take its heterogeneity into account. In our model, the sequence is divided into slow and fast evolution regions. The boundaries between these sections are not known. It is our aim to detect them. The evolution model is based on a fragment insertion and deletion process working on fast regions only and on a substitution process working on fast and slow regions with different rates. This model induces a pair hidden Markov structure at the level of alignments, thus making efficient statistical alignment algorithms possible. We propose two complementary estimation methods, namely, a Gibbs sampler for Bayesian estimation and a stochastic version of the EM algorithm for maximum likelihood estimation. Both algorithms involve the sampling of alignments. We propose a partial alignment sampler, which is computationally less expensive than the typical whole alignment sampler. We show the convergence of the two estimation algorithms when used with this partial sampler. Our algorithms provide consistent estimates for the mutation rates and plausible alignments and sequence segmentations on both simulated and real data.", acknowledgement = ack-nhfb, keywords = "biology and genetics.; Markov processes; mathematics and statistics; probabilistic algorithms; sequence evolution", } @Article{Weber:2009:VET, author = "Gunther H. Weber and Oliver Rubel and Min-Yu Huang and Angela H. DePace and Charless C. Fowlkes and Soile V. E. Keranen and Cris L. Luengo Hendriks and Hans Hagen and David W. Knowles and Jitendra Malik and Mark D. Biggin and Bernd Hamann", title = "Visual Exploration of Three-Dimensional Gene Expression Using Physical Views and Linked Abstract Views", journal = j-TCBB, volume = "6", number = "2", pages = "296--309", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70249", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "During animal development, complex patterns of gene expression provide positional information within the embryo. To better understand the underlying gene regulatory networks, the Berkeley Drosophila Transcription Network Project (BDTNP) has developed methods that support quantitative computational analysis of three-dimensional (3D) gene expression in early Drosophila embryos at cellular resolution. We introduce PointCloudXplore (PCX), an interactive visualization tool that supports visual exploration of relationships between different genes' expression using a combination of established visualization techniques. Two aspects of gene expression are of particular interest: 1) gene expression patterns defined by the spatial locations of cells expressing a gene and 2) relationships between the expression levels of multiple genes. PCX provides users with two corresponding classes of data views: 1) Physical Views based on the spatial relationships of cells in the embryo and 2) Abstract Views that discard spatial information and plot expression levels of multiple genes with respect to each other. Cell Selectors highlight data associated with subsets of embryo cells within a View. Using linking, these selected cells can be viewed in multiple representations. We describe PCX as a 3D gene expression visualization tool and provide examples of how it has been used by BDTNP biologists to generate new hypotheses.", acknowledgement = ack-nhfb, keywords = "brushing; information visualization; Interactive data exploration; multiple linked views; physical views; scatter plots.; spatial expression patterns; three-dimensional gene expression; visualization", } @Article{Dougherty:2009:CBM, author = "Edward R. Dougherty and Marcel Brun and Jeffrey M. Trent and Michael L. Bittner", title = "Conditioning-Based Modeling of Contextual Genomic Regulation", journal = j-TCBB, volume = "6", number = "2", pages = "310--320", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70247", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "A more complete understanding of the alterations in cellular regulatory and control mechanisms that occur in the various forms of cancer has been one of the central targets of the genomic and proteomic methods that allow surveys of the abundance and/or state of cellular macromolecules. This preference is driven both by the intractability of cancer to generic therapies, assumed to be due to the highly varied molecular etiologies observed in cancer, and by the opportunity to discern and dissect the regulatory and control interactions presented by the highly diverse assortment of perturbations of regulation and control that arise in cancer. Exploiting the opportunities for inference on the regulatory and control connections offered by these revealing system perturbations is fraught with the practical problems that arise from the way biological systems operate. Two classes of regulatory action in biological systems are particularly inimical to inference, convergent regulation, where a variety of regulatory actions result in a common set of control responses (crosstalk), and divergent regulation, where a single regulatory action produces entirely different sets of control responses, depending on cellular context (conditioning). We have constructed a coarse mathematical model of the propagation of regulatory influence in such distributed, context-sensitive regulatory networks that allows a quantitative estimation of the amount of crosstalk and conditioning associated with a candidate regulatory gene taken from a set of genes that have been profiled over a series of samples where the candidate's activity varies.", acknowledgement = ack-nhfb, keywords = "Microarray; regulatory networks.", } @Article{Heath:2009:MNS, author = "Lenwood S. Heath and Allan A. Sioson", title = "Multimodal Networks: Structure and Operations", journal = j-TCBB, volume = "6", number = "2", pages = "321--332", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70243", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "A multimodal network (MMN) is a novel graph-theoretic formalism designed to capture the structure of biological networks and to represent relationships derived from multiple biological databases. MMNs generalize the standard notions of graphs and hypergraphs, which are the bases of current diagrammatic representations of biological phenomena and incorporate the concept of mode. Each vertex of an MMN is a biological entity, a biot, while each modal hyperedge is a typed relationship, where the type is given by the mode of the hyperedge. The current paper defines MMNs and concentrates on the structural aspects of MMNs. A companion paper develops MMNs as a representation of the semantics of biological networks and discusses applications of the MMNs in managing complex biological data. The MMN model has been implemented in a database system containing multiple kinds of biological networks.", acknowledgement = ack-nhfb, keywords = "biological networks; biot.; graph; hypergraph; mode; Multimodal network", } @Article{Yukinawa:2009:OAB, author = "Naoto Yukinawa and Shigeyuki Oba and Kikuya Kato and Shin Ishii", title = "Optimal Aggregation of Binary Classifiers for Multiclass Cancer Diagnosis Using Gene Expression Profiles", journal = j-TCBB, volume = "6", number = "2", pages = "333--343", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70239", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Multiclass classification is one of the fundamental tasks in bioinformatics and typically arises in cancer diagnosis studies by gene expression profiling. There have been many studies of aggregating binary classifiers to construct a multiclass classifier based on one-versus-the-rest (1R), one-versus-one (11), or other coding strategies, as well as some comparison studies between them. However, the studies found that the best coding depends on each situation. Therefore, a new problem, which we call the &\#8220;optimal coding problem,'' has arisen: how can we determine which coding is the optimal one in each situation? To approach this optimal coding problem, we propose a novel framework for constructing a multiclass classifier, in which each binary classifier to be aggregated has a weight value to be optimally tuned based on the observed data. Although there is no a priori answer to the optimal coding problem, our weight tuning method can be a consistent answer to the problem. We apply this method to various classification problems including a synthesized data set and some cancer diagnosis data sets from gene expression profiling. The results demonstrate that, in most situations, our method can improve classification accuracy over simple voting heuristics and is better than or comparable to state-of-the-art multiclass predictors.", acknowledgement = ack-nhfb, keywords = "cancer diagnosis.; error correcting output coding; gene expression profiling; Multiclass classification", } @Article{Olman:2009:PCA, author = "Victor Olman and Fenglou Mao and Hongwei Wu and Ying Xu", title = "Parallel Clustering Algorithm for Large Data Sets with Applications in Bioinformatics", journal = j-TCBB, volume = "6", number = "2", pages = "344--352", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70272", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Large sets of bioinformatical data provide a challenge in time consumption while solving the cluster identification problem, and that is why a parallel algorithm is so needed for identifying dense clusters in a noisy background. Our algorithm works on a graph representation of the data set to be analyzed. It identifies clusters through the identification of densely intraconnected subgraphs. We have employed a minimum spanning tree (MST) representation of the graph and solve the cluster identification problem using this representation. The computational bottleneck of our algorithm is the construction of an MST of a graph, for which a parallel algorithm is employed. Our high-level strategy for the parallel MST construction algorithm is to first partition the graph, then construct MSTs for the partitioned subgraphs and auxiliary bipartite graphs based on the subgraphs, and finally merge these MSTs to derive an MST of the original graph. The computational results indicate that when running on 150 CPUs, our algorithm can solve a cluster identification problem on a data set with 1,000,000 data points almost 100 times faster than on single CPU, indicating that this program is capable of handling very large data clustering problems in an efficient manner. We have implemented the clustering algorithm as the software CLUMP.", acknowledgement = ack-nhfb, keywords = "clustering algorithm; genome application; parallel processing.; Pattern recognition", } @Article{Paul:2009:PCC, author = "Topon Kumar Paul and Hitoshi Iba", title = "Prediction of Cancer Class with Majority Voting Genetic Programming Classifier Using Gene Expression Data", journal = j-TCBB, volume = "6", number = "2", pages = "353--367", month = apr, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2007.70245", ISSN = "1545-5963", bibdate = "Mon Jun 1 17:03:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "In order to get a better understanding of different types of cancers and to find the possible biomarkers for diseases, recently, many researchers are analyzing the gene expression data using various machine learning techniques. However, due to a very small number of training samples compared to the huge number of genes and class imbalance, most of these methods suffer from overfitting. In this paper, we present a majority voting genetic programming classifier (MVGPC) for the classification of microarray data. Instead of a single rule or a single set of rules, we evolve multiple rules with genetic programming (GP) and then apply those rules to test samples to determine their labels with majority voting technique. By performing experiments on four different public cancer data sets, including multiclass data sets, we have found that the test accuracies of MVGPC are better than those of other methods, including AdaBoost with GP. Moreover, some of the more frequently occurring genes in the classification rules are known to be associated with the types of cancers being studied in this paper.", acknowledgement = ack-nhfb, keywords = "Classifier design and evaluation; data mining; evolutionary computing and genetic algorithm; feature extraction; gene expression; majority voting.", } @Article{Sagot:2009:EEI, author = "Marie-France Sagot", title = "{EIC} Editorial: Introducing New {Associate Editors}", journal = j-TCBB, volume = "6", number = "3", pages = "369--369", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.66", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bi:2009:MCE, author = "Chengpeng Bi", title = "A {Monte Carlo} {EM} Algorithm for {De Novo Motif} Discovery in Biomolecular Sequences", journal = j-TCBB, volume = "6", number = "3", pages = "370--386", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.103", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Motif discovery methods play pivotal roles in deciphering the genetic regulatory codes (i.e., motifs) in genomes as well as in locating conserved domains in protein sequences. The Expectation Maximization (EM) algorithm is one of the most popular methods used in de novo motif discovery. Based on the position weight matrix (PWM) updating technique, this paper presents a Monte Carlo version of the EM motif-finding algorithm that carries out stochastic sampling in local alignment space to overcome the conventional EM's main drawback of being trapped in a local optimum. The newly implemented algorithm is named as Monte Carlo EM Motif Discovery Algorithm (MCEMDA). MCEMDA starts from an initial model, and then it iteratively performs Monte Carlo simulation and parameter update until convergence. A log-likelihood profiling technique together with the top-$k$ strategy is introduced to cope with the phase shifts and multiple modal issues in motif discovery problem. A novel grouping motif alignment (GMA) algorithm is designed to select motifs by clustering a population of candidate local alignments and successfully applied to subtle motif discovery. MCEMDA compares favorably to other popular PWM-based and word enumerative motif algorithms tested using simulated $(l, d)$-motif cases, documented prokaryotic, and eukaryotic DNA motif sequences. Finally, MCEMDA is applied to detect large blocks of conserved domains using protein benchmarks and exhibits its excellent capacity while compared with other multiple sequence alignment methods.", acknowledgement = ack-nhfb, keywords = "Expectation maximization (EM); Monte Carlo EM; motif discovery; multiple sequence alignment; transcriptional regulation.", } @Article{Stoye:2009:UAR, author = "Jens Stoye and Roland Wittler", title = "A Unified Approach for Reconstructing Ancient Gene Clusters", journal = j-TCBB, volume = "6", number = "3", pages = "387--400", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.135", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "The order of genes in genomes provides extensive information. In comparative genomics, differences or similarities of gene orders are determined to predict functional relations of genes or phylogenetic relations of genomes. For this purpose, various combinatorial models can be used to identify gene clusters&\#8212;groups of genes that are colocated in a set of genomes. We introduce a unified approach to model gene clusters and define the problem of labeling the inner nodes of a given phylogenetic tree with sets of gene clusters. Our optimization criterion in this context combines two properties: parsimony, i.e., the number of gains and losses of gene clusters has to be minimal, and consistency, i.e., for each ancestral node, there must exist at least one potential gene order that contains all the reconstructed clusters. We present and evaluate an exact algorithm to solve this problem. Despite its exponential worst-case time complexity, our method is suitable even for large-scale data. We show the effectiveness and efficiency on both simulated and real data.", acknowledgement = ack-nhfb, keywords = "Comparative genomics; consistency.; gene cluster; gene cluster reconstruction; gene order; parsimony; phylogeny", } @Article{Chen:2009:AAM, author = "Xin Chen and Yun Cui", title = "An Approximation Algorithm for the Minimum Breakpoint Linearization Problem", journal = j-TCBB, volume = "6", number = "3", pages = "401--409", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.3", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "In the recent years, there has been a growing interest in inferring the total order of genes or markers on a chromosome, since current genetic mapping efforts might only suffice to produce a partial order. Many interesting optimization problems were thus formulated in the framework of genome rearrangement. As an important one among them, the minimum breakpoint linearization (MBL) problem is to find the total order of a partially ordered genome that minimizes its breakpoint distance to a reference genome whose genes are already totally ordered. It was previously shown to be NP-hard, and the algorithms proposed so far are all heuristic. In this paper, we present an {m^2+m\over 2}-approximation algorithm for the MBL problem, where m is the number of gene maps that are combined together to form a partial order of the genome under investigation.", acknowledgement = ack-nhfb, keywords = "approximation algorithms.; breakpoint distance; Comparative genomics; partially ordered genomes", } @Article{Wang:2009:EKF, author = "Zidong Wang and Xiaohui Liu and Yurong Liu and Jinling Liang and Veronica Vinciotti", title = "An Extended {Kalman} Filtering Approach to Modeling Nonlinear Dynamic Gene Regulatory Networks via Short Gene Expression Time Series", journal = j-TCBB, volume = "6", number = "3", pages = "410--419", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.5", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "In this paper, the extended Kalman filter (EKF) algorithm is applied to model the gene regulatory network from gene time series data. The gene regulatory network is considered as a nonlinear dynamic stochastic model that consists of the gene measurement equation and the gene regulation equation. After specifying the model structure, we apply the EKF algorithm for identifying both the model parameters and the actual value of gene expression levels. It is shown that the EKF algorithm is an online estimation algorithm that can identify a large number of parameters (including parameters of nonlinear functions) through iterative procedure by using a small number of observations. Four real-world gene expression data sets are employed to demonstrate the effectiveness of the EKF algorithm, and the obtained models are evaluated from the viewpoint of bioinformatics.", acknowledgement = ack-nhfb, keywords = "clustering; DNA microarray technology; extended Kalman filtering; gene expression; Modeling; time series data.", } @Article{Bryant:2009:CDT, author = "David Bryant and Mike Steel", title = "Computing the Distribution of a Tree Metric", journal = j-TCBB, volume = "6", number = "3", pages = "420--426", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.32", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "The Robinson-Foulds (RF) distance is by far the most widely used measure of dissimilarity between trees. Although the distribution of these distances has been investigated for 20 years, an algorithm that is explicitly polynomial time has yet to be described for computing the distribution for trees around a given tree. In this paper, we derive a polynomial-time algorithm for this distribution. We show how the distribution can be approximated by a Poisson distribution determined by the proportion of leaves that lie in &\#8220;cherries'' of the given tree. We also describe how our results can be used to derive normalization constants that are required in a recently proposed maximum likelihood approach to supertree construction.", acknowledgement = ack-nhfb, keywords = "Biology and genetics; discrete mathematics applications; normalization constant.; phylogenetics; Poisson approximation; Robinson-Foulds distance; trees", } @Article{Hulsman:2009:EOK, author = "Marc Hulsman and Marcel J. T. Reinders and Dick de Ridder", title = "Evolutionary Optimization of Kernel Weights Improves Protein Complex Comembership Prediction", journal = j-TCBB, volume = "6", number = "3", pages = "427--437", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.137", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "In recent years, more and more high-throughput data sources useful for protein complex prediction have become available (e.g., gene sequence, mRNA expression, and interactions). The integration of these different data sources can be challenging. Recently, it has been recognized that kernel-based classifiers are well suited for this task. However, the different kernels (data sources) are often combined using equal weights. Although several methods have been developed to optimize kernel weights, no large-scale example of an improvement in classifier performance has been shown yet. In this work, we employ an evolutionary algorithm to determine weights for a larger set of kernels by optimizing a criterion based on the area under the ROC curve. We show that setting the right kernel weights can indeed improve performance. We compare this to the existing kernel weight optimization methods (i.e., (regularized) optimization of the SVM criterion or aligning the kernel with an ideal kernel) and find that these do not result in a significant performance improvement and can even cause a decrease in performance. Results also show that an expert approach of assigning high weights to features with high individual performance is not necessarily the best strategy.", acknowledgement = ack-nhfb, keywords = "biology and genetics; Classifier design and evaluation; evolutionary computing and genetic algorithms.", } @Article{Chen:2009:IAA, author = "Zhi-Zhong Chen and Lusheng Wang", title = "Improved Approximation Algorithms for Reconstructing the History of Tandem Repeats", journal = j-TCBB, volume = "6", number = "3", pages = "438--453", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.122", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Some genetic diseases in human beings are dominated by short sequences repeated consecutively called tandem repeats. Once a region containing tandem repeats is found, it is of great interest to study the history of creating the repeats. The computational problem of reconstructing the duplication history of tandem repeats has been studied extensively in the literature. Almost all previous studies focused on the simplest case where the size of each duplication block is 1. Only recently we succeeded in giving the first polynomial-time approximation algorithm with a guaranteed ratio for a more general case where the size of each duplication block is at most 2; the algorithm achieves a ratio of 6 and runs in O(n^{11}) time. In this paper, we present two new polynomial-time approximation algorithms for this more general case. One of them achieves a ratio of 5 and runs in O(n^9) time, while the other achieves a ratio of 2.5+\epsilon for any constant \epsilon > 0 but runs slower.", acknowledgement = ack-nhfb, keywords = "approximation algorithms.; Computational biology", } @Article{Cardona:2009:MPN, author = "Gabriel Cardona and Merce Llabres and Francesc Rossello and Gabriel Valiente", title = "Metrics for Phylogenetic Networks {II}: Nodal and Triplets Metrics", journal = j-TCBB, volume = "6", number = "3", pages = "454--469", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.127", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "The assessment of phylogenetic network reconstruction methods requires the ability to compare phylogenetic networks. This is the second in a series of papers devoted to the analysis and comparison of metrics for tree-child time consistent phylogenetic networks on the same set of taxa. In this paper, we generalize to phylogenetic networks two metrics that have already been introduced in the literature for phylogenetic trees: the nodal distance and the triplets distance. We prove that they are metrics on any class of tree-child time consistent phylogenetic networks on the same set of taxa, as well as some basic properties for them. To prove these results, we introduce a reduction/expansion procedure that can be used not only to establish properties of tree-child time consistent phylogenetic networks by induction, but also to generate all tree-child time consistent phylogenetic networks with a given number of leaves.", acknowledgement = ack-nhfb, keywords = "nodal distance; partition distance; Phylogenetic network; temporal representation; time consistency; tree-child phylogenetic network; triplets distance.", } @Article{Sotiropoulos:2009:MRM, author = "Vassilios Sotiropoulos and Marrie-Nathalie Contou-Carrere and Prodromos Daoutidis and Yiannis N. Kaznessis", title = "Model Reduction of Multiscale Chemical {Langevin} Equations: {A} Numerical Case Study", journal = j-TCBB, volume = "6", number = "3", pages = "470--482", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.23", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Two very important characteristics of biological reaction networks need to be considered carefully when modeling these systems. First, models must account for the inherent probabilistic nature of systems far from the thermodynamic limit. Often, biological systems cannot be modeled with traditional continuous-deterministic models. Second, models must take into consideration the disparate spectrum of time scales observed in biological phenomena, such as slow transcription events and fast dimerization reactions. In the last decade, significant efforts have been expended on the development of stochastic chemical kinetics models to capture the dynamics of biomolecular systems, and on the development of robust multiscale algorithms, able to handle stiffness. In this paper, the focus is on the dynamics of reaction sets governed by stiff chemical Langevin equations, i.e., stiff stochastic differential equations. These are particularly challenging systems to model, requiring prohibitively small integration step sizes. We describe and illustrate the application of a semianalytical reduction framework for chemical Langevin equations that results in significant gains in computational cost.", acknowledgement = ack-nhfb, keywords = "chemical Langevin equations (CLEs); Model reduction; multiscale models; stiff biomolecular systems.; stochastic chemical kinetics", } @Article{Roytberg:2009:SSP, author = "Mikhail Roytberg and Anna Gambin and Laurent Noe and Slawomir Lasota and Eugenia Furletova and Ewa Szczurek and Gregory Kucherov", title = "On Subset Seeds for Protein Alignment", journal = j-TCBB, volume = "6", number = "3", pages = "483--494", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2009.4", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "We apply the concept of subset seeds proposed in [1] to similarity search in protein sequences. The main question studied is the design of efficient seed alphabets to construct seeds with optimal sensitivity/selectivity trade-offs. We propose several different design methods and use them to construct several alphabets. We then perform a comparative analysis of seeds built over those alphabets and compare them with the standard Blastp seeding method [2], [3], as well as with the family of vector seeds proposed in [4]. While the formalism of subset seeds is less expressive (but less costly to implement) than the cumulative principle used in Blastp and vector seeds, our seeds show a similar or even better performance than Blastp on Bernoulli models of proteins compatible with the common BLOSUM62 matrix. Finally, we perform a large-scale benchmarking of our seeds against several main databases of protein alignments. Here again, the results show a comparable or better performance of our seeds versus Blastp.", acknowledgement = ack-nhfb, keywords = "local alignment; multiple seeds; protein databases; Protein sequences; seed alphabet; seeds; selectivity.; sensitivity; similarity search; subset seeds", } @Article{Jin:2009:PSP, author = "Guohua Jin and Luay Nakhleh and Sagi Snir and Tamir Tuller", title = "Parsimony Score of Phylogenetic Networks: Hardness Results and a Linear-Time Heuristic", journal = j-TCBB, volume = "6", number = "3", pages = "495--505", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.119", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Phylogenies&\#8212;the evolutionary histories of groups of organisms&\#8212;play a major role in representing the interrelationships among biological entities. Many methods for reconstructing and studying such phylogenies have been proposed, almost all of which assume that the underlying history of a given set of species can be represented by a binary tree. Although many biological processes can be effectively modeled and summarized in this fashion, others cannot: recombination, hybrid speciation, and horizontal gene transfer result in networks of relationships rather than trees of relationships. In previous works, we formulated a maximum parsimony (MP) criterion for reconstructing and evaluating phylogenetic networks, and demonstrated its quality on biological as well as synthetic data sets. In this paper, we provide further theoretical results as well as a very fast heuristic algorithm for the MP criterion of phylogenetic networks. In particular, we provide a novel combinatorial definition of phylogenetic networks in terms of &\#8220;forbidden cycles,'' and provide detailed hardness and hardness of approximation proofs for the 'small'' MP problem. We demonstrate the performance of our heuristic in terms of time and accuracy on both biological and synthetic data sets. Finally, we explain the difference between our model and a similar one formulated by Nguyen et al., and describe the implications of this difference on the hardness and approximation results.", acknowledgement = ack-nhfb, keywords = "hardness and approximation.; horizontal gene transfer; Maximum parsimony; phylogenetic networks", } @Article{Thomas:2009:PDS, author = "John Thomas and Naren Ramakrishnan and Chris Bailey-Kellogg", title = "Protein Design by Sampling an Undirected Graphical Model of Residue Constraints", journal = j-TCBB, volume = "6", number = "3", pages = "506--516", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.124", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "This paper develops an approach for designing protein variants by sampling sequences that satisfy residue constraints encoded in an undirected probabilistic graphical model. Due to evolutionary pressures on proteins to maintain structure and function, the sequence record of a protein family contains valuable information regarding position-specific residue conservation and coupling (or covariation) constraints. Representing these constraints with a graphical model provides two key benefits for protein design: a probabilistic semantics enabling evaluation of possible sequences for consistency with the constraints, and an explicit factorization of residue dependence and independence supporting efficient exploration of the constrained sequence space. We leverage these benefits in developing two complementary MCMC algorithms for protein design: constrained shuffling mixes wild-type sequences positionwise and evaluates graphical model likelihood, while component sampling directly generates sequences by sampling clique values and propagating to other cliques. We apply our methods to design WW domains. We demonstrate that likelihood under a model of wild-type WWs is highly predictive of foldedness of new WWs. We then show both theoretical and rapid empirical convergence of our algorithms in generating high-likelihood, diverse new sequences. We further show that these sequences capture the original sequence constraints, yielding a model as predictive of foldedness as the original one.", acknowledgement = ack-nhfb, keywords = "graphical models; Markov chain Monte Carlo (MCMC).; Protein design; residue coupling", } @Article{Smith:2009:RSD, author = "Jennifer A. Smith", title = "{RNA} Search with Decision Trees and Partial Covariance Models", journal = j-TCBB, volume = "6", number = "3", pages = "517--527", month = jul, year = "2009", CODEN = "ITCBCY", DOI = "http://dx.doi.org/10.1109/TCBB.2008.120", ISSN = "1545-5963", bibdate = "Tue Aug 11 18:13:22 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "The use of partial covariance models to search for RNA family members in genomic sequence databases is explored. The partial models are formed from contiguous subranges of the overall RNA family multiple alignment columns. A binary decision-tree framework is presented for choosing the order to apply the partial models and the score thresholds on which to make the decisions. The decision trees are chosen to minimize computation time subject to the constraint that all of the training sequences are passed to the full covariance model for final evaluation. Computational intelligence methods are suggested to select the decision tree since the tree can be quite complex and there is no obvious method to build the tree in these cases. Experimental results from seven RNA families shows execution times of 0.066-0.268 relative to using the full covariance model alone. Tests on the full sets of known sequences for each family show that at least 95 percent of these sequences are found for two families and 100 percent for five others. Since the full covariance model is run on all sequences accepted by the partial model decision tree, the false alarm rate is at least as low as that of the full model alone.", acknowledgement = ack-nhfb, keywords = "Bioinformatics; computational intelligence; covariance models; decision trees; RNA database search.", }