%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.63",
%%%     date            = "21 November 2024",
%%%     time            = "05:58:16 MST",
%%%     filename        = "tkdd.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "22802 44140 226892 2136722",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Knowledge Discovery from
%%%                        Data (TKDD); bibliography; TKDD",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Knowledge Discovery from
%%%                        Data (TKDD) (CODEN ????, ISSN 1556-4681),
%%%                        covering all journal issues from 2007 --
%%%                        date.
%%%
%%%                        At version 1.63, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2007 (  14)    2013 (  20)    2019 (  65)
%%%                             2008 (  18)    2014 (  37)    2020 (  78)
%%%                             2009 (  25)    2015 (  41)    2021 ( 112)
%%%                             2010 (  26)    2016 (  54)    2022 ( 126)
%%%                             2011 (  11)    2017 (  27)    2023 ( 137)
%%%                             2012 (  26)    2018 (  73)    2024 ( 234)
%%%
%%%                             Article:       1124
%%%
%%%                             Total entries: 1124
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://www.acm.org/pubs/tkdd.html
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://www.acm.org/tkdd/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J1054
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================
@Preamble{"\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}" #
    "\ifx \undefined \bioname      \def \bioname#1{{{\em #1\/}}} \fi"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-TKDD                  = "ACM Transactions on Knowledge
                                  Discovery from Data (TKDD)"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Han:2007:I,
  author =       "Jiawei Han",
  title =        "Introduction",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1217299.1217300",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Leskovec:2007:GED,
  author =       "Jure Leskovec and Jon Kleinberg and Christos
                 Faloutsos",
  title =        "Graph evolution: {Densification} and shrinking
                 diameters",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1217299.1217301",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do real graphs evolve over time? What are normal
                 growth patterns in social, technological, and
                 information networks? Many studies have discovered
                 patterns in {\em static graphs}, identifying properties
                 in a single snapshot of a large network or in a very
                 small number of snapshots; these include heavy tails
                 for in- and out-degree distributions, communities,
                 small-world phenomena, and others. However, given the
                 lack of information about network evolution over long
                 periods, it has been hard to convert these findings
                 into statements about trends over time.\par

                 Here we study a wide range of real graphs, and we
                 observe some surprising phenomena. First, most of these
                 graphs densify over time with the number of edges
                 growing superlinearly in the number of nodes. Second,
                 the average distance between nodes often shrinks over
                 time in contrast to the conventional wisdom that such
                 distance parameters should increase slowly as a
                 function of the number of nodes (like $ O(\log n) $ or
                 $ O(\log (\log n))$).\par

                 Existing graph generation models do not exhibit these
                 types of behavior even at a qualitative level. We
                 provide a new graph generator, based on a forest fire
                 spreading process that has a simple, intuitive
                 justification, requires very few parameters (like the
                 flammability of nodes), and produces graphs exhibiting
                 the full range of properties observed both in prior
                 work and in the present study.\par

                 We also notice that the forest fire model exhibits a
                 sharp transition between sparse graphs and graphs that
                 are densifying. Graphs with decreasing distance between
                 the nodes are generated around this transition
                 point.\par

                 Last, we analyze the connection between the temporal
                 evolution of the degree distribution and densification
                 of a graph. We find that the two are fundamentally
                 related. We also observe that real networks exhibit
                 this type of relation between densification and the
                 degree distribution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Densification power laws; graph generators; graph
                 mining; heavy-tailed distributions; small-world
                 phenomena",
}
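
%%% The forest-fire generator summarized in the abstract above admits a
%%% compact illustration.  The following is a minimal Python sketch of
%%% our own, not the authors' code: it implements forward burning only,
%%% with a single flammability parameter p, and omits the paper's
%%% backward-burning ratio; all names are illustrative.
%%%
%%%     import random
%%%
%%%     def forest_fire_graph(n, p):
%%%         """Grow a directed graph on n nodes; each new node picks a
%%%         uniform ambassador and burns outward with probability p."""
%%%         out_links = {0: set()}
%%%         for v in range(1, n):
%%%             visited = set()
%%%             frontier = [random.randrange(v)]  # uniform ambassador
%%%             while frontier:
%%%                 w = frontier.pop()
%%%                 if w in visited:
%%%                     continue
%%%                 visited.add(w)
%%%                 x = 0           # geometric count, mean p / (1 - p)
%%%                 while random.random() < p:
%%%                     x += 1
%%%                 unburned = [u for u in out_links[w]
%%%                             if u not in visited]
%%%                 random.shuffle(unburned)
%%%                 frontier.extend(unburned[:x])
%%%             out_links[v] = visited  # v links to every burned node
%%%         return out_links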

@Article{Machanavajjhala:2007:DPB,
  author =       "Ashwin Machanavajjhala and Daniel Kifer and Johannes
                 Gehrke and Muthuramakrishnan Venkitasubramaniam",
  title =        "{$ \ell $}-diversity: {Privacy} beyond $k$-anonymity",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1217299.1217302",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Publishing data about individuals without revealing
                 sensitive information about them is an important
                 problem. In recent years, a new definition of privacy
                 called $k$-anonymity has gained popularity. In a
                 $k$-anonymized dataset, each record is
                 indistinguishable from at least $ k - 1$ other records
                 with respect to certain identifying attributes.\par

                 In this article, we show using two simple attacks that
                 a $k$-anonymized dataset has some subtle but severe
                 privacy problems. First, an attacker can discover the
                 values of sensitive attributes when there is little
                 diversity in those sensitive attributes. This is a
                 known problem. Second, attackers often have background
                 knowledge, and we show that $k$-anonymity does not
                 guarantee privacy against attackers using background
                 knowledge. We give a detailed analysis of these two
                 attacks, and we propose a novel and powerful privacy
                 criterion called $ \ell $-diversity that can defend
                 against such attacks. In addition to building a formal
                 foundation for $ \ell $-diversity, we show in an
                 experimental evaluation that $ \ell $-diversity is
                 practical and can be implemented efficiently.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "$ \ell $-diversity; Data privacy; $k$-anonymity;
                 privacy-preserving data publishing",
}
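
%%% The two privacy definitions contrasted in this abstract are easy to
%%% state operationally.  Below is a minimal Python sketch of our own,
%%% using the simplest "distinct" form of l-diversity rather than the
%%% article's entropy-based variants; all names are illustrative.
%%%
%%%     from collections import defaultdict
%%%
%%%     def anonymity_and_diversity(records, quasi_ids, sensitive):
%%%         """Return (k, l): every equivalence class over the
%%%         quasi-identifiers has at least k records and at least l
%%%         distinct sensitive values."""
%%%         groups = defaultdict(list)
%%%         for rec in records:
%%%             key = tuple(rec[a] for a in quasi_ids)
%%%             groups[key].append(rec[sensitive])
%%%         k = min(len(vals) for vals in groups.values())
%%%         l = min(len(set(vals)) for vals in groups.values())
%%%         return k, l
%%%
%%% A table is k-anonymous when the first returned value is at least k;
%%% the homogeneity attack described in the abstract succeeds exactly
%%% when some class has a single sensitive value, that is, when l = 1.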

@Article{Gionis:2007:CA,
  author =       "Aristides Gionis and Heikki Mannila and Panayiotis
                 Tsaparas",
  title =        "Clustering aggregation",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1217299.1217303",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the following problem: given a set of
                 clusterings, find a single clustering that agrees as
                 much as possible with the input clusterings. This
                 problem, {\em clustering aggregation}, appears
                 naturally in various contexts. For example, clustering
                 categorical data is an instance of the clustering
                 aggregation problem; each categorical attribute can be
                 viewed as a clustering of the input rows where rows are
                 grouped together if they take the same value on that
                 attribute. Clustering aggregation can also be used as a
                 metaclustering method to improve the robustness of
                 clustering by combining the output of multiple
                 algorithms. Furthermore, the problem formulation does
                 not require a priori information about the number of
                 clusters; it is naturally determined by the
                 optimization function.\par

                 In this article, we give a formal statement of the
                 clustering aggregation problem, and we propose a number
                 of algorithms. Our algorithms make use of the
                 connection between clustering aggregation and the
                 problem of {\em correlation clustering}. Although the
                 problems we consider are NP-hard, for several of our
                 methods, we provide theoretical guarantees on the
                 quality of the solutions. Our work provides the best
                 deterministic approximation algorithm for the variation
                 of the correlation clustering problem we consider. We
                 also show how sampling can be used to scale the
                 algorithms for large datasets. We give an extensive
                 empirical evaluation demonstrating the usefulness of
                 the problem and of the solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "clustering aggregation; clustering categorical data;
                 correlation clustering; Data clustering",
}
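
%%% The objective in this abstract is concrete enough to spell out: the
%%% aggregate clustering minimizes the total number of pairwise
%%% disagreements with the input clusterings.  A minimal Python sketch
%%% of that distance (our own; names are illustrative):
%%%
%%%     from itertools import combinations
%%%
%%%     def disagreement(c1, c2):
%%%         """Number of object pairs placed together in one clustering
%%%         but apart in the other (c1, c2 map object -> label)."""
%%%         return sum((c1[x] == c1[y]) != (c2[x] == c2[y])
%%%                    for x, y in combinations(c1, 2))
%%%
%%% Clustering aggregation then seeks the clustering C minimizing
%%% sum(disagreement(C, Ci)) over the inputs Ci, which is the link to
%%% correlation clustering that the authors exploit.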

@Article{Bhattacharya:2007:CER,
  author =       "Indrajit Bhattacharya and Lise Getoor",
  title =        "Collective entity resolution in relational data",
  journal =      j-TKDD,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1217299.1217304",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:36 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Many databases contain uncertain and imprecise
                 references to real-world entities. The absence of
                 identifiers for the underlying entities often results
                 in a database which contains multiple references to the
                 same entity. This can lead not only to data redundancy,
                 but also inaccuracies in query processing and knowledge
                 extraction. These problems can be alleviated through
                 the use of {\em entity resolution}. Entity resolution
                 involves discovering the underlying entities and
                 mapping each database reference to these entities.
                 Traditionally, entities are resolved using pairwise
                 similarity over the attributes of references. However,
                 there is often additional relational information in the
                 data. Specifically, references to different entities
                 may cooccur. In these cases, collective entity
                 resolution, in which entities for cooccurring
                 references are determined jointly rather than
                 independently, can improve entity resolution accuracy.
                 We propose a novel relational clustering algorithm that
                 uses both attribute and relational information for
                 determining the underlying domain entities, and we give
                 an efficient implementation. We investigate the impact
                 that different relational similarity measures have on
                 entity resolution quality. We evaluate our collective
                 entity resolution algorithm on multiple real-world
                 databases. We show that it improves entity resolution
                 performance over both attribute-based baselines and
                 over algorithms that consider relational information
                 but do not resolve entities collectively. In addition,
                 we perform detailed experiments on synthetically
                 generated data to identify data characteristics that
                 favor collective relational resolution over purely
                 attribute-based algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "data cleaning; Entity resolution; graph clustering;
                 record linkage",
}

@Article{Loh:2007:EEL,
  author =       "Wei-Yin Loh and Chien-Wei Chen and Wei Zheng",
  title =        "Extrapolation errors in linear model trees",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1267066.1267067",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Prediction errors from a linear model tend to be
                 larger when extrapolation is involved, particularly
                 when the model is wrong. This article considers the
                 problem of extrapolation and interpolation errors when
                 a linear model tree is used for prediction. It proposes
                 several ways to curtail the size of the errors, and
                 uses a large collection of real datasets to demonstrate
                 that the solutions are effective in reducing the
                 average mean squared prediction error. The article also
                 provides a proof that, if a linear model is correct,
                 the proposed solutions have no undesirable effects as
                 the training sample size tends to infinity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Decision tree; prediction; regression; statistics",
}

@Article{Zhang:2007:MPP,
  author =       "Minghua Zhang and Ben Kao and David W. Cheung and
                 Kevin Y. Yip",
  title =        "Mining periodic patterns with gap requirement from
                 sequences",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1267066.1267068",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study a problem of mining frequently occurring
                 periodic patterns with a gap requirement from
                 sequences. Given a character sequence $S$ of length $L$
                 and a pattern $P$ of length $l$, we consider $P$ a
                 frequently occurring pattern in $S$ if the probability
                 of {\em observing\/} $P$ given a randomly picked
                 length-$l$ subsequence of $S$ exceeds a certain
                 threshold. In many applications, particularly those
                 related to bioinformatics, interesting patterns are
                 {\em periodic\/} with a {\em gap requirement}. That is
                 to say, the characters in $P$ should match subsequences
                 of $S$ in such a way that the matching characters in
                 $S$ are separated by gaps of more or less the same
                 size. We show the complexity of the mining problem and
                 discuss why traditional mining algorithms are
                 computationally infeasible. We propose practical
                 algorithms for solving the problem and study their
                 characteristics. We also present a case study in which
                 we apply our algorithms on some DNA sequences. We
                 discuss some interesting patterns obtained from the
                 case study.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "gap requirement; periodic pattern; Sequence mining",
}
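
%%% The gap requirement described above can be made precise with a
%%% short recursive matcher.  This is a minimal Python sketch of our
%%% own, not the article's mining algorithm (which counts occurrences
%%% over randomly picked length-l windows rather than testing a single
%%% match); all names are illustrative.
%%%
%%%     def matches_with_gaps(S, P, min_gap, max_gap):
%%%         """True if pattern P occurs in sequence S with each pair of
%%%         adjacent matched characters separated by a gap of
%%%         min_gap..max_gap positions."""
%%%         def extend(pos, i):
%%%             if i == len(P):
%%%                 return True
%%%             lo = pos + 1 + min_gap
%%%             hi = min(pos + 1 + max_gap, len(S) - 1)
%%%             return any(S[j] == P[i] and extend(j, i + 1)
%%%                        for j in range(lo, hi + 1))
%%%         return any(S[j] == P[0] and extend(j, 1)
%%%                    for j in range(len(S)))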

@Article{Huang:2007:TTE,
  author =       "Jen-Wei Huang and Bi-Ru Dai and Ming-Syan Chen",
  title =        "{Twain}: {Two-end} association miner with precise
                 frequent exhibition periods",
  journal =      j-TKDD,
  volume =       "1",
  number =       "2",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1267066.1267069",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:48 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We investigate the general model of mining
                 associations in a temporal database, where the
                 exhibition periods of items are allowed to be different
                 from one to another. The database is divided into
                 partitions according to the time granularity imposed.
                 Such temporal association rules allow us to observe
                 short-term but interesting patterns that are absent
                 when the whole range of the database is evaluated
                 altogether. Prior work may omit some temporal
                 association rules and thus have limited practicability.
                 To remedy this and to give more precise frequent
                 exhibition periods of frequent temporal itemsets, we
                 devise an efficient algorithm {\em Twain\/} (standing
                 for {\em TWo end AssocIation miNer\/}). {\em Twain\/}
                 not only generates frequent patterns with more precise
                 frequent exhibition periods, but also discovers more
                 interesting frequent patterns. {\em Twain\/} employs
                 Start time and End time of each item to provide precise
                 frequent exhibition period while progressively handling
                 itemsets from one partition to another. Along with one
                 scan of the database, {\em Twain\/} can generate
                 frequent 2-itemsets directly according to the
                 cumulative filtering threshold. Then, {\em Twain\/}
                 adopts the scan reduction technique to generate all
                 frequent $k$-itemsets ($k > 2$) from the generated
                 frequent 2-itemsets. Theoretical properties of {\em
                 Twain\/} are derived as well in this article. The
                 experimental results show that {\em Twain\/}
                 outperforms the prior works in the quality of frequent
                 patterns, execution time, I/O cost, CPU overhead and
                 scalability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Association; temporal",
}

@Article{Bayardop:2007:ISI,
  author =       "Roberto Bayardo and Kristin P. Bennett and Gautam Das
                 and Dimitrios Gunopulos and Johannes Gehrke",
  title =        "Introduction to special issue {ACM SIGKDD 2006}",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "9:1--9:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1297332.1297333",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Bohm:2007:RPF,
  author =       "Christian B{\"o}hm and Christos Faloutsos and Jia-Yu
                 Pan and Claudia Plant",
  title =        "{RIC}: {Parameter-free} noise-robust clustering",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "10:1--10:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1297332.1297334",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do we find a {\em natural\/} clustering of a
                 real-world point set which contains an unknown number
                 of clusters with different shapes, and which may be
                 contaminated by noise? As most clustering algorithms
                 were designed with certain assumptions (Gaussianity),
                 they often require the user to give input parameters,
                 and are sensitive to noise. In this article, we propose
                 a robust framework for determining a natural clustering
                 of a given dataset, based on the minimum description
                 length (MDL) principle. The proposed framework, {\em
                 robust information-theoretic clustering (RIC)}, is
                 orthogonal to any known clustering algorithm: Given a
                 preliminary clustering, RIC purifies these clusters
                 from noise, and adjusts the clustering such that it
                 simultaneously determines the most natural amount and
                 shape (subspace) of the clusters. Our RIC method can be
                 combined with any clustering technique ranging from
                 K-means and K-medoids to advanced methods such as
                 spectral clustering. In fact, RIC is even able to
                 purify and improve an initial coarse clustering, even
                 if we start with very simple methods. In an extension,
                 we propose a fully automatic stand-alone clustering
                 method and efficiency improvements. RIC scales well
                 with the dataset size. Extensive experiments on
                 synthetic and real-world datasets validate the proposed
                 RIC framework.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Clustering; data summarization; noise robustness;
                 parameter-free data mining",
}

@Article{Mei:2007:SAF,
  author =       "Qiaozhu Mei and Dong Xin and Hong Cheng and Jiawei Han
                 and Chengxiang Zhai",
  title =        "Semantic annotation of frequent patterns",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "11:1--11:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1297332.1297335",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Using frequent patterns to analyze data has been one
                 of the fundamental approaches in many data mining
                 applications. Research in frequent pattern mining has
                 so far mostly focused on developing efficient
                 algorithms to discover various kinds of frequent
                 patterns, but little attention has been paid to the
                 important next step --- interpreting the discovered
                 frequent patterns. Although the compression and
                 summarization of frequent patterns has been studied in
                 some recent work, the proposed techniques there can
                 only annotate a frequent pattern with nonsemantical
                 information (e.g., support), which provides only
                 limited help for a user to understand the
                 patterns.\par

                 In this article, we study the novel problem of
                 generating semantic annotations for frequent patterns.
                 The goal is to discover the hidden meanings of a
                 frequent pattern by annotating it with in-depth,
                 concise, and structured information. We propose a
                 general approach to generate such an annotation for a
                 frequent pattern by constructing its context model,
                 selecting informative context indicators, and
                 extracting representative transactions and semantically
                 similar patterns. This general approach can well
                 incorporate the user's prior knowledge, and has
                 potentially many applications, such as generating a
                 dictionary-like description for a pattern, finding
                 synonym patterns, discovering semantic relations, and
                 summarizing semantic classes of a set of frequent
                 patterns. Experiments on different datasets show that
                 our approach is effective in generating semantic
                 pattern annotations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Frequent pattern; pattern annotation; pattern context;
                 pattern semantic analysis",
}

@Article{Koren:2007:MEP,
  author =       "Yehuda Koren and Stephen C. North and Chris Volinsky",
  title =        "Measuring and extracting proximity graphs in
                 networks",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1297332.1297336",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Measuring distance or some other form of proximity
                 between objects is a standard data mining tool.
                 Connection subgraphs were recently proposed as a way to
                 demonstrate proximity between nodes in networks. We
                 propose a new way of measuring and extracting proximity
                 in networks called ``cycle-free effective conductance''
                 (CFEC). Importantly, the measured proximity is
                 accompanied with a {\em proximity subgraph\/} which
                 allows assessing and understanding measured values. Our
                 proximity calculation can handle more than two
                 endpoints and directed edges, is statistically well
                 behaved, and produces an effectiveness score for the
                 computed subgraphs. We provide an efficient algorithm
                 to measure and extract proximity. Also, we report
                 experimental results and show examples for four large
                 network datasets: a telecommunications calling graph,
                 the IMDB actors graph, an academic coauthorship
                 network, and a movie recommendation system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Connection subgraph; cycle-free escape probability;
                 escape probability; graph mining; proximity; proximity
                 subgraph; random walk",
}
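
%%% Of the quantities named in the keywords, escape probability is the
%%% simplest to illustrate.  A minimal Monte Carlo sketch in Python
%%% (our own; the article computes cycle-free effective conductance
%%% analytically rather than by simulation):
%%%
%%%     import random
%%%
%%%     def escape_probability(adj, s, t, walks=10000):
%%%         """Estimate the probability that a random walk from s
%%%         reaches t before returning to s.  adj maps each node to a
%%%         nonempty neighbor list; t is assumed reachable from s."""
%%%         hits = 0
%%%         for _ in range(walks):
%%%             node = random.choice(adj[s])  # first step away from s
%%%             while node not in (s, t):
%%%                 node = random.choice(adj[node])
%%%             hits += node == t
%%%         return hits / walks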

@Article{Ihler:2007:LDE,
  author =       "Alexander Ihler and Jon Hutchins and Padhraic Smyth",
  title =        "Learning to detect events with {Markov}-modulated
                 {Poisson} processes",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1297332.1297337",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Time-series of count data occur in many different
                 contexts, including Internet navigation logs, freeway
                 traffic monitoring, and security logs associated with
                 buildings. In this article we describe a framework for
                 detecting anomalous events in such data using an
                 unsupervised learning approach. Normal periodic
                 behavior is modeled via a time-varying Poisson process
                 model, which in turn is modulated by a hidden Markov
                 process that accounts for bursty events. We outline a
                 Bayesian framework for learning the parameters of this
                 model from count time-series. Two large real-world
                 datasets of time-series counts are used as testbeds to
                 validate the approach, consisting of freeway traffic
                 data and logs of people entering and exiting a
                 building. We show that the proposed model is
                 significantly more accurate at detecting known events
                 than a more traditional threshold-based technique. We
                 also describe how the model can be used to investigate
                 different degrees of periodicity in the data, including
                 systematic day-of-week and time-of-day effects, and to
                 make inferences about different aspects of events such
                 as number of vehicles or people involved. The results
                 indicate that the Markov-modulated Poisson framework
                 provides a robust and accurate framework for adaptively
                 and autonomously learning how to separate unusual
                 bursty events from traces of normal human activity.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Event detection; Markov modulated; Poisson",
}
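
%%% The generative model in this abstract is simple to simulate.  A
%%% minimal Python sketch of our own: the rates and switching
%%% probabilities below are invented, and the article's day-of-week and
%%% time-of-day rate structure and Bayesian parameter learning are
%%% omitted.
%%%
%%%     import math
%%%     import random
%%%
%%%     def sample_poisson(lam):
%%%         """Knuth's method; adequate for the small rates used here."""
%%%         L, k, p = math.exp(-lam), 0, 1.0
%%%         while True:
%%%             p *= random.random()
%%%             if p <= L:
%%%                 return k
%%%             k += 1
%%%
%%%     def simulate_mmpp(steps, base_rate=10.0, event_rate=30.0,
%%%                       p_start=0.05, p_stop=0.5):
%%%         """Counts per time bin from a two-state hidden Markov chain
%%%         that modulates the Poisson rate."""
%%%         counts, state = [], 0   # state 1 = bursty event in progress
%%%         for _ in range(steps):
%%%             if state == 0 and random.random() < p_start:
%%%                 state = 1
%%%             elif state == 1 and random.random() < p_stop:
%%%                 state = 0
%%%             counts.append(sample_poisson(event_rate if state
%%%                                          else base_rate))
%%%         return counts
%%%
%%% Detection runs the inference direction: given observed counts,
%%% infer the hidden state sequence that best explains them.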

@Article{Gionis:2007:ADM,
  author =       "Aristides Gionis and Heikki Mannila and Taneli
                 Mielik{\"a}inen and Panayiotis Tsaparas",
  title =        "Assessing data mining results via swap randomization",
  journal =      j-TKDD,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1297332.1297338",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:58:56 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The problem of assessing the significance of data
                 mining results on high-dimensional 0--1 datasets has
                 been studied extensively in the literature. For
                 problems such as mining frequent sets and finding
                 correlations, significance testing can be done by
                 standard statistical tests such as chi-square, or other
                 methods. However, the results of such tests depend only
                 on the specific attributes and not on the dataset as a
                 whole. Moreover, the tests are difficult to apply to
                 sets of patterns or other complex results of data
                 mining algorithms. In this article, we consider a
                 simple randomization technique that deals with this
                 shortcoming. The approach consists of producing random
                 datasets that have the same row and column margins as
                 the given dataset, computing the results of interest on
                 the randomized instances and comparing them to the
                 results on the actual data. This randomization
                 technique can be used to assess the results of many
                 different types of data mining algorithms, such as
                 frequent sets, clustering, and spectral analysis. To
                 generate random datasets with given margins, we use
                 variations of a Markov chain approach which is based on
                 a simple swap operation. We give theoretical results on
                 the efficiency of different randomization methods, and
                 apply the swap randomization method to several
                 well-known datasets. Our results indicate that for some
                 datasets the structure discovered by the data mining
                 algorithms is expected, given the row and column
                 margins of the datasets, while for other datasets the
                 discovered structure conveys information that is not
                 captured by the margin counts.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "0--1 data; randomization tests; Significance testing;
                 swaps",
}
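
%%% The swap operation at the heart of this randomization is worth
%%% spelling out.  A minimal Python sketch of our own, not the authors'
%%% code: a legal swap exchanges the ones of a 2 x 2 submatrix of a 0-1
%%% matrix, which leaves every row and column sum unchanged.
%%%
%%%     import random
%%%
%%%     def swap_randomize(matrix, num_swaps):
%%%         """Attempt num_swaps margin-preserving swaps on a 0-1
%%%         matrix given as a list of lists; returns a new matrix."""
%%%         m = [row[:] for row in matrix]
%%%         n_rows, n_cols = len(m), len(m[0])
%%%         for _ in range(num_swaps):
%%%             r1, r2 = random.sample(range(n_rows), 2)
%%%             c1, c2 = random.sample(range(n_cols), 2)
%%%             a, b = m[r1][c1], m[r1][c2]
%%%             c, d = m[r2][c1], m[r2][c2]
%%%             # Swappable iff the submatrix is [[1,0],[0,1]] or
%%%             # [[0,1],[1,0]]; flipping it preserves all margins.
%%%             if a == d and b == c and a != b:
%%%                 m[r1][c1], m[r1][c2] = b, a
%%%                 m[r2][c1], m[r2][c2] = d, c
%%%         return m
%%%
%%% An empirical p-value for any mining result follows by comparing its
%%% value on the real matrix against its distribution over many such
%%% randomized copies.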

@Article{Tang:2008:TTA,
  author =       "Lei Tang and Huan Liu and Jianping Zhang and Nitin
                 Agarwal and John J. Salerno",
  title =        "Topic taxonomy adaptation for group profiling",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324172.1324173",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A topic taxonomy is an effective representation that
                 describes salient features of virtual groups or online
                 communities. A topic taxonomy consists of topic nodes.
                 Each internal node is defined by its vertical path
                 (i.e., ancestor and child nodes) and its horizontal
                 list of attributes (or terms). In a text-dominant
                 environment, a topic taxonomy can be used to flexibly
                 describe a group's interests with varying granularity.
                 However, the stagnant nature of a taxonomy may fail to
                 timely capture the dynamic change of a group's
                 interest. This article addresses the problem of how to
                 adapt a topic taxonomy to the accumulated data that
                 reflects the change of a group's interest to achieve
                 dynamic group profiling. We first discuss the issues
                 related to topic taxonomy. We next formulate taxonomy
                 adaptation as an optimization problem to find the
                 taxonomy that best fits the data. We then present a
                 viable algorithm that can efficiently accomplish
                 taxonomy adaptation. We conduct extensive experiments
                 to evaluate our approach's efficacy for group
                 profiling, compare the approach with some alternatives,
                 and study its performance for dynamic group profiling.
                 While pointing out various applications of taxonomy
                 adaptation, we suggest some future work that can take
                 advantage of burgeoning Web 2.0 services for online
                 targeted marketing, counterterrorism in connecting
                 dots, and community tracking.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "dynamic profiling; group interest; taxonomy
                 adjustment; text hierarchical classification; Topic
                 taxonomy",
}

@Article{Cormode:2008:FHH,
  author =       "Graham Cormode and Flip Korn and S. Muthukrishnan and
                 Divesh Srivastava",
  title =        "Finding hierarchical heavy hitters in streaming data",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324172.1324174",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data items that arrive online as streams typically
                 have attributes which take values from one or more
                 hierarchies (time and geographic location, source and
                 destination IP addresses, etc.). Providing an aggregate
                 view of such data is important for summarization,
                 visualization, and analysis. We develop an aggregate
                 view based on certain organized sets of large-valued
                 regions (``heavy hitters'') corresponding to
                 hierarchically discounted frequency counts. We formally
                 define the notion of {\em hierarchical heavy hitters\/}
                 (HHHs). We first consider computing (approximate) HHHs
                 over a data stream drawn from a single hierarchical
                 attribute. We formalize the problem and give
                 deterministic algorithms to find them in a single pass
                 over the input.\par

                 In order to analyze a wider range of realistic data
                 streams (e.g., from IP traffic-monitoring
                 applications), we generalize this problem to multiple
                 dimensions. Here, the semantics of HHHs are more
                 complex, since a ``child'' node can have multiple
                 ``parent'' nodes. We present online algorithms that
                 find approximate HHHs in one pass, with provable
                 accuracy guarantees. The product of hierarchical
                 dimensions forms a mathematical lattice structure. Our
                 algorithms exploit this structure, and so are able to
                 track approximate HHHs using only a small, fixed number
                 of statistics per stored item, regardless of the number
                 of dimensions.\par

                 We show experimentally, using real data, that our
                 proposed algorithms yield outputs which are very
                 similar (virtually identical, in many cases) to offline
                 computations of the exact solutions, whereas
                 straightforward heavy-hitters-based approaches give
                 significantly inferior answer quality. Furthermore, the
                 proposed algorithms result in an order of magnitude
                 savings in data structure size while performing
                 competitively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "approximation algorithms; Data mining; network data
                 analysis",
}
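
%%% The discounted counting that defines HHHs can be computed exactly
%%% offline in a few lines; the article's contribution is approximating
%%% it in a single streaming pass.  A minimal Python sketch of our own
%%% (names are illustrative):
%%%
%%%     from collections import Counter
%%%
%%%     def hierarchical_heavy_hitters(items, threshold):
%%%         """items: equal-length tuples (e.g., IP address octets).
%%%         Returns {prefix: discounted count} for every HHH, where a
%%%         prefix's count excludes descendants already reported."""
%%%         hhh = {}
%%%         remaining = Counter(items)
%%%         for level in range(len(items[0]), -1, -1):
%%%             discounted = Counter()
%%%             for path, cnt in remaining.items():
%%%                 discounted[path[:level]] += cnt
%%%             hhh.update({p: c for p, c in discounted.items()
%%%                         if c >= threshold})
%%%             remaining = Counter({p: c for p, c in discounted.items()
%%%                                  if c < threshold})
%%%         return hhh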

@Article{Somaiya:2008:LCU,
  author =       "Manas Somaiya and Christopher Jermaine and Sanjay
                 Ranka",
  title =        "Learning correlations using the mixture-of-subsets
                 model",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324172.1324175",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Using a mixture of random variables to model data is a
                 tried-and-tested method common in data mining, machine
                 learning, and statistics. By using mixture modeling it
                 is often possible to accurately model even complex,
                 multimodal data via very simple components. However,
                 the classical mixture model assumes that a data point
                 is generated by a single component in the model. A lot
                 of datasets can be modeled closer to the underlying
                 reality if we drop this restriction. We propose a
                 probabilistic framework, the {\em mixture-of-subsets
                 (MOS) model}, by making two fundamental changes to the
                 classical mixture model. First, we allow a data point
                 to be generated by a set of components, rather than
                 just a single component. Next, we limit the number of
                 data attributes that each component can influence. We
                 also propose an EM framework to learn the MOS model
                 from a dataset, and experimentally evaluate it on real,
                 high-dimensional datasets. Our results show that the
                 MOS model learned from the data represents the
                 underlying nature of the data accurately.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "EM algorithm; high-dimensional data; Mixture
                 modeling",
}

@Article{Halkidi:2008:CFB,
  author =       "M. Halkidi and D. Gunopulos and M. Vazirgiannis and N.
                 Kumar and C. Domeniconi",
  title =        "A clustering framework based on subjective and
                 objective validity criteria",
  journal =      j-TKDD,
  volume =       "1",
  number =       "4",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1324172.1324176",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:07 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering, as an unsupervised learning process is a
                 challenging problem, especially in cases of
                 high-dimensional datasets. Clustering result quality
                 can benefit from user constraints and objective
                 validity assessment. In this article, we propose a
                 semisupervised framework for learning the weighted
                 Euclidean subspace, where the best clustering can be
                 achieved. Our approach capitalizes on: (i) user
                 constraints; and (ii) the quality of intermediate
                 clustering results in terms of their structural
                 properties. The proposed framework uses the clustering
                 algorithm and the validity measure as its parameters.
                 We develop and discuss algorithms for learning and
                 tuning the weights of contributing dimensions and
                 defining the ``best'' clustering obtained by satisfying
                 user constraints. Experimental results on benchmark
                 datasets demonstrate the superiority of the proposed
                 approach in terms of improved clustering accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "cluster validity; data mining; Semisupervised
                 learning; similarity measure learning; space learning",
}

@Article{Zaki:2008:ISI,
  author =       "Mohammed J. Zaki and George Karypis and Jiong Yang and
                 Wei Wang",
  title =        "Introduction to special issue on bioinformatics",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1342320.1342321",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Jin:2008:CMM,
  author =       "Ying Jin and T. M. Murali and Naren Ramakrishnan",
  title =        "Compositional mining of multirelational biological
                 datasets",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1342320.1342322",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "High-throughput biological screens are yielding
                 ever-growing streams of information about multiple
                 aspects of cellular activity. As more and more
                 categories of datasets come online, there is a
                 corresponding multitude of ways in which inferences can
                 be chained across them, motivating the need for
                 compositional data mining algorithms. In this article,
                 we argue that such compositional data mining can be
                 effectively realized by functionally cascading
                 redescription mining and biclustering algorithms as
                 primitives. Both these primitives mirror shifts of
                 vocabulary that can be composed in arbitrary ways to
                 create rich chains of inferences. Given a relational
                 database and its schema, we show how the schema can be
                 automatically compiled into a compositional data mining
                 program, and how different domains in the schema can be
                 related through logical sequences of biclustering and
                 redescription invocations. This feature allows us to
                 rapidly prototype new data mining applications,
                 yielding greater understanding of scientific datasets.
                 We describe two applications of compositional data
                 mining: (i) matching terms across categories of the
                 Gene Ontology and (ii) understanding the molecular
                 mechanisms underlying stress response in human cells.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Biclustering; bioinformatics; compositional data
                 mining; inductive logic programming; redescription
                 mining",
}

@Article{Sahay:2008:DSB,
  author =       "Saurav Sahay and Sougata Mukherjea and Eugene
                 Agichtein and Ernest V. Garcia and Shamkant B. Navathe
                 and Ashwin Ram",
  title =        "Discovering semantic biomedical relations utilizing
                 the {Web}",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1342320.1342323",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "To realize the vision of a Semantic Web for Life
                 Sciences, discovering relations between resources is
                 essential. It is very difficult to automatically
                 extract relations from Web pages expressed in natural
                 language formats. On the other hand, because of the
                 explosive growth of information, it is difficult to
                 manually extract the relations. In this paper we
                 present techniques to automatically discover relations
                 between biomedical resources from the Web. For this
                  purpose we retrieve relevant information from Web
                  search engines and the PubMed database using various
                  lexico-syntactic patterns as queries over SOAP Web
                  services. The patterns are initially handcrafted but
                 can be progressively learnt. The extracted relations
                 can be used to construct and augment ontologies and
                 knowledge bases. Experiments are presented for general
                  biomedical relation discovery and domain-specific
                 search to show the usefulness of our technique.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Ontology construction; relation identification",
}

@Article{Ye:2008:DSA,
  author =       "Jieping Ye and Jianhui Chen and Ravi Janardan and
                 Sudhir Kumar",
  title =        "Developmental stage annotation of \bioname{Drosophila}
                 gene expression pattern images via an entire solution
                 path for {LDA}",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1342320.1342324",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Gene expression in a developing embryo occurs in
                 particular cells (spatial patterns) in a time-specific
                 manner (temporal patterns), which leads to the
                 differentiation of cell fates. Images of a
                 \bioname{Drosophila melanogaster} embryo at a given
                 developmental stage, showing a particular gene
                 expression pattern revealed by a gene-specific probe,
                 can be compared for spatial overlaps. The comparison is
                 fundamentally important to formulating and testing gene
                 interaction hypotheses. Expression pattern comparison
                 is most biologically meaningful when images from a
                 similar time point (developmental stage) are compared.
                 In this paper, we present LdaPath, a novel formulation
                 of Linear Discriminant Analysis (LDA) for automatic
                 developmental stage range classification. It employs
                 multivariate linear regression with the {$ L_1 $}-norm
                 penalty controlled by a regularization parameter for
                 feature extraction and visualization. LdaPath computes
                 an entire solution path for all values of
                  the regularization parameter with essentially the same
                 computational cost as fitting one LDA model. Thus, it
                 facilitates efficient model selection. It is based on
                 the equivalence relationship between LDA and the least
                 squares method for multiclass classifications. This
                 equivalence relationship is established under a mild
                 condition, which we show empirically to hold for many
                 high-dimensional datasets, such as expression pattern
                 images. Our experiments on a collection of 2705
                 expression pattern images show the effectiveness of the
                 proposed algorithm. Results also show that the LDA
                 model resulting from LdaPath is sparse, and irrelevant
                 features may be removed. Thus, LdaPath provides a
                 general framework for simultaneous feature selection
                 and feature extraction.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "dimensionality reduction; Gene expression pattern
                 image; linear discriminant analysis; linear
                 regression",
}
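
%%% The LdaPath entry above hinges on an L1-penalized multivariate
%%% regression whose minimizer is traced over all values of the
%%% regularization parameter.  As a point of reference, one standard
%%% form of that problem is (the notation here is an assumption, not
%%% necessarily the paper's exact display):
%%%
%%%     \min_{W} \; \| X W - Y \|_F^2 + \lambda \sum_{i,j} | W_{ij} |
%%%
%%% where X collects the image features, Y encodes stage-range
%%% membership, and the set of minimizers as \lambda varies is the
%%% ``entire solution path'' the abstract refers to.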

@Article{Lu:2008:ADA,
  author =       "Yijuan Lu and Qi Tian and Jennifer Neary and Feng Liu
                 and Yufeng Wang",
  title =        "Adaptive discriminant analysis for microarray-based
                 classification",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1342320.1342325",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Microarray technology has generated enormous amounts
                 of high-dimensional gene expression data, providing a
                 unique platform for exploring gene regulatory networks.
                  However, the curse of dimensionality plagues efforts
                  to analyze these high-throughput data. Linear Discriminant
                 Analysis (LDA) and Biased Discriminant Analysis (BDA)
                 are two popular techniques for dimension reduction,
                  which pay attention to the different roles of the
                  positive and negative samples in finding a
                  discriminating subspace. However, the drawbacks of these two methods
                 are obvious: LDA has limited efficiency in classifying
                 sample data from subclasses with different
                 distributions, and BDA does not account for the
                 underlying distribution of negative samples.\par

                 In this paper, we propose a novel dimension reduction
                 technique for microarray analysis: Adaptive
                 Discriminant Analysis (ADA), which effectively exploits
                 favorable attributes of both BDA and LDA and avoids
                 their unfavorable ones. ADA can find a good
                 discriminative subspace with adaptation to different
                 sample distributions. It not only alleviates the
                 problem of high dimensionality, but also enhances the
                 classification performance in the subspace with
                 na{\"\i}ve Bayes classifier. To learn the best model
                 fitting the real scenario, boosted Adaptive
                 Discriminant Analysis is further proposed. Extensive
                 experiments on the yeast cell cycle regulation data
                 set, and the expression data of the red blood cell
                 cycle in malaria parasite {\em Plasmodium falciparum\/}
                 demonstrate the superior performance of ADA and boosted
                 ADA. We also present some putative genes of specific
                 functional classes predicted by boosted ADA. Their
                 potential functionality is confirmed by independent
                 predictions based on Gene Ontology, demonstrating that
                 ADA and boosted ADA are effective dimension reduction
                 methods for microarray-based classification.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "ADA; BDA; boosted ADA; dimension reduction; LDA;
                 microarray",
}

@Article{Hashimoto:2008:NEP,
  author =       "Kosuke Hashimoto and Kiyoko Flora Aoki-Kinoshita and
                 Nobuhisa Ueda and Minoru Kanehisa and Hiroshi
                 Mamitsuka",
  title =        "A new efficient probabilistic model for mining labeled
                 ordered trees applied to glycobiology",
  journal =      j-TKDD,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1342320.1342326",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:18 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining frequent patterns from large datasets is an
                 important issue in data mining. Recently, complex and
                 unstructured (or semi-structured) datasets have
                 appeared as targets for major data mining applications,
                 including text mining, web mining and bioinformatics.
                 Our work focuses on labeled ordered trees, which are
                 typically semi-structured datasets. In bioinformatics,
                 carbohydrate sugar chains, or glycans, can be modeled
                 as labeled ordered trees. Glycans are the third major
                 class of biomolecules, having important roles in
                 signaling and recognition. For mining labeled ordered
                 trees, we propose a new probabilistic model and its
                 efficient learning scheme which significantly improves
                 the time and space complexity of an existing
                 probabilistic model for labeled ordered trees. We
                 evaluated the performance of the proposed model,
                 comparing it with those of other probabilistic models,
                 using synthetic as well as real datasets from
                 glycobiology. Experimental results showed that the
                 proposed model drastically reduced the computation time
                 of the competing model, keeping the predictive power
                 and avoiding overfitting to the training data. Finally,
                 we assessed our results on real data from a variety of
                 biological viewpoints, verifying known facts in
                 glycobiology.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Expectation-maximization; labeled ordered trees;
                 maximum likelihood; probabilistic models",
}

@Article{Ge:2008:JCA,
  author =       "Rong Ge and Martin Ester and Byron J. Gao and Zengjian
                 Hu and Binay Bhattacharya and Boaz Ben-Moshe",
  title =        "Joint cluster analysis of attribute data and
                 relationship data: {The} connected $k$-center problem,
                 algorithms and applications",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1376815.1376816",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Attribute data and relationship data are two principal
                 types of data, representing the intrinsic and extrinsic
                 properties of entities. While attribute data have been
                 the main source of data for cluster analysis,
                 relationship data such as social networks or metabolic
                 networks are becoming increasingly available. It is
                  also common to observe that both data types carry
                 complementary information such as in market
                 segmentation and community identification, which calls
                 for a joint cluster analysis of both data types so as
                 to achieve better results. In this article, we
                 introduce the novel Connected $k$-Center ({\em CkC\/})
                 problem, a clustering model taking into account
                 attribute data as well as relationship data. We analyze
                 the complexity of the problem and prove its
                 NP-hardness. Therefore, we analyze the approximability
                  of the problem and also present a constant-factor
                 approximation algorithm. For the special case of the
                 {\em CkC\/} problem where the relationship data form a
                 tree structure, we propose a dynamic programming method
                 giving an optimal solution in polynomial time. We
                 further present NetScan, a heuristic algorithm that is
                 efficient and effective for large real databases. Our
                 extensive experimental evaluation on real datasets
                 demonstrates the meaningfulness and accuracy of the
                 NetScan results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "approximation algorithms; Attribute data; community
                 identification; document clustering; joint cluster
                 analysis; market segmentation; NP-hardness;
                 relationship data",
}
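
%%% In the CkC model above, a clustering is feasible only if every
%%% cluster induces a connected subgraph of the relationship data.  A
%%% minimal Python sketch of that side-condition (the adjacency-set
%%% graph encoding is an assumption, and this is not the authors'
%%% code):
%%%
%%%     from collections import deque
%%%
%%%     def cluster_is_connected(adj, cluster):
%%%         """True iff `cluster` induces a connected subgraph of the
%%%         relationship graph `adj` (dict: vertex -> set of neighbors)."""
%%%         cluster = set(cluster)
%%%         if not cluster:
%%%             return True
%%%         seen, queue = set(), deque([next(iter(cluster))])
%%%         while queue:
%%%             v = queue.popleft()
%%%             if v in seen:
%%%                 continue
%%%             seen.add(v)
%%%             queue.extend(adj[v] & cluster)
%%%         return seen == cluster
%%%
%%% The optimization side (minimizing the maximum attribute-space
%%% distance to a cluster center) is what makes the joint problem
%%% NP-hard, per the abstract.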

@Article{Gupta:2008:BBC,
  author =       "Gunjan Gupta and Joydeep Ghosh",
  title =        "{Bregman} bubble clustering: a robust framework for
                 mining dense clusters",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1376815.1376817",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In classical clustering, each data point is assigned
                 to at least one cluster. However, in many applications
                 only a small subset of the available data is relevant
                 for the problem and the rest needs to be ignored in
                 order to obtain good clusters. Certain nonparametric
                 density-based clustering methods find the most relevant
                 data as multiple dense regions, but such methods are
                 generally limited to low-dimensional data and do not
                 scale well to large, high-dimensional datasets. Also,
                 they use a specific notion of ``distance'', typically
                 Euclidean or Mahalanobis distance, which further limits
                 their applicability. On the other hand, the recent One
                 Class Information Bottleneck (OC-IB) method is fast and
                 works on a large class of distortion measures known as
                 Bregman Divergences, but can only find a {\em single\/}
                 dense region. This article presents a broad framework
                 for finding $k$ dense clusters while ignoring the rest
                 of the data. It includes a seeding algorithm that can
                 automatically determine a suitable value for {\em k}.
                 When $k$ is forced to 1, our method gives rise to an
                 improved version of OC-IB with optimality guarantees.
                 We provide a generative model that yields the proposed
                 iterative algorithm for finding $k$ dense regions as a
                 special case. Our analysis reveals an interesting and
                 novel connection between the problem of finding dense
                 regions and exponential mixture models; a hard model
                 corresponding to $k$ exponential mixtures with a
                 uniform background results in a set of $k$ dense
                 clusters. The proposed method describes a highly
                 scalable algorithm for finding multiple dense regions
                 that works with any Bregman Divergence, thus extending
                 density based clustering to a variety of non-Euclidean
                 problems not addressable by earlier methods. We present
                 empirical results on three artificial, two microarray
                 and one text dataset to show the relevance and
                 effectiveness of our methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Bregman divergences; Density-based clustering;
                 expectation maximization; exponential family; One Class
                 classification",
}
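
%%% The framework above is parameterized by a Bregman divergence; for
%%% reference, the standard definition (not restated in the entry) for
%%% a strictly convex, differentiable function \phi is
%%%
%%%     d_\phi(x, y) = \phi(x) - \phi(y)
%%%                    - \langle \nabla \phi(y), \; x - y \rangle ,
%%%
%%% so \phi(x) = \|x\|^2 recovers squared Euclidean distance and
%%% \phi(x) = \sum_i x_i \log x_i yields (generalized) KL divergence.
%%% This is what lets the method extend density-based clustering to
%%% non-Euclidean problems, as the abstract claims.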

@Article{Tan:2008:TMG,
  author =       "Henry Tan and Fedja Hadzic and Tharam S. Dillon and
                 Elizabeth Chang and Ling Feng",
  title =        "Tree model guided candidate generation for mining
                 frequent subtrees from {XML} documents",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1376815.1376818",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Due to the inherent flexibilities in both structure
                  and semantics, XML association rule mining faces
                  several challenges, such as a more complicated
                  hierarchical data structure and an ordered data context. Mining
                 frequent patterns from XML documents can be recast as
                 mining frequent tree structures from a database of XML
                 documents. In this study, we model a database of XML
                 documents as a database of rooted labeled ordered
                 subtrees. In particular, we are mainly concerned with
                 mining frequent induced and embedded ordered subtrees.
                 Our main contributions are as follows. We describe our
                 unique {\em embedding list\/} representation of the
                 tree structure, which enables efficient implementation
                 of our {\em Tree Model Guided\/} ({\em TMG\/})
                 candidate generation. {\em TMG\/} is an optimal,
                 nonredundant enumeration strategy that enumerates all
                 the valid candidates that conform to the structural
                 aspects of the data. We show through a mathematical
                 model and experiments that {\em TMG\/} has better
                 complexity compared to the commonly used join approach.
                 In this article, we propose two algorithms, MB3-Miner
                 and iMB3-Miner. MB3-Miner mines embedded subtrees.
                 iMB3-Miner mines induced and/or embedded subtrees by
                 using the {\em maximum level of embedding constraint}.
                 Our experiments with both synthetic and real datasets
                 against two well-known algorithms for mining induced
                  and embedded subtrees demonstrate the effectiveness
                 and the efficiency of the proposed techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "FREQT; TMG; Tree mining; tree model guided;
                 TreeMiner",
}

@Article{Islam:2008:STS,
  author =       "Aminul Islam and Diana Inkpen",
  title =        "Semantic text similarity using corpus-based word
                 similarity and string similarity",
  journal =      j-TKDD,
  volume =       "2",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1376815.1376819",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:30 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present a method for measuring the semantic
                 similarity of texts using a corpus-based measure of
                 semantic word similarity and a normalized and modified
                 version of the Longest Common Subsequence (LCS) string
                 matching algorithm. Existing methods for computing text
                 similarity have focused mainly on either large
                 documents or individual words. We focus on computing
                 the similarity between two sentences or two short
                 paragraphs. The proposed method can be exploited in a
                 variety of applications involving textual knowledge
                 representation and knowledge discovery. Evaluation
                 results on two different data sets show that our method
                 outperforms several competing methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "corpus-based measures; Semantic similarity of words;
                 similarity of short texts",
}
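
%%% As a rough illustration of the string-similarity ingredient above:
%%% the classical LCS dynamic program, plus one conventional
%%% normalization.  The paper's modified, normalized LCS variants
%%% differ in detail, so the normalization below is an assumption for
%%% illustration only.
%%%
%%%     def lcs_length(a, b):
%%%         """Length of the longest common subsequence of strings a, b."""
%%%         m, n = len(a), len(b)
%%%         dp = [[0] * (n + 1) for _ in range(m + 1)]
%%%         for i in range(1, m + 1):
%%%             for j in range(1, n + 1):
%%%                 if a[i - 1] == b[j - 1]:
%%%                     dp[i][j] = dp[i - 1][j - 1] + 1
%%%                 else:
%%%                     dp[i][j] = max(dp[i - 1][j], dp[i][j - 1])
%%%         return dp[m][n]
%%%
%%%     def normalized_lcs(a, b):
%%%         # One common normalization into [0, 1]; identical strings
%%%         # score 1.0.  Not the paper's exact formula.
%%%         return lcs_length(a, b) ** 2 / (len(a) * len(b)) if a and b else 0.0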

@Article{Sun:2008:ITA,
  author =       "Jimeng Sun and Dacheng Tao and Spiros Papadimitriou
                 and Philip S. Yu and Christos Faloutsos",
  title =        "Incremental tensor analysis: {Theory} and
                 applications",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1409620.1409621",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do we find patterns in author-keyword
                 associations, evolving over time? Or in data cubes
                  (tensors), with product-branch-customer sales
                 information? And more generally, how to summarize
                 high-order data cubes (tensors)? How to incrementally
                 update these patterns over time? Matrix decompositions,
                 like principal component analysis (PCA) and variants,
                 are invaluable tools for mining, dimensionality
                 reduction, feature selection, rule identification in
                 numerous settings like streaming data, text, graphs,
                  social networks, and many more. However, they handle
                  only two orders (i.e., matrices, like author and
                 keyword in the previous example).\par

                 We propose to envision such higher-order data as
                 tensors, and tap the vast literature on the topic.
                 However, these methods do not necessarily scale up, let
                 alone operate on semi-infinite streams. Thus, we
                 introduce a general framework, incremental tensor
                 analysis (ITA), which efficiently computes a compact
                 summary for high-order and high-dimensional data, and
                 also reveals the hidden correlations. Three variants of
                 ITA are presented: (1) dynamic tensor analysis (DTA);
                 (2) streaming tensor analysis (STA); and (3)
                 window-based tensor analysis (WTA). In particular, we
                 explore several fundamental design trade-offs such as
                 space efficiency, computational cost, approximation
                 accuracy, time dependency, and model complexity.\par

                 We implement all our methods and apply them in several
                 real settings, such as network anomaly detection,
                 multiway latent semantic indexing on citation networks,
                 and correlation study on sensor measurements. Our
                 empirical studies show that the proposed methods are
                 fast and accurate and that they find interesting
                 patterns and outliers on the real datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "multilinear algebra; stream mining; Tensor",
}

@Article{Mangasarian:2008:PPC,
  author =       "Olvi L. Mangasarian and Edward W. Wild and Glenn M.
                 Fung",
  title =        "Privacy-preserving classification of vertically
                 partitioned data via random kernels",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1409620.1409622",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We propose a novel privacy-preserving support vector
                 machine (SVM) classifier for a data matrix $A$ whose
                 input feature columns are divided into groups belonging
                 to different entities. Each entity is unwilling to
                 share its group of columns or make it public. Our
                 classifier is based on the concept of a reduced kernel
                  $ k(A, B')$, where $ B' $ is the transpose
                 of a random matrix $B$. The column blocks of $B$
                 corresponding to the different entities are privately
                 generated by each entity and never made public. The
                 proposed linear or nonlinear SVM classifier, which is
                 public but does not reveal any of the privately held
                 data, has accuracy comparable to that of an ordinary
                 SVM classifier that uses the entire set of input
                 features directly.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Privacy preserving classification; support vector
                 machines; vertically partitioned data",
}
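
%%% The privacy argument above rests on a simple block identity: with
%%% the feature columns of A and of the random matrix B partitioned
%%% conformally across entities, A B' = sum_j A_j B_j', so each entity
%%% can publish only its own product with its private random block.  A
%%% minimal numerical sketch (sizes, the linear kernel, and variable
%%% names are assumptions, not the authors' code):
%%%
%%%     import numpy as np
%%%
%%%     rng = np.random.default_rng(0)
%%%     n, k = 6, 3                      # samples, reduced dimension
%%%     blocks = [2, 4]                  # columns held by each entity
%%%
%%%     A_parts = [rng.normal(size=(n, d)) for d in blocks]  # private data
%%%     B_parts = [rng.normal(size=(k, d)) for d in blocks]  # private random blocks
%%%
%%%     # Each entity reveals only A_j B_j'; the sum is the reduced
%%%     # linear-kernel matrix used to train the public classifier.
%%%     K = sum(Aj @ Bj.T for Aj, Bj in zip(A_parts, B_parts))
%%%
%%%     A, B = np.hstack(A_parts), np.hstack(B_parts)
%%%     assert np.allclose(K, A @ B.T)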

@Article{Lakshmanan:2008:DRA,
  author =       "Laks V. S. Lakshmanan and Raymond T. Ng and Ganesh
                 Ramesh",
  title =        "On disclosure risk analysis of anonymized itemsets in
                 the presence of prior knowledge",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1409620.1409623",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Decision makers of companies often face the dilemma of
                 whether to release data for knowledge discovery,
                 vis-a-vis the risk of disclosing proprietary or
                 sensitive information. Among the various methods
                 employed for ``sanitizing'' the data prior to
                 disclosure, we focus in this article on anonymization,
                 given its widespread use in practice. We do due
                 diligence to the question ``just how safe is the
                 anonymized data?'' We consider both those scenarios
                 when the hacker has no information and, more
                 realistically, when the hacker may have partial
                 information about items in the domain. We conduct our
                 analyses in the context of frequent set mining and
                 address the safety question at two different levels:
                  (i) how likely are the identities of individual items
                  to be cracked (i.e., re-identified by a hacker), and
                  (ii) how likely are sets of items to be cracked? For
                 capturing the prior knowledge of the hacker, we propose
                 a {\em belief function}, which amounts to an educated
                 guess of the frequency of each item. For various
                 classes of belief functions which correspond to
                 different degrees of prior knowledge, we derive
                 formulas for computing the expected number of cracks of
                 single items and for itemsets, the probability of
                  cracking the itemsets. While obtaining exact values
                 for more general situations is computationally hard, we
                 propose a series of heuristics called the {\em
                  O-estimates}. They are easy to compute and are shown to
                  be fairly accurate, as justified by empirical results
                  on real benchmark datasets. Based on the O-estimates, we
                 propose a recipe for the decision makers to resolve
                 their dilemma. Our recipe operates at two different
                 levels, depending on whether the data owner wants to
                 reason in terms of single items or sets of items (or
                 both). Finally, we present techniques for ascertaining
                  a hacker's likely knowledge of correlation in terms of
                  the co-occurrence of items. This information
                 regarding the hacker's knowledge can be incorporated
                 into our framework of disclosure risk analysis and we
                 present experimental results demonstrating how this
                 knowledge affects the heuristic estimates we have
                 developed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "anonymization; belief function; bipartite graphs;
                 correlation; Disclosure risk; frequent itemsets;
                 hacker; matching; prior knowledge; sampling",
}

@Article{Vaidya:2008:PPD,
  author =       "Jaideep Vaidya and Chris Clifton and Murat
                 Kantarcioglu and A. Scott Patterson",
  title =        "Privacy-preserving decision trees over vertically
                 partitioned data",
  journal =      j-TKDD,
  volume =       "2",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1409620.1409624",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:41 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Privacy and security concerns can prevent sharing of
                 data, derailing data-mining projects. Distributed
                 knowledge discovery, if done correctly, can alleviate
                 this problem. We introduce a generalized
                 privacy-preserving variant of the ID3 algorithm for
                 vertically partitioned data distributed over two or
                 more parties. Along with a proof of security, we
                 discuss what would be necessary to make the protocols
                 completely secure. We also provide experimental
                 results, giving a first demonstration of the practical
                 complexity of secure multiparty computation-based data
                 mining.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Decision tree classification; privacy",
}
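
%%% For reference, the quantity at the heart of ID3 --- which the
%%% protocols above must evaluate without any party revealing its
%%% columns --- is the standard information gain (textbook formula,
%%% not restated in the entry):
%%%
%%%     Gain(S, A) = H(S) - \sum_{v \in Values(A)} \frac{|S_v|}{|S|} H(S_v),
%%%     \qquad H(S) = - \sum_{c} p_c \log_2 p_c ,
%%%
%%% where S_v is the subset of S taking value v on attribute A.  The
%%% difficulty in the vertically partitioned setting is that the
%%% counts behind p_c and |S_v| depend on columns held by different
%%% parties.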

@Article{Chuang:2009:FPS,
  author =       "Kun-Ta Chuang and Hung-Leng Chen and Ming-Syan Chen",
  title =        "Feature-preserved sampling over streaming data",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "15:1--15:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1460797.1460798",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we explore a novel sampling model,
                 called {\em feature preserved sampling\/} ({\em FPS\/})
                 that sequentially generates a high-quality sample over
                 sliding windows. The sampling quality we consider
                 refers to the degree of consistency between the sample
                 proportion and the population proportion of each
                 attribute value in a window. Due to the time-variant
                 nature of real-world datasets, users are more likely to
                 be interested in the most recent data. However,
                 previous works have not been able to generate a
                 high-quality sample over sliding windows that precisely
                 preserves up-to-date population characteristics.
                 Motivated by this shortcoming, we have developed the
                 {\em FPS\/} algorithm, which has several advantages:
                 (1) it sequentially generates a sample from a
                 time-variant data source over sliding windows; (2) the
                 execution time of {\em FPS\/} is linear with respect to
                 the database size; (3) the {\em relative\/}
                 proportional differences between the sample proportions
                 and population proportions of most distinct attribute
                 values are guaranteed to be below a specified error
                 threshold, $ \epsilon $, while the {\em relative\/}
                 proportion differences of the remaining attribute
                 values are as close to $ \epsilon $ as possible, which
                 ensures that the generated sample is of high quality;
                  (4) the sample rate is close to the user-specified rate
                  so that a high-quality sampling result can be obtained
                 without increasing the sample size; (5) by a thorough
                 analytical and empirical study, we prove that {\em
                 FPS\/} has acceptable space overheads, especially when
                 the attribute values have Zipfian distributions, and
                  {\em FPS\/} can also preserve the population
                  proportion of multivariate features in the sample very
                  well; and (6) {\em FPS\/} can be applied to infinite
                 streams and finite datasets equally, and the generated
                 samples can be used for various applications. Our
                 experiments on both real and synthetic data validate
                  that {\em FPS\/} can effectively obtain a high-quality
                  sample of the desired size. In addition, while using
                 the sample generated by {\em FPS\/} in various mining
                 applications, a significant improvement in efficiency
                 can be achieved without compromising the model's
                 precision.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "sampling; Streaming mining",
}
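
%%% One natural reading of guarantee (3) above, writing \hat{p}_v for
%%% the in-sample proportion of attribute value v and p_v for its
%%% population proportion in the current window (this notation is an
%%% assumption, not the paper's):
%%%
%%%     | \hat{p}_v - p_v | / p_v \le \epsilon
%%%
%%% for most distinct values v, with the remaining values' relative
%%% differences kept as close to \epsilon as possible.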

@Article{Jiang:2009:MFC,
  author =       "Daxin Jiang and Jian Pei",
  title =        "Mining frequent cross-graph quasi-cliques",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "16:1--16:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1460797.1460799",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Joint mining of multiple datasets can often discover
                 interesting, novel, and reliable patterns which cannot
                 be obtained solely from any single source. For example,
                 in bioinformatics, jointly mining multiple gene
                 expression datasets obtained by different labs or
                 during various biological processes may overcome the
                 heavy noise in the data. Moreover, by joint mining of
                 gene expression data and protein-protein interaction
                 data, we may discover clusters of genes which show
                 coherent expression patterns and also produce
                 interacting proteins. Such clusters may be potential
                 pathways.\par

                 In this article, we investigate a novel data mining
                 problem, {\em mining frequent cross-graph
                 quasi-cliques}, which is generalized from several
                 interesting applications in bioinformatics,
                 cross-market customer segmentation, social network
                 analysis, and Web mining. In a graph, a set of vertices
                 $S$ is a $ \gamma $-quasi-clique $ (0 < \gamma \leq 1)$
                 if each vertex $v$ in $S$ directly connects to at least
                 $ \gamma \cdot (|S| - 1)$ other vertices in $S$. Given
                  a set of graphs $ G_1, \ldots {}, G_n$ and a parameter
                  $ {\rm min \_ sup} $ $ (0 < {\rm min \_ sup} \leq 1) $, a set of
                  vertices $S$ is a frequent cross-graph quasi-clique if
                 $S$ is a $ \gamma $-quasi-clique in at least $ {\rm min
                 \_ sup} \cdot n$ graphs, and there does not exist a
                 proper superset of $S$ having the property.\par

                 We build a general model, show why the complete set of
                 frequent cross-graph quasi-cliques cannot be found by
                 previous data mining methods, and study the complexity
                 of the problem. While the problem is difficult, we
                 develop practical algorithms which exploit several
                 interesting and effective techniques and heuristics to
                 efficaciously mine frequent cross-graph quasi-cliques.
                 A systematic performance study is reported on both
                 synthetic and real data sets. We demonstrate some
                 interesting and meaningful frequent cross-graph
                 quasi-cliques in bioinformatics. The experimental
                 results also show that our algorithms are efficient and
                 scalable.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "bioinformatics; clique; Graph mining; joint mining",
}
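
%%% The \gamma-quasi-clique condition above translates directly into
%%% code; a minimal sketch (adjacency-set encoding assumed, not the
%%% authors' implementation):
%%%
%%%     def is_gamma_quasi_clique(adj, S, gamma):
%%%         """adj: dict vertex -> set of neighbors; S: candidate set."""
%%%         S = set(S)
%%%         need = gamma * (len(S) - 1)
%%%         return all(len(adj[v] & (S - {v})) >= need for v in S)
%%%
%%% A set S is then a frequent cross-graph quasi-clique when this test
%%% succeeds in at least min_sup * n of the graphs G_1, ..., G_n and
%%% no proper superset of S has the same property.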

@Article{Domeniconi:2009:WCE,
  author =       "Carlotta Domeniconi and Muna Al-Razgan",
  title =        "Weighted cluster ensembles: {Methods} and analysis",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "17:1--17:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1460797.1460800",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Cluster ensembles offer a solution to challenges
                 inherent to clustering arising from its ill-posed
                 nature. Cluster ensembles can provide robust and stable
                 solutions by leveraging the consensus across multiple
                 clustering results, while averaging out emergent
                 spurious structures that arise due to the various
                 biases to which each participating algorithm is tuned.
                 In this article, we address the problem of combining
                 multiple {\em weighted clusters\/} that belong to
                 different subspaces of the input space. We leverage the
                 diversity of the input clusterings in order to generate
                 a consensus partition that is superior to the
                 participating ones. Since we are dealing with weighted
                 clusters, our consensus functions make use of the
                 weight vectors associated with the clusters. We
                 demonstrate the effectiveness of our techniques by
                 running experiments with several real datasets,
                 including high-dimensional text data. Furthermore, we
                 investigate in depth the issue of diversity and
                 accuracy for our ensemble methods. Our analysis and
                 experimental results show that the proposed techniques
                 are capable of producing a partition that is as good as
                 or better than the best individual clustering.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "accuracy and diversity measures; Cluster ensembles;
                 consensus functions; data mining; subspace clustering;
                 text data",
}

@Article{Zhang:2009:DGA,
  author =       "Zhenjie Zhang and Laks V. S. Lakshmanan and Anthony K.
                 H. Tung",
  title =        "On domination game analysis for microeconomic data
                 mining",
  journal =      j-TKDD,
  volume =       "2",
  number =       "4",
  pages =        "18:1--18:??",
  month =        jan,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1460797.1460801",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 17:59:51 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Game theory is a powerful tool for analyzing the
                  competition among manufacturers in a market. In this
                 article, we present a study on combining game theory
                 and data mining by introducing the concept of
                 domination game analysis. We present a multidimensional
                 market model, where every dimension represents one
                 attribute of a commodity. Every product or customer is
                 represented by a point in the multidimensional space,
                 and a product is said to ``dominate'' a customer if all
                 of its attributes can satisfy the requirements of the
                 customer. The expected market share of a product is
                  measured by the expected number of buyers among the
                  customers, each of whom is equally likely to buy any
                  product dominating him. A Nash equilibrium is a
                 configuration of the products achieving stable expected
                  market shares for all products. We prove that a Nash
                  equilibrium in such a model can be computed in
                  polynomial time if every manufacturer tries to modify
                  its product in a round-robin manner. To further improve
                 the efficiency of the computation, we also design two
                 algorithms for the manufacturers to efficiently find
                 their best response to other products in the market.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "data mining; Domination game; game theory",
}
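
%%% The market model above is easy to state operationally; a minimal
%%% sketch of domination and expected market share (attribute-vector
%%% encoding and the >= convention per attribute are assumptions):
%%%
%%%     def dominates(product, customer):
%%%         """A product dominates a customer when every attribute
%%%         meets that customer's requirement."""
%%%         return all(p >= c for p, c in zip(product, customer))
%%%
%%%     def expected_shares(products, customers):
%%%         # Each customer buys uniformly at random among the
%%%         # products dominating him, as the abstract describes.
%%%         shares = [0.0] * len(products)
%%%         for cust in customers:
%%%             doms = [i for i, p in enumerate(products)
%%%                     if dominates(p, cust)]
%%%             for i in doms:
%%%                 shares[i] += 1.0 / len(doms)
%%%         return shares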

@Article{Kriegel:2009:CHD,
  author =       "Hans-Peter Kriegel and Peer Kr{\"o}ger and Arthur
                 Zimek",
  title =        "Clustering high-dimensional data: a survey on subspace
                 clustering, pattern-based clustering, and correlation
                 clustering",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1497577.1497578",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "As a prolific research area in data mining, subspace
                 clustering and related problems induced a vast quantity
                 of proposed solutions. However, many publications
                 compare a new proposition --- if at all --- with one or
                 two competitors, or even with a so-called
                 ``na{\"\i}ve'' ad hoc solution, but fail to clarify the
                 exact problem definition. As a consequence, even if two
                 solutions are thoroughly compared experimentally, it
                 will often remain unclear whether both solutions tackle
                 the same problem or, if they do, whether they agree in
                 certain tacit assumptions and how such assumptions may
                 influence the outcome of an algorithm. In this survey,
                 we try to clarify: (i) the different problem
                 definitions related to subspace clustering in general;
                 (ii) the specific difficulties encountered in this
                 field of research; (iii) the varying assumptions,
                 heuristics, and intuitions forming the basis of
                 different approaches; and (iv) how several prominent
                 solutions tackle different problems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "clustering; high-dimensional data; Survey",
}

@Article{Dhurandhar:2009:SAM,
  author =       "Amit Dhurandhar and Alin Dobra",
  title =        "Semi-analytical method for analyzing models and model
                 selection measures based on moment analysis",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1497577.1497579",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article we propose a moment-based method for
                 studying models and model selection measures. By
                 focusing on the probabilistic space of classifiers
                 induced by the classification algorithm rather than on
                 that of datasets, we obtain efficient characterizations
                 for computing the moments, which is followed by
                 visualization of the resulting formulae that are too
                 complicated for direct interpretation. By assuming the
                 data to be drawn independently and identically
                 distributed from the underlying probability
                 distribution, and by going over the space of all
                 possible datasets, we establish general relationships
                 between the generalization error, hold-out-set error,
                 cross-validation error, and leave-one-out error. We
                 later exemplify the method and the results by studying
                 the behavior of the errors for the naive Bayes
                 classifier.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "classification; generalization error; Model
                 selection",
}

@Article{Cerf:2009:CPM,
  author =       "Lo{\"\i}c Cerf and J{\'e}r{\'e}my Besson and
                 C{\'e}line Robardet and Jean-Fran{\c{c}}ois Boulicaut",
  title =        "Closed patterns meet $n$-ary relations",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1497577.1497580",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Set pattern discovery from binary relations has been
                 extensively studied during the last decade. In
                 particular, many complete and efficient algorithms for
                 frequent closed set mining are now available.
                 Generalizing such a task to $n$-ary relations ($ n \geq
                 2$) appears as a timely challenge. It may be important
                 for many applications, for example, when adding the
                 time dimension to the popular {\em objects\/} $ \times
                 $ {\em features\/} binary case. The generality of the
                 task (no assumption being made on the relation arity or
                 on the size of its attribute domains) makes it
                 computationally challenging. We introduce an algorithm
                 called Data-Peeler. From an $n$-ary relation, it
                 extracts all closed $n$-sets satisfying given piecewise
                  (anti)monotonic constraints. This new class of
                 constraints generalizes both monotonic and
                 antimonotonic constraints. Considering the special case
                 of ternary relations, Data-Peeler outperforms the
                 state-of-the-art algorithms CubeMiner and Trias by
                  orders of magnitude. This performance gain is due to a
                  new, clever enumeration strategy that efficiently
                  enforces the closedness property. The relevance of the
                  extracted closed $n$-sets is assessed on real-life 3-
                  and 4-ary relations. Beyond natural 3- or 4-ary
                  relations, expanding a relation with an
                 additional attribute can help in enforcing rather
                 abstract constraints such as the robustness with
                 respect to binarization. Furthermore, a collection of
                 closed $n$-sets is shown to be an excellent starting
                 point to compute a tiling of the dataset.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "$n$-ary relations; Closed patterns; constraint
                 properties; constraint-based mining; tiling",
}

@Article{Angiulli:2009:DEA,
  author =       "Fabrizio Angiulli and Fabio Fassetti",
  title =        "{DOLPHIN}: an efficient algorithm for mining
                 distance-based outliers in very large datasets",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1497577.1497581",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this work a novel distance-based outlier detection
                 algorithm, named DOLPHIN, working on disk-resident
                 datasets and whose I/O cost corresponds to the cost of
                 sequentially reading the input dataset file twice, is
                 presented.\par

                 It is both theoretically and empirically shown that the
                 main memory usage of DOLPHIN amounts to a small
                 fraction of the dataset and that DOLPHIN has linear
                 time performance with respect to the dataset size.
                  DOLPHIN gains efficiency by naturally merging three
                  strategies into a unified schema, namely the
                  selection policy for objects to be maintained in main
                  memory, the usage of pruning rules, and similarity
                  search techniques. Importantly, the algorithm
                  accomplishes similarity search without needing to
                  index the whole dataset beforehand, as other methods
                  do.\par

                  The algorithm is simple to implement and can be used
                  with any type of data belonging to either metric or
                 nonmetric spaces. Moreover, a modification to the basic
                 method allows DOLPHIN to deal with the scenario in
                 which the available buffer of main memory is smaller
                 than its standard requirements. DOLPHIN has been
                 compared with state-of-the-art distance-based outlier
                 detection algorithms, showing that it is much more
                 efficient.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Data mining; distance-based outliers; outlier
                 detection",
}
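
%%% The distance-based outlier notion that DOLPHIN computes can be
%%% pinned down in a few lines. This Python sketch is only the
%%% quadratic reference definition (a point is an outlier if fewer
%%% than k other points lie within distance R); DOLPHIN itself
%%% reaches the same answer with two sequential scans, pruning, and
%%% similarity search.
%%%
%%%     def distance_based_outliers(points, R, k, dist):
%%%         # Naive O(n^2) check of the DB(k, R) outlier definition.
%%%         outliers = []
%%%         for i, p in enumerate(points):
%%%             neighbors = sum(1 for j, q in enumerate(points)
%%%                             if i != j and dist(p, q) <= R)
%%%             if neighbors < k:
%%%                 outliers.append(p)
%%%         return outliers
%%%
%%%     euclid = lambda a, b: ((a[0]-b[0])**2 + (a[1]-b[1])**2) ** 0.5
%%%     pts = [(0.0, 0.0), (0.1, 0.0), (0.0, 0.1), (5.0, 5.0)]
%%%     print(distance_based_outliers(pts, R=1.0, k=2, dist=euclid))
%%%     # [(5.0, 5.0)] -- the isolated point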

@Article{Chen:2009:BAS,
  author =       "Bee-Chung Chen and Raghu Ramakrishnan and Jude W.
                 Shavlik and Pradeep Tamma",
  title =        "Bellwether analysis: {Searching} for cost-effective
                 query-defined predictors in large databases",
  journal =      j-TKDD,
  volume =       "3",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1497577.1497582",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:01 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How to mine massive datasets is a challenging problem
                 with great potential value. Motivated by this
                 challenge, much effort has concentrated on developing
                 scalable versions of machine learning algorithms.
                 However, the cost of mining large datasets is not just
                 computational; preparing the datasets into the ``right
                 form'' so that learning algorithms can be applied is
                 usually costly, due to the human labor that is
                 typically required and a large number of choices in
                 data preparation, which include selecting different
                 subsets of data and aggregating data at different
                 granularities. We make the key observation that, for a
                 number of practically motivated problems, these choices
                 can be defined using database queries and analyzed in
                 an automatic and systematic manner. Specifically, we
                  propose a new class of data-mining problems, called {\em
                 bellwether analysis}, in which the goal is to find a
                  few query-defined predictors (e.g., first-week sales
                  of an item in Peoria, IL) that can be used to
                  accurately predict the result of a target query
                  (e.g., first-year
                 worldwide sales of the item) from a large number of
                 queries that define candidate predictors. To make a
                 prediction for a new item, the data needed to generate
                 such predictors has to be collected (e.g., selling the
                 new item in Peoria, IL for a week and collecting the
                 sales data). A useful predictor is one that has high
                 prediction accuracy and a low data-collection cost. We
                 call such a cost-effective predictor a {\em
                 bellwether}.\par

                 This article introduces bellwether analysis, which
                 integrates database query processing and predictive
                 modeling into a single framework, and provides scalable
                 algorithms for large datasets that cannot fit in main
                 memory. Through a series of extensive experiments, we
                 show that bellwethers do exist in real-world databases,
                 and that our computation techniques achieve good
                 efficiency on large datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "bellwether; Cost-effective prediction; data cube; OLAP
                 queries; predictive models; scalable algorithms",
}
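
%%% A hedged sketch of the search loop the abstract describes: each
%%% candidate predictor is a vector of query results over historical
%%% items, and a candidate is kept when a simple fit predicts the
%%% target query accurately at acceptable data-collection cost. The
%%% linear model, the R^2 threshold, and the cost table are
%%% illustrative stand-ins for the article's components.
%%%
%%%     import numpy as np
%%%
%%%     def find_bellwethers(candidates, target, costs,
%%%                          min_r2=0.9, budget=10.0):
%%%         found = []
%%%         for name, x in candidates.items():
%%%             # Least-squares fit of target ~ a*x + b.
%%%             A = np.vstack([x, np.ones_like(x)]).T
%%%             coef, *_ = np.linalg.lstsq(A, target, rcond=None)
%%%             pred = A.dot(coef)
%%%             ss_res = ((target - pred) ** 2).sum()
%%%             ss_tot = ((target - target.mean()) ** 2).sum()
%%%             r2 = 1.0 - ss_res / ss_tot
%%%             if r2 >= min_r2 and costs[name] <= budget:
%%%                 found.append((costs[name], name))
%%%         # Cheapest sufficiently accurate predictors first.
%%%         return [name for _, name in sorted(found)]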

@Article{Liu:2009:ISI,
  author =       "Huan Liu and John Salerno and Michael Young and Rakesh
                 Agrawal and Philip S. Yu",
  title =        "Introduction to special issue on social computing,
                 behavioral modeling, and prediction",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "6:1--6:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1514888.1514889",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Mehler:2009:ENC,
  author =       "Andrew Mehler and Steven Skiena",
  title =        "Expanding network communities from representative
                 examples",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "7:1--7:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1514888.1514890",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present an approach to leverage a small subset of a
                 coherent community within a social network into a much
                  larger, more representative sample. Our problem
                  becomes identifying a low-conductance subgraph
                  containing
                 many (but not necessarily all) members of the given
                 seed set. Starting with an initial seed set
                 representing a sample of a community, we seek to
                 discover as much of the full community as
                 possible.\par

                 We present a general method for network community
                 expansion, demonstrating that our methods work well in
                 expanding communities in real world networks starting
                 from small given seed groups (20 to 400 members). Our
                 approach is marked by incremental expansion from the
                 seeds with retrospective analysis to determine the
                 ultimate boundaries of our community. We demonstrate
                 how to increase the robustness of the general approach
                 through bootstrapping multiple random partitions of the
                 input set into seed and evaluation groups.\par

                 We go beyond statistical comparisons against gold
                 standards to careful subjective evaluations of our
                 expanded communities. This process explains the causes
                 of most disagreement between our expanded communities
                  and our gold standards --- arguing that our expansion
                 methods provide more reliable communities than can be
                 extracted from reference sources/gazetteers such as
                 Wikipedia.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "artificial intelligence; community discovery; Discrete
                 mathematics; graph theory; news analysis; social
                 networks",
}
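
%%% The expansion step lends itself to a short sketch: grow the seed
%%% set greedily, always adding the frontier node that most lowers
%%% the conductance of the cut. This mirrors the abstract's
%%% low-conductance objective, not the article's retrospective
%%% boundary analysis or bootstrapping; the stopping rule is a guess.
%%%
%%%     def conductance(adj, S):
%%%         # adj: {node: set(neighbors)} for an undirected graph.
%%%         cut = sum(1 for u in S for v in adj[u] if v not in S)
%%%         vol_S = sum(len(adj[u]) for u in S)
%%%         vol_rest = sum(len(adj[u]) for u in adj) - vol_S
%%%         denom = min(vol_S, vol_rest)
%%%         return cut / denom if denom else 1.0
%%%
%%%     def expand_community(adj, seeds, max_size=400):
%%%         S = set(seeds)
%%%         while len(S) < max_size:
%%%             frontier = {v for u in S for v in adj[u]} - S
%%%             if not frontier:
%%%                 break
%%%             best = min(frontier,
%%%                        key=lambda v: conductance(adj, S | {v}))
%%%             if conductance(adj, S | {best}) >= conductance(adj, S):
%%%                 break  # no single addition improves the cut
%%%             S.add(best)
%%%         return S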

@Article{Lin:2009:ACT,
  author =       "Yu-Ru Lin and Yun Chi and Shenghuo Zhu and Hari
                 Sundaram and Belle L. Tseng",
  title =        "Analyzing communities and their evolutions in dynamic
                 social networks",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "8:1--8:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1514888.1514891",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We discover communities from social network data and
                 analyze the community evolution. These communities are
                 inherent characteristics of human interaction in online
                 social networks, as well as paper citation networks.
                 Also, communities may evolve over time, due to changes
                 to individuals' roles and social status in the network
                 as well as changes to individuals' research interests.
                 We present an innovative algorithm that deviates from
                 the traditional two-step approach to analyze community
                 evolutions. In the traditional approach, communities
                 are first detected for each time slice, and then
                 compared to determine correspondences. We argue that
                 this approach is inappropriate in applications with
                 noisy data. In this paper, we propose {\em FacetNet\/}
                 for analyzing communities and their evolutions through
                  a robust {\em unified\/} process. This novel framework
                  discovers communities and captures their evolution
                  with temporal smoothness given by historic community
                 structures. Our approach relies on formulating the
                 problem in terms of maximum a posteriori (MAP)
                 estimation, where the community structure is estimated
                 both by the observed networked data and by the prior
                 distribution given by historic community structures.
                 Then we develop an iterative algorithm, with proven low
                 time complexity, which is guaranteed to converge to an
                 optimal solution. We perform extensive experimental
                 studies, on both synthetic datasets and real datasets,
                 to demonstrate that our method discovers meaningful
                 communities and provides additional insights not
                 directly obtainable from traditional methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Community; community net; evolution; evolution net;
                 nonnegative matrix factorization; soft membership",
}
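
%%% The dual objective described above (fit the current snapshot,
%%% stay close to history) reduces to a two-term cost. This sketch
%%% substitutes squared error for FacetNet's KL-divergence terms and
%%% exposes only the trade-off parameter; it is a caricature of the
%%% cost, not the article's algorithm.
%%%
%%%     import numpy as np
%%%
%%%     def smoothed_cost(W, H, H_prev, alpha):
%%%         # Snapshot cost: how well communities H explain the
%%%         # current adjacency W.
%%%         snapshot = np.linalg.norm(W - H.dot(H.T)) ** 2
%%%         # Temporal cost: deviation from the historic structure.
%%%         temporal = np.linalg.norm(H - H_prev) ** 2
%%%         return alpha * snapshot + (1 - alpha) * temporal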

@Article{Kimura:2009:BLM,
  author =       "Masahiro Kimura and Kazumi Saito and Hiroshi Motoda",
  title =        "Blocking links to minimize contamination spread in a
                 social network",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "9:1--9:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1514888.1514892",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We address the problem of minimizing the propagation
                 of undesirable things, such as computer viruses or
                 malicious rumors, by blocking a limited number of links
                  in a network, which is the converse of the influence
                  maximization problem, in which the most influential
                  nodes for information diffusion are sought in a
                  social network. This minimization problem is more
                  fundamental
                 than the problem of preventing the spread of
                 contamination by removing nodes in a network. We
                 introduce two definitions for the contamination degree
                 of a network, accordingly define two contamination
                 minimization problems, and propose methods for
                 efficiently finding good approximate solutions to these
                  problems on the basis of a natural greedy strategy.
                 Using large social networks, we experimentally
                 demonstrate that the proposed methods outperform
                 conventional link-removal methods. We also show that
                 unlike the case of blocking a limited number of nodes,
                 the strategy of removing nodes with high out-degrees is
                 not necessarily effective for these problems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Contamination diffusion; link analysis; social
                 networks",
}
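
%%% The greedy strategy from the abstract is easy to state: among
%%% the remaining links, repeatedly remove the one whose removal
%%% most reduces the contamination degree. The degree used here
%%% (average reachable-set size over a directed graph) is one of
%%% several possible definitions, chosen for brevity.
%%%
%%%     def avg_reachable(adj):
%%%         # Average number of nodes reachable from each node.
%%%         def reach(s):
%%%             seen, stack = {s}, [s]
%%%             while stack:
%%%                 for v in adj[stack.pop()]:
%%%                     if v not in seen:
%%%                         seen.add(v)
%%%                         stack.append(v)
%%%             return len(seen)
%%%         return sum(reach(u) for u in adj) / len(adj)
%%%
%%%     def block_links(adj, budget):
%%%         adj = {u: set(vs) for u, vs in adj.items()}
%%%         for _ in range(budget):
%%%             edges = [(u, v) for u in adj for v in adj[u]]
%%%             if not edges:
%%%                 break
%%%             def spread_without(e):
%%%                 u, v = e
%%%                 adj[u].discard(v)
%%%                 s = avg_reachable(adj)
%%%                 adj[u].add(v)
%%%                 return s
%%%             u, v = min(edges, key=spread_without)
%%%             adj[u].discard(v)
%%%         return adj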

@Article{Agichtein:2009:MIS,
  author =       "Eugene Agichtein and Yandong Liu and Jiang Bian",
  title =        "Modeling information-seeker satisfaction in community
                 question answering",
  journal =      j-TKDD,
  volume =       "3",
  number =       "2",
  pages =        "10:1--10:??",
  month =        apr,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1514888.1514893",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Fri Apr 24 18:00:12 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Question Answering Communities such as Naver, Baidu
                 Knows, and Yahoo! Answers have emerged as popular, and
                 often effective, means of information seeking on the
                 web. By posting questions for other participants to
                 answer, information seekers can obtain specific answers
                  to their questions. Users of community question
                  answering (CQA) portals have already
                 contributed millions of questions, and received
                 hundreds of millions of answers from other
                 participants. However, CQA is not always effective: in
                 some cases, a user may obtain a perfect answer within
                 minutes, and in others it may require hours --- and
                 sometimes days --- until a satisfactory answer is
                 contributed. We investigate the problem of predicting
                 information seeker satisfaction in collaborative
                 question answering communities, where we attempt to
                 predict whether a question author will be satisfied
                 with the answers submitted by the community
                 participants. We present a general prediction model,
                 and develop a variety of content, structure, and
                 community-focused features for this task. Our
                 experimental results, obtained from a large-scale
                 evaluation over thousands of real questions and user
                 ratings, demonstrate the feasibility of modeling and
                 predicting asker satisfaction. We complement our
                 results with a thorough investigation of the
                 interactions and information seeking patterns in
                 question answering communities that correlate with
                 information seeker satisfaction. We also explore {\em
                 personalized\/} models of asker satisfaction, and show
                 that when sufficient interaction history exists,
                 personalization can significantly improve prediction
                 accuracy over a ``one-size-fits-all'' model. Our models
                 and predictions could be useful for a variety of
                 applications, such as user intent inference, answer
                 ranking, interface design, and query suggestion and
                 routing.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Community question answering; information seeker
                 satisfaction",
}

@Article{Torvik:2009:AND,
  author =       "Vetle I. Torvik and Neil R. Smalheiser",
  title =        "Author name disambiguation in {MEDLINE}",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1552303.1552304",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "{\em Background\/}: We recently described
                 ``Author-ity,'' a model for estimating the probability
                 that two articles in MEDLINE, sharing the same author
                 name, were written by the same individual. Features
                 include shared title words, journal name, coauthors,
                 medical subject headings, language, affiliations, and
                 author name features (middle initial, suffix, and
                 prevalence in MEDLINE). Here we test the hypothesis
                 that the Author-ity model will suffice to disambiguate
                 author names for the vast majority of articles in
                 MEDLINE. {\em Methods\/}: Enhancements include: (a)
                 incorporating first names and their variants, email
                 addresses, and correlations between specific last names
                 and affiliation words; (b) new methods of generating
                 large unbiased training sets; (c) new methods for
                 estimating the prior probability; (d) a weighted least
                 squares algorithm for correcting transitivity
                 violations; and (e) a maximum likelihood based
                 agglomerative algorithm for computing clusters of
                 articles that represent inferred author-individuals.
                  {\em Results\/}: Pairwise comparisons were computed
                  for all author names sharing last name and first
                  initial, on all 15.3 million articles in MEDLINE
                  (2006 baseline), to create Author-ity 2006, a
                  database that has
                 each name on each article assigned to one of 6.7
                 million inferred author-individual clusters. Recall is
                 estimated at $ \approx 98.8 \% $. Lumping (putting two
                 different individuals into the same cluster) affects $
                 \approx 0.5 \% $ of clusters, whereas splitting
                 (assigning articles written by the same individual to $
                 > 1 $ cluster) affects $ \approx 2 \% $ of articles.
                 {\em Impact\/}: The Author-ity model can be applied
                 generally to other bibliographic databases. Author name
                 disambiguation allows information retrieval and data
                 integration to become {\em person-centered}, not just
                 {\em document-centered}, setting the stage for new data
                 mining and social network tools that will facilitate
                 the analysis of scholarly publishing and collaboration
                 behavior. {\em Availability\/}: The Author-ity 2006
                 database is available for nonprofit academic research,
                 and can be freely queried via
                 http://arrowsmith.psych.uic.edu.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "bibliographic databases; Name disambiguation",
}
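
%%% The clustering step (e) invites a sketch: start from singleton
%%% clusters and keep merging the pair with the highest average
%%% pairwise match probability. Here match_prob(a, b) stands in for
%%% the Author-ity model's probability that articles a and b share
%%% an author-individual; the threshold and linkage rule are
%%% guesses, not the article's maximum-likelihood criterion.
%%%
%%%     def agglomerate(articles, match_prob, threshold=0.5):
%%%         clusters = [[a] for a in articles]
%%%
%%%         def avg_prob(c1, c2):
%%%             s = sum(match_prob(a, b) for a in c1 for b in c2)
%%%             return s / (len(c1) * len(c2))
%%%
%%%         while len(clusters) > 1:
%%%             pairs = [(i, j) for i in range(len(clusters))
%%%                      for j in range(i + 1, len(clusters))]
%%%             i, j = max(pairs, key=lambda p:
%%%                        avg_prob(clusters[p[0]], clusters[p[1]]))
%%%             if avg_prob(clusters[i], clusters[j]) < threshold:
%%%                 break  # no pair is likely enough to merge
%%%             clusters[i].extend(clusters.pop(j))
%%%         return clusters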

@Article{Tu:2009:SDC,
  author =       "Li Tu and Yixin Chen",
  title =        "Stream data clustering based on grid density and
                 attraction",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1552303.1552305",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering real-time stream data is an important and
                 challenging problem. Existing algorithms such as
                 CluStream are based on the {\em k\/} -means algorithm.
                 These clustering algorithms have difficulties finding
                 clusters of arbitrary shapes and handling outliers.
                  Further, they require knowledge of {\em k\/} and a
                  user-specified time window. To address these issues,
                 this article proposes {\em D-Stream}, a framework for
                 clustering stream data using a density-based
                 approach.\par

                 Our algorithm uses an online component that maps each
                 input data record into a grid and an offline component
                 that computes the grid density and clusters the grids
                 based on the density. The algorithm adopts a density
                 decaying technique to capture the dynamic changes of a
                  data stream and an attraction-based mechanism to
                 accurately generate cluster boundaries.\par

                 Exploiting the intricate relationships among the decay
                 factor, attraction, data density, and cluster
                 structure, our algorithm can efficiently and
                 effectively generate and adjust the clusters in real
                 time. Further, a theoretically sound technique is
                 developed to detect and remove sporadic grids mapped by
                 outliers in order to dramatically improve the space and
                 time efficiency of the system. The technique makes
                 high-speed data stream clustering feasible without
                 degrading the clustering quality. The experimental
                 results show that our algorithm has superior quality
                 and efficiency, can find clusters of arbitrary shapes,
                 and can accurately recognize the evolving behaviors of
                 real-time data streams.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "clustering; data mining; density-based algorithms;
                 Stream data",
}
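
%%% The density-decay idea at the heart of the framework fits in a
%%% dozen lines. A cell's density gains 1 per arriving record and
%%% decays geometrically in between, applied lazily at update time;
%%% the decay constant is illustrative.
%%%
%%%     class Grid:
%%%         def __init__(self, decay=0.998):
%%%             self.decay = decay
%%%             self.density = 0.0
%%%             self.t_last = 0
%%%
%%%         def add_record(self, t):
%%%             # D(g,t) = decay^(t - t_last) * D(g,t_last) + 1
%%%             elapsed = t - self.t_last
%%%             self.density = self.density * self.decay ** elapsed + 1.0
%%%             self.t_last = t
%%%
%%%     g = Grid()
%%%     for t in (1, 2, 10):
%%%         g.add_record(t)
%%%     print(round(g.density, 3))  # 2.966: early records have faded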

@Article{Zhou:2009:LST,
  author =       "Bin Zhou and Jian Pei",
  title =        "Link spam target detection using page farms",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1552303.1552306",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Currently, most popular Web search engines adopt some
                 link-based ranking methods such as PageRank. Driven by
                 the huge potential benefit of improving rankings of Web
                 pages, many tricks have been attempted to boost page
                 rankings. The most common way, which is known as link
                 spam, is to make up some artificially designed link
                 structures. Detecting link spam effectively is a big
                 challenge. In this article, we develop novel and
                 effective detection methods for link spam target pages
                 using page farms. The essential idea is intuitive:
                 whether a page is the beneficiary of link spam is
                 reflected by how it collects its PageRank score.
                 Technically, how a target page collects its PageRank
                 score is modeled by a page farm, which consists of
                 pages contributing a major portion of the PageRank
                 score of the target page. We propose two spamicity
                 measures based on page farms. They can be used as an
                 effective measure to check whether the pages are link
                 spam target pages. An empirical study using a newly
                 available real dataset strongly suggests that our
                  method is effective. It outperforms state-of-the-art
                  methods such as SpamRank and SpamMass in
                 both precision and recall.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Link Spam; Page Farm; PageRank",
}

@Article{Wan:2009:DBC,
  author =       "Li Wan and Wee Keong Ng and Xuan Hong Dang and Philip
                 S. Yu and Kuan Zhang",
  title =        "Density-based clustering of data streams at multiple
                 resolutions",
  journal =      j-TKDD,
  volume =       "3",
  number =       "3",
  pages =        "14:1--14:??",
  month =        jul,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1552303.1552307",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:36:58 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In data stream clustering, it is desirable to have
                 algorithms that are able to detect clusters of
                 arbitrary shape, clusters that evolve over time, and
                 clusters with noise. Existing stream data clustering
                 algorithms are generally based on an online-offline
                 approach: The online component captures synopsis
                 information from the data stream (thus, overcoming
                 real-time and memory constraints) and the offline
                 component generates clusters using the stored synopsis.
                 The online-offline approach affects the overall
                 performance of stream data clustering in various ways:
                 the ease of deriving synopsis from streaming data; the
                 complexity of data structure for storing and managing
                 synopsis; and the frequency at which the offline
                 component is used to generate clusters. In this
                 article, we propose an algorithm that (1) computes and
                 updates synopsis information in constant time; (2)
                 allows users to discover clusters at multiple
                 resolutions; (3) determines the right time for users to
                 generate clusters from the synopsis information; (4)
                 generates clusters of higher purity than existing
                 algorithms; and (5) determines the right threshold
                 function for density-based clustering based on the
                 fading model of stream data. To the best of our
                  knowledge, no existing data stream algorithm has all
                 of these features. Experimental results show that our
                 algorithm is able to detect arbitrarily shaped,
                 evolving clusters with high quality.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Data mining algorithms; density based clustering;
                 evolving data streams",
}

@Article{Mannila:2009:ATS,
  author =       "Heikki Mannila and Dimitrios Gunopulos",
  title =        "{ACM TKDD} special issue {ACM SIGKDD 2007} and {ACM
                 SIGKDD 2008}",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1631162.1631163",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Asur:2009:EBF,
  author =       "Sitaram Asur and Srinivasan Parthasarathy and Duygu
                 Ucar",
  title =        "An event-based framework for characterizing the
                 evolutionary behavior of interaction graphs",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1631162.1631164",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Interaction graphs are ubiquitous in many fields such
                 as bioinformatics, sociology and physical sciences.
                 There have been many studies in the literature targeted
                 at studying and mining these graphs. However, almost
                 all of them have studied these graphs from a static
                 point of view. The study of the evolution of these
                 graphs over time can provide tremendous insight on the
                 behavior of entities, communities and the flow of
                 information among them. In this work, we present an
                 event-based characterization of critical behavioral
                 patterns for temporally varying interaction graphs. We
                 use nonoverlapping snapshots of interaction graphs and
                 develop a framework for capturing and identifying
                 interesting events from them. We use these events to
                 characterize complex behavioral patterns of individuals
                 and communities over time. We show how semantic
                 information can be incorporated to reason about
                 community-behavior events. We also demonstrate the
                 application of behavioral patterns for the purposes of
                 modeling evolution, link prediction and influence
                 maximization. Finally, we present a diffusion model for
                 evolving networks, based on our framework.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "diffusion of innovations; Dynamic interaction
                 networks; evolutionary analysis",
}
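
%%% The event-based characterization can be sketched as set algebra
%%% over consecutive snapshots: label a community as continuing,
%%% merging, splitting, forming, or dissolving according to its
%%% overlap with the previous snapshot's communities. The overlap
%%% threshold kappa and the exact labels are illustrative, not the
%%% article's definitions.
%%%
%%%     def community_events(prev, curr, kappa=0.5):
%%%         events = []
%%%         def linked(a, b):
%%%             return len(a & b) >= kappa * min(len(a), len(b))
%%%         for c in curr:
%%%             parents = [p for p in prev if linked(p, c)]
%%%             if len(parents) > 1:
%%%                 events.append(("merge", parents, c))
%%%             elif len(parents) == 1:
%%%                 events.append(("continue", parents[0], c))
%%%             else:
%%%                 events.append(("form", c))
%%%         for p in prev:
%%%             children = [c for c in curr if linked(p, c)]
%%%             if len(children) > 1:
%%%                 events.append(("split", p, children))
%%%             elif not children:
%%%                 events.append(("dissolve", p))
%%%         return events
%%%
%%%     print(community_events([{1, 2, 3}, {4, 5}], [{1, 2, 3, 4, 5}]))
%%%     # one "merge" event for the combined community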

@Article{Chi:2009:ESC,
  author =       "Yun Chi and Xiaodan Song and Dengyong Zhou and Koji
                 Hino and Belle L. Tseng",
  title =        "On evolutionary spectral clustering",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1631162.1631165",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Evolutionary clustering is an emerging research area
                 essential to important applications such as clustering
                 dynamic Web and blog contents and clustering data
                 streams. In evolutionary clustering, a good clustering
                 result should fit the current data well, while
                 simultaneously not deviate too dramatically from the
                 recent history. To fulfill this dual purpose, a measure
                 of {\em temporal smoothness\/} is integrated in the
                 overall measure of clustering quality. In this article,
                 we propose two frameworks that incorporate temporal
                 smoothness in evolutionary spectral clustering. For
                 both frameworks, we start with intuitions gained from
                 the well-known {\em k\/} -means clustering problem, and
                 then propose and solve corresponding cost functions for
                 the evolutionary spectral clustering problems. Our
                 solutions to the evolutionary spectral clustering
                 problems provide more stable and consistent clustering
                 results that are less sensitive to short-term noises
                 while at the same time are adaptive to long-term
                 cluster drifts. Furthermore, we demonstrate that our
                 methods provide the optimal solutions to the relaxed
                 versions of the corresponding evolutionary {\em k\/}
                 -means clustering problems. Performance experiments
                 over a number of real and synthetic data sets
                  illustrate that our evolutionary spectral clustering
                  methods provide more robust clustering results that
                  are not
                 sensitive to noise and can adapt to data drifts.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Evolutionary spectral clustering; preserving cluster
                 membership; preserving cluster quality; temporal
                 smoothness",
}
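
%%% One way to see the temporal-smoothness idea is as a blended
%%% kernel: mix the current similarity matrix with a rank-k kernel
%%% built from the previous embedding before taking eigenvectors.
%%% This mirrors the spirit of the two frameworks, not their exact
%%% cost functions; alpha plays the role of the trade-off weight.
%%%
%%%     import numpy as np
%%%
%%%     def evolutionary_spectral_embed(W_t, U_prev, alpha, k):
%%%         # Blend current similarities with the historic structure.
%%%         smoothed = alpha * W_t + (1 - alpha) * U_prev.dot(U_prev.T)
%%%         vals, vecs = np.linalg.eigh(smoothed)
%%%         # Top-k eigenvectors: the embedding handed to k-means.
%%%         return vecs[:, -k:]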

@Article{Fujiwara:2009:FLS,
  author =       "Yasuhiro Fujiwara and Yasushi Sakurai and Masaru
                 Kitsuregawa",
  title =        "Fast likelihood search for hidden {Markov} models",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "18:1--18:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1631162.1631166",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Hidden Markov models (HMMs) are receiving considerable
                 attention in various communities and many applications
                 that use HMMs have emerged such as mental task
                 classification, biological analysis, traffic
                  monitoring, and anomaly detection. This article has
                  two goals. The first is exact and efficient
                  identification of the model whose state sequence has
                  the highest likelihood for the given query sequence
                  (more precisely, no HMM that actually has a
                  high-probability path for the given sequence is
                  missed by the algorithm), and the second is exact and
                  efficient monitoring of streaming data sequences to
                  find the best model. We propose SPIRAL, a fast search
                  method for HMM datasets. SPIRAL is based on three
                  ideas: (1) it clusters states of models to compute
                 approximate likelihood, (2) it uses several
                 granularities and approximates likelihood values in
                 search processing, and (3) it focuses on just the
                 promising likelihood computations by pruning out
                 low-likelihood state sequences. Experiments verify the
                 effectiveness of SPIRAL and show that it is more than
                 490 times faster than the naive method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Hidden Markov model; likelihood; upper bound",
}
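
%%% The quantity SPIRAL approximates and prunes is the standard HMM
%%% likelihood. The exact computation (the forward algorithm) is a
%%% few lines of NumPy; SPIRAL's contribution is avoiding it for
%%% most models via clustered states and upper bounds.
%%%
%%%     import numpy as np
%%%
%%%     def forward_likelihood(pi, A, B, obs):
%%%         # pi: initial distribution, A: transitions, B: emissions.
%%%         alpha = pi * B[:, obs[0]]
%%%         for o in obs[1:]:
%%%             alpha = alpha.dot(A) * B[:, o]
%%%         return alpha.sum()
%%%
%%%     pi = np.array([0.6, 0.4])
%%%     A = np.array([[0.7, 0.3], [0.4, 0.6]])
%%%     B = np.array([[0.9, 0.1], [0.2, 0.8]])
%%%     print(forward_likelihood(pi, A, B, [0, 1, 0]))  # ~0.109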

@Article{Zhang:2009:EAG,
  author =       "Xiang Zhang and Fei Zou and Wei Wang",
  title =        "Efficient algorithms for genome-wide association
                 study",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1631162.1631167",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Studying the association between quantitative
                 phenotype (such as height or weight) and single
                 nucleotide polymorphisms (SNPs) is an important problem
                 in biology. To understand underlying mechanisms of
                 complex phenotypes, it is often necessary to consider
                  joint genetic effects across multiple SNPs. The ANOVA
                  (analysis of variance) test is routinely used in
                  association studies. Important findings from studying
                  gene-gene (SNP-pair) interactions are appearing in the
                  literature. However, the number of SNPs can run into
                  the millions. Evaluating joint effects of SNPs is a
                  challenging task even for SNP-pairs. Moreover, with a
                  large number of correlated SNPs, a permutation
                  procedure is preferred over simple Bonferroni
                  correction for properly controlling the family-wise
                  error rate and retaining mapping power, which
                  dramatically increases the computational cost of
                  association studies.\par

                 In this article, we study the problem of finding
                 SNP-pairs that have significant associations with a
                 given quantitative phenotype. We propose an efficient
                 algorithm, FastANOVA, for performing ANOVA tests on
                 SNP-pairs in a batch mode, which also supports large
                 permutation test. We derive an upper bound of SNP-pair
                 ANOVA test, which can be expressed as the sum of two
                 terms. The first term is based on single-SNP ANOVA
                 test. The second term is based on the SNPs and
                 independent of any phenotype permutation. Furthermore,
                 SNP-pairs can be organized into groups, each of which
                 shares a common upper bound. This allows for maximum
                 reuse of intermediate computation, efficient upper
                 bound estimation, and effective SNP-pair pruning.
                 Consequently, FastANOVA only needs to perform the ANOVA
                 test on a small number of candidate SNP-pairs without
                 the risk of missing any significant ones. Extensive
                 experiments demonstrate that FastANOVA is orders of
                 magnitude faster than the brute-force implementation of
                 ANOVA tests on all SNP pairs. The principles used in
                 FastANOVA can be applied to categorical phenotypes and
                 other statistics such as Chi-square test.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "ANOVA test; Association study; permutation test",
}
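
%%% The per-test statistic that FastANOVA batches is ordinary
%%% one-way ANOVA; for a SNP-pair, the group label is simply the
%%% pair of genotypes. The sketch below computes only the F
%%% statistic itself (the article's contribution is the shared upper
%%% bound that skips most of these computations); the toy data are
%%% illustrative.
%%%
%%%     import numpy as np
%%%
%%%     def anova_f(phenotype, genotype):
%%%         groups = [phenotype[genotype == g]
%%%                   for g in np.unique(genotype)]
%%%         n, k = len(phenotype), len(groups)
%%%         grand = phenotype.mean()
%%%         ss_between = sum(len(g) * (g.mean() - grand) ** 2
%%%                          for g in groups)
%%%         ss_within = sum(((g - g.mean()) ** 2).sum() for g in groups)
%%%         return (ss_between / (k - 1)) / (ss_within / (n - k))
%%%
%%%     pheno = np.array([1.0, 1.2, 0.9, 2.1, 2.3, 2.0])
%%%     geno = np.array([0, 0, 0, 1, 1, 1])
%%%     print(anova_f(pheno, geno))  # large F: strong association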

@Article{Bilgic:2009:RCM,
  author =       "Mustafa Bilgic and Lise Getoor",
  title =        "Reflect and correct: a misclassification prediction
                 approach to active inference",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "20:1--20:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1631162.1631168",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Information diffusion, viral marketing, graph-based
                 semi-supervised learning, and collective classification
                 all attempt to model and exploit the relationships
                 among nodes in a network to improve the performance of
                 node labeling algorithms. However, sometimes the
                 advantage of exploiting the relationships can become a
                 disadvantage. Simple models like label propagation and
                 iterative classification can aggravate a
                 misclassification by propagating mistakes in the
                 network, while more complex models that define and
                 optimize a global objective function, such as Markov
                 random fields and graph mincuts, can misclassify a set
                 of nodes jointly. This problem can be mitigated if the
                 classification system is allowed to ask for the correct
                 labels for a few of the nodes during inference.
                 However, determining the optimal set of labels to
                 acquire is intractable under relatively general
                 assumptions, which forces us to resort to approximate
                 and heuristic techniques. We describe three such
                 techniques in this article. The first one is based on
                 directly approximating the value of the objective
                 function of label acquisition and greedily acquiring
                 the label that provides the most improvement. The
                  second is a simple technique based on the
                 analogy we draw between viral marketing and label
                 acquisition. Finally, we propose a method, which we
                 refer to as {\em reflect and correct}, that can learn
                 and predict when the classification system is likely to
                 make mistakes and suggests acquisitions to correct
                 those mistakes. We empirically show on a variety of
                 synthetic and real-world datasets that the reflect and
                 correct method significantly outperforms the other two
                 techniques, as well as other approaches based on
                 network structural measures such as node degree and
                 network clustering.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Active inference; collective classification;
                 information diffusion; label acquisition; viral
                 marketing",
}

@Article{Kiernan:2009:CCS,
  author =       "Jerry Kiernan and Evimaria Terzi",
  title =        "Constructing comprehensive summaries of large event
                 sequences",
  journal =      j-TKDD,
  volume =       "3",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1631162.1631169",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:13 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Event sequences capture system and user activity over
                 time. Prior research on sequence mining has mostly
                 focused on discovering local patterns appearing in a
                 sequence. While interesting, these patterns do not give
                 a comprehensive summary of the entire event sequence.
                 Moreover, the number of patterns discovered can be
                 large. In this article, we take an alternative approach
                 and build {\em short\/} summaries that describe an
                 entire sequence, and discover local dependencies
                 between event types.\par

                 We formally define the summarization problem as an
                 optimization problem that balances shortness of the
                 summary with accuracy of the data description. We show
                 that this problem can be solved optimally in polynomial
                 time by using a combination of two dynamic-programming
                 algorithms. We also explore more efficient greedy
                 alternatives and demonstrate that they work well on
                 large datasets. Experiments on both synthetic and real
                 datasets illustrate that our algorithms are efficient
                 and produce high-quality results, and reveal
                 interesting local structures in the data.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Event sequences; log mining; summarization",
}
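
%%% The optimal-summary computation reduces to a classic
%%% segmentation dynamic program: choose boundaries minimizing the
%%% total cost of describing each segment. seg_cost(i, j) stands in
%%% for the article's balance of summary length and description
%%% accuracy; only the DP skeleton is shown.
%%%
%%%     def best_segmentation(n, seg_cost):
%%%         # opt[j]: cheapest summary of the first j positions.
%%%         opt = [0.0] + [float("inf")] * n
%%%         cut = [0] * (n + 1)
%%%         for j in range(1, n + 1):
%%%             for i in range(j):
%%%                 c = opt[i] + seg_cost(i, j)
%%%                 if c < opt[j]:
%%%                     opt[j], cut[j] = c, i
%%%         # Walk the cut pointers back to recover the segments.
%%%         bounds, j = [], n
%%%         while j > 0:
%%%             bounds.append((cut[j], j))
%%%             j = cut[j]
%%%         return opt[n], bounds[::-1]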

@Article{Koren:2010:FNS,
  author =       "Yehuda Koren",
  title =        "Factor in the neighbors: {Scalable} and accurate
                 collaborative filtering",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1644873.1644874",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recommender systems provide users with personalized
                 suggestions for products or services. These systems
                  often rely on collaborative filtering (CF), where past
                 transactions are analyzed in order to establish
                 connections between users and products. The most common
                 approach to CF is based on neighborhood models, which
                 originate from similarities between products or users.
                 In this work we introduce a new neighborhood model with
                 an improved prediction accuracy. Unlike previous
                 approaches that are based on heuristic similarities, we
                 model neighborhood relations by minimizing a global
                 cost function. Further accuracy improvements are
                 achieved by extending the model to exploit both
                 explicit and implicit feedback by the users. Past
                  models were limited by the need to compute all
                  pairwise similarities between items or users, whose
                  number grows quadratically with input size. In
                  particular, this
                 limitation vastly complicates adopting user similarity
                 models, due to the typical large number of users. Our
                 new model solves these limitations by factoring the
                 neighborhood model, thus making both item-item and
                 user-user implementations scale linearly with the size
                 of the data. The methods are tested on the Netflix
                 data, with encouraging results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "collaborative filtering; Netflix Prize; Recommender
                 systems",
}
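
%%% The factoring trick in the abstract can be shown in one
%%% prediction rule: the item-item weight w_ij is represented as a
%%% dot product of two low-dimensional vectors, so no quadratic
%%% similarity matrix is ever stored. All names below (rated, r,
%%% baseline, X, Y) are illustrative stand-ins, not the paper's
%%% notation.
%%%
%%%     import numpy as np
%%%
%%%     def predict(mu, b_u, b_i, rated, r, baseline, X, Y, u, i):
%%%         items = rated[u]                    # items rated by user u
%%%         dev = np.array([r[u][j] - baseline[u][j] for j in items])
%%%         w = X[i].dot(Y[items].T)            # factored weights w_ij
%%%         return (mu + b_u[u] + b_i[i]
%%%                 + dev.dot(w) / np.sqrt(len(items)))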

@Article{Syed:2010:MDP,
  author =       "Zeeshan Syed and Collin Stultz and Manolis Kellis and
                 Piotr Indyk and John Guttag",
  title =        "Motif discovery in physiological datasets: a
                 methodology for inferring predictive elements",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1644873.1644875",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we propose a methodology for
                 identifying predictive physiological patterns in the
                 absence of prior knowledge. We use the principle of
                 conservation to identify activity that consistently
                 precedes an outcome in patients, and describe a
                 two-stage process that allows us to efficiently search
                 for such patterns in large datasets. This involves
                 first transforming continuous physiological signals
                 from patients into symbolic sequences, and then
                 searching for patterns in these reduced representations
                 that are strongly associated with an outcome.\par

                 Our strategy of identifying conserved activity that is
                 unlikely to have occurred purely by chance in symbolic
                 data is analogous to the discovery of regulatory motifs
                 in genomic datasets. We build upon existing work in
                 this area, generalizing the notion of a regulatory
                 motif and enhancing current techniques to operate
                 robustly on non-genomic data. We also address two
                 significant considerations associated with motif
                 discovery in general: computational efficiency and
                 robustness in the presence of degeneracy and noise. To
                 deal with these issues, we introduce the concept of
                 active regions and new subset-based techniques such as
                 a two-layer Gibbs sampling algorithm. These extensions
                 allow for a framework for information inference, where
                 precursors are identified as approximately conserved
                 activity of arbitrary complexity preceding multiple
                 occurrences of an event.\par

                 We evaluated our solution on a population of patients
                 who experienced sudden cardiac death and attempted to
                 discover electrocardiographic activity that may be
                 associated with the endpoint of death. To assess the
                 predictive patterns discovered, we compared likelihood
                 scores for motifs in the sudden death population
                 against control populations of normal individuals and
                 those with non-fatal supraventricular arrhythmias. Our
                 results suggest that predictive motif discovery may be
                 able to identify clinically relevant information even
                 in the absence of significant prior knowledge.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "data mining; Gibbs sampling; inference; knowledge
                 discovery; motifs; physiological signals",
}

@Article{Webb:2010:SSI,
  author =       "Geoffrey I. Webb",
  title =        "Self-sufficient itemsets: an approach to screening
                 potentially interesting associations between items",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1644873.1644876",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Self-sufficient itemsets are those whose frequency
                 cannot be explained solely by the frequency of either
                 their subsets or of their supersets. We argue that
                 itemsets that are not self-sufficient will often be of
                 little interest to the data analyst, as their frequency
                 should be expected once that of the itemsets on which
                 their frequency depends is known. We present tests for
                 statistically sound discovery of self-sufficient
                 itemsets, and computational techniques that allow those
                 tests to be applied as a post-processing step for any
                 itemset discovery algorithm. We also present a measure
                 for assessing the degree of potential interest in an
                 itemset that complements these statistical measures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Association discovery; association rules; itemset
                 discovery; itemset screening; statistical evaluation",
}
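
%%% A minimal sketch of the kind of statistical screen the article
%%% describes (illustrative only; Webb additionally controls for
%%% multiple testing, which this omits): an itemset passes only if
%%% every binary partition (X, Y) of it shows positive association
%%% beyond what independence of X and Y would explain, here via a
%%% one-sided Fisher exact test.
%%%
%%%     from itertools import combinations
%%%     from scipy.stats import fisher_exact
%%%
%%%     def cover(items, transactions):
%%%         return {i for i, t in enumerate(transactions) if items <= t}
%%%
%%%     def passes_partition_tests(itemset, transactions, alpha=0.05):
%%%         n = len(transactions)
%%%         items = frozenset(itemset)
%%%         for r in range(1, len(items)):
%%%             for x in combinations(sorted(items), r):
%%%                 X, Y = frozenset(x), items - frozenset(x)
%%%                 cx, cy = cover(X, transactions), cover(Y, transactions)
%%%                 a = len(cx & cy)       # X and Y together
%%%                 b = len(cx) - a        # X without Y
%%%                 c = len(cy) - a        # Y without X
%%%                 d = n - a - b - c      # neither
%%%                 _, p = fisher_exact([[a, b], [c, d]],
%%%                                     alternative="greater")
%%%                 if p > alpha:          # explainable by independence
%%%                     return False
%%%         return True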

@Article{Plantevit:2010:MMM,
  author =       "Marc Plantevit and Anne Laurent and Dominique Laurent
                 and Maguelonne Teisseire and Yeow Wei Choong",
  title =        "Mining multidimensional and multilevel sequential
                 patterns",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1644873.1644877",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multidimensional databases have been designed to
                 provide decision makers with the necessary tools to
                  help them understand their data. This setting differs
                  from the transactional one in that the datasets
                  contain huge volumes of historicized and aggregated
                  data defined over a set of dimensions that can be
                  arranged through multiple levels of granularity. Many
                 tools have been proposed to query the data and navigate
                 through the levels of granularity. However, automatic
                  tools for mining this type of data to discover
                  regular, specific patterns are still missing. In this
                 article, we present a method for mining sequential
                  patterns from multidimensional databases, taking
                  advantage of the different dimensions and levels of
                  granularity at the same time; this distinguishes it
                  from existing work. The necessary definitions and
                  algorithms
                 are extended from regular sequential patterns to this
                 particular case. Experiments are reported, showing the
                 significance of this approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "frequent patterns; hierarchy; multidimensional
                 databases; multilevel patterns; Sequential patterns",
}
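
%%% To make the multilevel idea concrete, a toy Python sketch (the
%%% taxonomy, single-dimension items, and pattern syntax are invented
%%% for illustration; the article handles several dimensions at once):
%%% a pattern element matches any event that generalizes to it through
%%% the hierarchy, so support can be counted at any level.
%%%
%%%     TAXONOMY = {"paris": "france", "lyon": "france", "nyc": "usa",
%%%                 "france": "europe", "usa": "america"}
%%%
%%%     def generalizes(event, pattern_item):
%%%         while event is not None:
%%%             if event == pattern_item:
%%%                 return True
%%%             event = TAXONOMY.get(event)    # climb the hierarchy
%%%         return False
%%%
%%%     def matches(seq, pattern):
%%%         i = 0
%%%         for event in seq:                  # greedy subsequence test
%%%             if i < len(pattern) and generalizes(event, pattern[i]):
%%%                 i += 1
%%%         return i == len(pattern)
%%%
%%%     def support(db, pattern):
%%%         return sum(matches(s, pattern) for s in db) / len(db)
%%%
%%%     # support(db, ("france", "usa")) counts sequences where some
%%%     # French city is later followed by some US city.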

@Article{Zaki:2010:VVO,
  author =       "Mohammed J. Zaki and Christopher D. Carothers and
                 Boleslaw K. Szymanski",
  title =        "{VOGUE}: a variable order hidden {Markov} model with
                 duration based on frequent sequence mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jan,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1644873.1644878",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Tue Mar 16 18:37:37 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present VOGUE, a novel, variable order hidden
                 Markov model with state durations, that combines two
                 separate techniques for modeling complex patterns in
                 sequential data: pattern mining and data modeling.
                 VOGUE relies on a variable gap sequence mining method
                 to extract frequent patterns with different lengths and
                 gaps between elements. It then uses these mined
                  sequences to build a variable-order hidden Markov model
                  (HMM) that explicitly models the gaps. The gaps
                 implicitly model the order of the HMM, and they
                 explicitly model the duration of each state. We apply
                 VOGUE to a variety of real sequence data taken from
                 domains such as protein sequence classification, Web
                 usage logs, intrusion detection, and spelling
                 correction. We show that VOGUE has superior
                 classification accuracy compared to regular HMMs,
                  higher-order HMMs, and even special-purpose HMMs such
                  as HMMER, a state-of-the-art method for protein
                  classification. The VOGUE implementation and the
                  datasets used in this article are available as open
                  source.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "Hidden Markov models; higher-order HMM; HMM with
                 duration; sequence mining and modeling; variable-order
                 HMM",
}
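
%%% A sketch of the variable-gap mining ingredient (simplified to
%%% symbol pairs; the parameters are assumptions): count pairs (a, b)
%%% that occur within a bounded gap, plus their empirical gap
%%% distribution -- the raw material from which VOGUE-style gap states
%%% and state durations can be built.
%%%
%%%     from collections import Counter, defaultdict
%%%
%%%     def mine_gapped_pairs(seqs, maxgap=4, minsup=2):
%%%         pair_count = Counter()
%%%         gap_dist = defaultdict(Counter)
%%%         for s in seqs:
%%%             for i, a in enumerate(s):
%%%                 for g in range(maxgap + 1):
%%%                     j = i + 1 + g
%%%                     if j < len(s):
%%%                         pair_count[(a, s[j])] += 1
%%%                         gap_dist[(a, s[j])][g] += 1  # duration stats
%%%         return {pair: dict(gap_dist[pair])
%%%                 for pair, c in pair_count.items() if c >= minsup}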

@Article{Vadera:2010:CCS,
  author =       "Sunil Vadera",
  title =        "{CSNL}: a cost-sensitive non-linear decision tree
                 algorithm",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "6:1--6:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1754428.1754429",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article presents a new decision tree learning
                 algorithm called CSNL that induces Cost-Sensitive
                 Non-Linear decision trees. The algorithm is based on
                 the hypothesis that nonlinear decision nodes provide a
                 better basis than axis-parallel decision nodes and
                 utilizes discriminant analysis to construct nonlinear
                 decision trees that take account of costs of
                 misclassification.\par

                 The performance of the algorithm is evaluated by
                  applying it to seventeen datasets, and the results are
                  compared with those obtained by two well-known
                 cost-sensitive algorithms, ICET and MetaCost, which
                 generate multiple trees to obtain some of the best
                 results to date. The results show that CSNL performs at
                  least as well as, if not better than, these algorithms
                  on more than twelve of the datasets and is considerably
                 faster. The use of bagging with CSNL further enhances
                 its performance showing the significant benefits of
                 using nonlinear decision nodes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "cost-sensitive learning; Decision tree learning",
}
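
%%% A sketch of a cost-sensitive nonlinear (quadratic) decision node in
%%% the spirit of the abstract (this is generic cost-shifted quadratic
%%% discriminant analysis, not the published CSNL induction procedure;
%%% handling costs via log-shifted priors is an assumption):
%%%
%%%     import numpy as np
%%%
%%%     def qda_params(X):
%%%         mu = X.mean(axis=0)
%%%         cov = np.cov(X, rowvar=False) + 1e-6 * np.eye(X.shape[1])
%%%         return mu, np.linalg.inv(cov), np.linalg.slogdet(cov)[1]
%%%
%%%     def fit_cost_sensitive_node(X0, X1, c01=1.0, c10=1.0):
%%%         # c01: cost of predicting class 0 on a true 1 (c10: the
%%%         # converse); expensive mistakes push the quadratic boundary
%%%         # away from that class
%%%         p0, p1 = qda_params(X0), qda_params(X1)
%%%         prior0 = np.log(len(X0)) + np.log(c10)
%%%         prior1 = np.log(len(X1)) + np.log(c01)
%%%         def decide(x):
%%%             def score(p, prior):
%%%                 mu, prec, logdet = p
%%%                 d = x - mu
%%%                 return prior - 0.5 * (d @ prec @ d) - 0.5 * logdet
%%%             return int(score(p1, prior1) > score(p0, prior0))
%%%         return decide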

@Article{Kandylas:2010:AKC,
  author =       "Vasileios Kandylas and S. Phineas Upham and Lyle H.
                 Ungar",
  title =        "Analyzing knowledge communities using foreground and
                 background clusters",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "7:1--7:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1754428.1754430",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Insight into the growth (or shrinkage) of ``knowledge
                 communities'' of authors that build on each other's
                 work can be gained by studying the evolution over time
                 of clusters of documents. We cluster documents based on
                 the documents they cite in common using the Streemer
                 clustering method, which finds cohesive foreground
                 clusters (the knowledge communities) embedded in a
                 diffuse background. We build predictive models with
                 features based on the citation structure, the
                 vocabulary of the papers, and the affiliations and
                 prestige of the authors and use these models to study
                 the drivers of community growth and the predictors of
                 how widely a paper will be cited. We find that
                 scientific knowledge communities tend to grow more
                 rapidly if their publications build on diverse
                  information and use narrow vocabulary, and that papers
                 that lie on the periphery of a community have the
                 highest impact, while those not in any community have
                 the lowest impact.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "citation analysis; clustering; community evolution;
                 knowledge communities; Text mining",
}

@Article{Ji:2010:SSL,
  author =       "Shuiwang Ji and Lei Tang and Shipeng Yu and Jieping
                 Ye",
  title =        "A shared-subspace learning framework for multi-label
                 classification",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1754428.1754431",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-label problems arise in various domains such as
                 multi-topic document categorization, protein function
                 prediction, and automatic image annotation. One natural
                 way to deal with such problems is to construct a binary
                 classifier for each label, resulting in a set of
                 independent binary classification problems. Since
                 multiple labels share the same input space, and the
                 semantics conveyed by different labels are usually
                 correlated, it is essential to exploit the correlation
                 information contained in different labels. In this
                 paper, we consider a general framework for extracting
                 shared structures in multi-label classification. In
                 this framework, a common subspace is assumed to be
                 shared among multiple labels. We show that the optimal
                 solution to the proposed formulation can be obtained by
                 solving a generalized eigenvalue problem, though the
                 problem is nonconvex. For high-dimensional problems,
                 direct computation of the solution is expensive, and we
                 develop an efficient algorithm for this case. One
                 appealing feature of the proposed framework is that it
                 includes several well-known algorithms as special
                 cases, thus elucidating their intrinsic relationships.
                 We further show that the proposed framework can be
                 extended to the kernel-induced feature space. We have
                 conducted extensive experiments on multi-topic web page
                 categorization and automatic gene expression pattern
                 image annotation tasks, and results demonstrate the
                 effectiveness of the proposed formulation in comparison
                 with several representative algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "gene expression pattern image annotation; kernel
                 methods; least squares loss; Multi-label
                 classification; shared subspace; singular value
                 decomposition; web page categorization",
}
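
%%% To make the eigenvalue route concrete, a CCA-flavored stand-in for
%%% the paper's formulation (the exact objective, the regularizer, and
%%% the per-label ridge step are assumptions): take the shared subspace
%%% from the top generalized eigenvectors of
%%% (X^T Y)(Y^T X) v = mu (X^T X + lam I) v.
%%%
%%%     import numpy as np
%%%     from scipy.linalg import eigh
%%%
%%%     def shared_subspace(X, Y, r, lam=1e-2):
%%%         d = X.shape[1]
%%%         A = X.T @ Y @ Y.T @ X
%%%         B = X.T @ X + lam * np.eye(d)
%%%         vals, vecs = eigh(A, B)        # ascending eigenvalues
%%%         return vecs[:, -r:]            # top-r shared directions
%%%
%%%     def fit_per_label(X, Y, Theta, lam=1e-2):
%%%         Z = X @ Theta                  # project into shared subspace
%%%         G = Z.T @ Z + lam * np.eye(Z.shape[1])
%%%         return np.linalg.solve(G, Z.T @ Y)   # (r x k) label weights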

@Article{Ruggieri:2010:DMD,
  author =       "Salvatore Ruggieri and Dino Pedreschi and Franco
                 Turini",
  title =        "Data mining for discrimination discovery",
  journal =      j-TKDD,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        may,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1754428.1754432",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Sat Aug 14 17:12:30 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In the context of civil rights law, discrimination
                 refers to unfair or unequal treatment of people based
                  on membership in a category or a minority, without
                 regard to individual merit. Discrimination in credit,
                 mortgage, insurance, labor market, and education has
                 been investigated by researchers in economics and human
                 sciences. With the advent of automatic decision support
                 systems, such as credit scoring systems, the ease of
                  data collection poses several challenges to data
                  analysts in the fight against discrimination. In this
                 article, we introduce the problem of discovering
                 discrimination through data mining in a dataset of
                 historical decision records, taken by humans or by
                 automatic systems. We formalize the processes of direct
                 and indirect discrimination discovery by modelling
                 protected-by-law groups and contexts where
                  discrimination occurs in a classification-rule-based
                  syntax. In essence, classification rules extracted from
                 the dataset allow for unveiling contexts of unlawful
                 discrimination, where the degree of burden over
                 protected-by-law groups is formalized by an extension
                 of the lift measure of a classification rule. In direct
                 discrimination, the extracted rules can be directly
                 mined in search of discriminatory contexts. In indirect
                 discrimination, the mining process needs some
                 background knowledge as a further input, for example,
                  census data, which, combined with the extracted rules,
                  might unveil contexts of discriminatory
                 decisions. A strategy adopted for combining extracted
                 classification rules with background knowledge is
                 called an inference model. In this article, we propose
                 two inference models and provide automatic procedures
                 for their implementation. An empirical assessment of
                 our results is provided on the German credit dataset
                 and on the PKDD Discovery Challenge 1999 financial
                 dataset.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
  keywords =     "classification rules; Discrimination",
}
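
%%% The lift extension can be illustrated in a few lines (column names
%%% and the `elift' spelling as used here are illustrative; the article
%%% develops the measure and the inference models formally): compare
%%% the confidence of a rule with and without the protected attribute
%%% in its antecedent.
%%%
%%%     import pandas as pd
%%%
%%%     def conf(df, antecedent, outcome):
%%%         mask = pd.Series(True, index=df.index)
%%%         for col, val in antecedent.items():
%%%             mask &= df[col] == val
%%%         sel = df[mask]
%%%         return (sel[outcome[0]] == outcome[1]).mean()
%%%
%%%     def elift(df, protected, context, outcome):
%%%         # > 1 means the protected group fares worse in this context
%%%         return (conf(df, {**context, **protected}, outcome) /
%%%                 conf(df, context, outcome))
%%%
%%%     # e.g. elift(records, {"group": "minority"}, {"zip": "10451"},
%%%     #            ("credit", "deny"))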

@Article{Thomas:2010:MMF,
  author =       "Lini T. Thomas and Satyanarayana R. Valluri and
                 Kamalakar Karlapalem",
  title =        "{MARGIN}: {Maximal} frequent subgraph mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839490.1839491",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Deodhar:2010:SFS,
  author =       "Meghana Deodhar and Joydeep Ghosh",
  title =        "{SCOAL}: a framework for simultaneous co-clustering
                 and learning from complex data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839490.1839492",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chen:2010:BBI,
  author =       "Jinlin Chen and Keli Xiao",
  title =        "{BISC}: a bitmap itemset support counting approach for
                 efficient frequent itemset mining",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839490.1839493",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Becchetti:2010:EAL,
  author =       "Luca Becchetti and Paolo Boldi and Carlos Castillo and
                 Aristides Gionis",
  title =        "Efficient algorithms for large-scale local triangle
                 counting",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839490.1839494",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zhang:2010:MDR,
  author =       "Yin Zhang and Zhi-Hua Zhou",
  title =        "Multilabel dimensionality reduction via dependence
                 maximization",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839490.1839495",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Cui:2010:LMN,
  author =       "Ying Cui and Xiaoli Z. Fern and Jennifer G. Dy",
  title =        "Learning multiple nonredundant clusterings",
  journal =      j-TKDD,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1839490.1839496",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:57 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2010:TSI,
  author =       "Wei Wang",
  title =        "{TKDD} Special Issue: {SIGKDD 2009}",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857947.1857948",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chen:2010:BTA,
  author =       "Ye Chen and Dmitry Pavlov and John F. Canny",
  title =        "Behavioral Targeting: The Art of Scaling Up Simple
                 Algorithms",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857947.1857949",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Mohammed:2010:CDA,
  author =       "Noman Mohammed and Benjamin C. M. Fung and Patrick C.
                 K. Hung and Cheuk-Kwong Lee",
  title =        "Centralized and Distributed Anonymization for
                 High-Dimensional Healthcare Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857947.1857950",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Liu:2010:BBM,
  author =       "Chao Liu and Fan Guo and Christos Faloutsos",
  title =        "{Bayesian} Browsing Model: Exact Inference of Document
                 Relevance from Petabyte-Scale Data",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857947.1857951",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wu:2010:MAF,
  author =       "Mingxi Wu and Chris Jermaine and Sanjay Ranka and
                 Xiuyao Song and John Gums",
  title =        "A Model-Agnostic Framework for Fast Spatial Anomaly
                 Detection",
  journal =      j-TKDD,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1857947.1857952",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:58 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zhong:2010:ATS,
  author =       "Ning Zhong and Gregory Piatetsky-Shapiro and Yiyu Yao
                 and Philip S. Yu",
  title =        "{ACM TKDD} Special Issue on Knowledge Discovery for
                 {Web} Intelligence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870096.1870097",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Tang:2010:CAW,
  author =       "Jie Tang and Limin Yao and Duo Zhang and Jing Zhang",
  title =        "A Combination Approach to {Web} User Profiling",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870096.1870098",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Bouguessa:2010:DKS,
  author =       "Mohamed Bouguessa and Shengrui Wang and Benoit
                 Dumoulin",
  title =        "Discovering Knowledge-Sharing Communities in
                 Question-Answering Forums",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870096.1870099",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Plangprasopchok:2010:MSA,
  author =       "Anon Plangprasopchok and Kristina Lerman",
  title =        "Modeling Social Annotation: a {Bayesian} Approach",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870096.1870100",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Sakurai:2010:FDG,
  author =       "Yasushi Sakurai and Christos Faloutsos and Spiros
                 Papadimitriou",
  title =        "Fast Discovery of Group Lag Correlations in Streams",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870096.1870101",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Liu:2010:FCP,
  author =       "Kun Liu and Evimaria Terzi",
  title =        "A Framework for Computing the Privacy Scores of Users
                 in Online Social Networks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1870096.1870102",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:43:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Sun:2011:ISI,
  author =       "Jimeng Sun and Yan Liu and Jie Tang and Chid Apte",
  title =        "Introduction to Special Issue on Large-Scale Data
                 Mining",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1921632.1921633",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Kang:2011:HMR,
  author =       "U. Kang and Charalampos E. Tsourakakis and Ana Paula
                 Appel and Christos Faloutsos and Jure Leskovec",
  title =        "{HADI}: Mining Radii of Large Graphs",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1921632.1921634",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{deVries:2011:RRL,
  author =       "Timothy de Vries and Hui Ke and Sanjay Chawla and
                 Peter Christen",
  title =        "Robust Record Linkage Blocking Using Suffix Arrays and
                 {Bloom} Filters",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1921632.1921635",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Dunlavy:2011:TLP,
  author =       "Daniel M. Dunlavy and Tamara G. Kolda and Evrim Acar",
  title =        "Temporal Link Prediction Using Matrix and Tensor
                 Factorizations",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1921632.1921636",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Magdalinos:2011:ECQ,
  author =       "Panagis Magdalinos and Christos Doulkeridis and
                 Michalis Vazirgiannis",
  title =        "Enhancing Clustering Quality through Landmark-Based
                 Dimensionality Reduction",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1921632.1921637",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Cheng:2011:CLA,
  author =       "Hong Cheng and Yang Zhou and Jeffrey Xu Yu",
  title =        "Clustering Large Attributed Graphs: a Balance between
                 Structural and Attribute Similarities",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1921632.1921638",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Menon:2011:FAA,
  author =       "Aditya Krishna Menon and Charles Elkan",
  title =        "Fast Algorithms for Approximating the Singular Value
                 Decomposition",
  journal =      j-TKDD,
  volume =       "5",
  number =       "2",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1921632.1921639",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Mon Mar 28 11:44:01 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A low-rank approximation to a matrix $A$ is a matrix
                  with significantly smaller rank than $A$ that is
                  close to $A$ according to some norm. Many practical
                 applications involving the use of large matrices focus
                 on low-rank approximations. By reducing the rank or
                 dimensionality of the data, we reduce the complexity of
                 analyzing the data. The singular value decomposition is
                 the most popular low-rank matrix approximation.
                 However, due to its expensive computational
                 requirements, it has often been considered intractable
                 for practical applications involving massive data.
                 Recent developments have tried to address this problem,
                 with several methods proposed to approximate the
                 decomposition with better asymptotic runtime. We
                 present an empirical study of these techniques on a
                 variety of dense and sparse datasets. We find that a
                 sampling approach of Drineas, Kannan and Mahoney is
                 often, but not always, the best performing method. This
                 method gives solutions with high accuracy much faster
                 than classical SVD algorithms, on large sparse datasets
                 in particular. Other modern methods, such as a recent
                 algorithm by Rokhlin and Tygert, also offer savings
                 compared to classical SVD algorithms. The older
                 sampling methods of Achlioptas and McSherry are shown
                 to sometimes take longer than classical SVD.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
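
%%% The sampling method singled out in the abstract admits a compact
%%% sketch (a simplified reading of the Drineas--Kannan--Mahoney
%%% column-sampling scheme; parameter choices are assumptions): sample
%%% c columns with probability proportional to their squared norms,
%%% rescale, and recover approximate top-k singular pairs from the
%%% small c x c problem.
%%%
%%%     import numpy as np
%%%
%%%     def sampled_svd(A, c, k, seed=0):
%%%         rng = np.random.default_rng(seed)
%%%         p = (A ** 2).sum(axis=0)
%%%         p /= p.sum()
%%%         idx = rng.choice(A.shape[1], size=c, replace=True, p=p)
%%%         C = A[:, idx] / np.sqrt(c * p[idx])   # rescaled samples
%%%         w, V = np.linalg.eigh(C.T @ C)        # small c x c problem
%%%         top = np.argsort(w)[::-1][:k]
%%%         sigma = np.sqrt(np.maximum(w[top], 1e-12))
%%%         U = (C @ V[:, top]) / sigma           # approx. left vectors
%%%         return U, sigma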

@Article{Wang:2011:IDC,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yun Chi
                 and Yihong Gong",
  title =        "Integrating Document Clustering and Multidocument
                 Summarization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1993077.1993078",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Maier:2011:INS,
  author =       "Marc Maier and Matthew Rattigan and David Jensen",
  title =        "Indexing Network Structure with Shortest-Path Trees",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "15:1--15:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1993077.1993079",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wong:2011:CUA,
  author =       "Raymond Chi-Wing Wong and Ada Wai-Chee Fu and Ke Wang
                 and Philip S. Yu and Jian Pei",
  title =        "Can the Utility of Anonymized Data be Used for Privacy
                 Breaches?",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "16:1--16:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1993077.1993080",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Lin:2011:CDM,
  author =       "Yu-Ru Lin and Jimeng Sun and Hari Sundaram and Aisling
                 Kelliher and Paul Castro and Ravi Konuru",
  title =        "Community Discovery via Metagraph Factorization",
  journal =      j-TKDD,
  volume =       "5",
  number =       "3",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1993077.1993081",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  bibdate =      "Thu Aug 18 13:28:08 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Elkan:2012:GES,
  author =       "Charles Elkan and Yehuda Koren",
  title =        "Guest Editorial for Special Issue {KDD'10}",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "18:1--18:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086737.2086738",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Iwata:2012:SMT,
  author =       "Tomoharu Iwata and Takeshi Yamada and Yasushi Sakurai
                 and Naonori Ueda",
  title =        "Sequential Modeling of Topic Dynamics with Multiple
                 Timescales",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "19:1--19:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086737.2086739",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We propose an online topic model for sequentially
                 analyzing the time evolution of topics in document
                 collections. Topics naturally evolve with multiple
                 timescales. For example, some words may be used
                 consistently over one hundred years, while other words
                 emerge and disappear over periods of a few days. Thus,
                 in the proposed model, current topic-specific
                 distributions over words are assumed to be generated
                 based on the multiscale word distributions of the
                 previous epoch. Considering both the long- and
                 short-timescale dependency yields a more robust model.
                 We derive efficient online inference procedures based
                 on a stochastic EM algorithm, in which the model is
                 sequentially updated using newly obtained data; this
                 means that past data are not required to make the
                 inference. We demonstrate the effectiveness of the
                 proposed method in terms of predictive performance and
                 computational efficiency by examining collections of
                 real documents with timestamps.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
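
%%% A caricature of the multiscale idea (the decay rates, mixing
%%% weights, and moving-average construction are assumptions; the paper
%%% derives the prior from its generative model): keep per-topic word
%%% distributions smoothed at several timescales and mix them into the
%%% Dirichlet pseudo-counts for the next epoch's stochastic-EM fit.
%%%
%%%     import numpy as np
%%%
%%%     class MultiscaleWordPrior:
%%%         def __init__(self, n_topics, vocab,
%%%                      decays=(0.5, 0.9, 0.99), strength=10.0):
%%%             self.decays = decays
%%%             self.strength = strength
%%%             self.scales = np.full((len(decays), n_topics, vocab),
%%%                                   1.0 / vocab)
%%%
%%%         def prior(self):
%%%             # pseudo-counts for the next epoch: equal-weight mixture
%%%             return self.strength * self.scales.mean(axis=0)
%%%
%%%         def update(self, epoch_counts):
%%%             # epoch_counts: (n_topics, vocab) counts from this epoch
%%%             dist = epoch_counts / epoch_counts.sum(axis=1,
%%%                                                    keepdims=True)
%%%             for s, d in enumerate(self.decays):
%%%                 # slow decay = long timescale, fast = short
%%%                 self.scales[s] = d * self.scales[s] + (1 - d) * dist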

@Article{Huh:2012:DTM,
  author =       "Seungil Huh and Stephen E. Fienberg",
  title =        "Discriminative Topic Modeling Based on Manifold
                 Learning",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "20:1--20:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086737.2086740",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Topic modeling has become a popular method used for
                 data analysis in various domains including text
                 documents. Previous topic model approaches, such as
                 probabilistic Latent Semantic Analysis (pLSA) and
                 Latent Dirichlet Allocation (LDA), have shown
                 impressive success in discovering low-rank hidden
                 structures for modeling text documents. These
                  approaches, however, do not take into account the
                 manifold structure of the data, which is generally
                 informative for nonlinear dimensionality reduction
                 mapping. More recent topic model approaches, Laplacian
                 PLSI (LapPLSI) and Locally-consistent Topic Model
                 (LTM), have incorporated the local manifold structure
                 into topic models and have shown resulting benefits.
                 But they fall short of achieving full discriminating
                 power of manifold learning as they only enhance the
                 proximity between the low-rank representations of
                 neighboring pairs without any consideration for
                 non-neighboring pairs. In this article, we propose a
                 new approach, Discriminative Topic Model (DTM), which
                 separates non-neighboring pairs from each other in
                 addition to bringing neighboring pairs closer together,
                 thereby preserving the global manifold structure as
                 well as improving local consistency. We also present a
                 novel model-fitting algorithm based on the generalized
                 EM algorithm and the concept of Pareto improvement. We
                 empirically demonstrate the success of DTM in terms of
                 unsupervised clustering and semisupervised
                 classification accuracies on text corpora and
                 robustness to parameters compared to state-of-the-art
                 techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
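
%%% The neighbor/non-neighbor idea in miniature (a toy contrastive
%%% gradient step on document representations, not the authors'
%%% generalized-EM fitting procedure; the margin form is an
%%% assumption): neighbors are pulled together, non-neighbors pushed
%%% apart until they clear a margin.
%%%
%%%     import numpy as np
%%%
%%%     def contrastive_step(Z, neighbors, non_neighbors,
%%%                          lr=0.01, margin=1.0):
%%%         G = np.zeros_like(Z)
%%%         for i, j in neighbors:             # attract: ||zi - zj||^2
%%%             d = Z[i] - Z[j]
%%%             G[i] += 2 * d
%%%             G[j] -= 2 * d
%%%         for i, j in non_neighbors:         # repel up to the margin
%%%             d = Z[i] - Z[j]
%%%             dist = np.linalg.norm(d) + 1e-12
%%%             if dist < margin:              # max(0, m - ||.||)^2
%%%                 g = -2 * (margin - dist) * d / dist
%%%                 G[i] += g
%%%                 G[j] -= g
%%%         return Z - lr * G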

@Article{Gomez-Rodriguez:2012:IND,
  author =       "Manuel Gomez-Rodriguez and Jure Leskovec and Andreas
                 Krause",
  title =        "Inferring Networks of Diffusion and Influence",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "21:1--21:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086737.2086741",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Information diffusion and virus propagation are
                 fundamental processes taking place in networks. While
                 it is often possible to directly observe when nodes
                 become infected with a virus or publish the
                 information, observing individual transmissions (who
                 infects whom, or who influences whom) is typically very
                 difficult. Furthermore, in many applications, the
                 underlying network over which the diffusions and
                 propagations spread is actually unobserved. We tackle
                 these challenges by developing a method for tracing
                 paths of diffusion and influence through networks and
                 inferring the networks over which contagions propagate.
                 Given the times when nodes adopt pieces of information
                 or become infected, we identify the optimal network
                 that best explains the observed infection times. Since
                 the optimization problem is NP-hard to solve exactly,
                 we develop an efficient approximation algorithm that
                 scales to large datasets and finds provably
                 near-optimal networks. We demonstrate the effectiveness
                 of our approach by tracing information diffusion in a
                 set of 170 million blogs and news articles over a one
                 year period to infer how information flows through the
                 online media space. We find that the diffusion network
                 of news for the top 1,000 media sites and blogs tends
                 to have a core-periphery structure with a small set of
                 core media sites that diffuse information to the rest
                 of the Web. These sites tend to have stable circles of
                 influence with more general news media sites acting as
                 connectors between them.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
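
%%% A heavily simplified sketch of the greedy, submodular flavor of the
%%% approach (coverage of infection events stands in for the cascade
%%% likelihood; the time window and cascade encoding are assumptions):
%%% repeatedly add the edge that explains the most still-unexplained
%%% infections.
%%%
%%%     from collections import defaultdict
%%%
%%%     def infer_network(cascades, k, window=10.0):
%%%         # cascades: list of {node: infection_time} dicts
%%%         explains = defaultdict(set)
%%%         for ci, times in enumerate(cascades):
%%%             for v, tv in times.items():
%%%                 for u, tu in times.items():
%%%                     if 0 < tv - tu <= window:
%%%                         explains[(u, v)].add((ci, v))
%%%         chosen, covered = [], set()
%%%         for _ in range(k):
%%%             best = max(explains,
%%%                        key=lambda e: len(explains[e] - covered),
%%%                        default=None)
%%%             if best is None or not (explains[best] - covered):
%%%                 break                      # no marginal gain left
%%%             chosen.append(best)
%%%             covered |= explains[best]
%%%         return chosen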

@Article{Chen:2012:LIS,
  author =       "Jianhui Chen and Ji Liu and Jieping Ye",
  title =        "Learning Incoherent Sparse and Low-Rank Patterns from
                 Multiple Tasks",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "22:1--22:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086737.2086742",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the problem of learning incoherent sparse
                 and low-rank patterns from multiple tasks. Our approach
                 is based on a linear multitask learning formulation, in
                 which the sparse and low-rank patterns are induced by a
                 cardinality regularization term and a low-rank
                 constraint, respectively. This formulation is
                 nonconvex; we convert it into its convex surrogate,
                 which can be routinely solved via semidefinite
                 programming for small-size problems. We propose
                 employing the general projected gradient scheme to
                 efficiently solve such a convex surrogate; however, in
                 the optimization formulation, the objective function is
                 nondifferentiable and the feasible domain is
                 nontrivial. We present the procedures for computing the
                 projected gradient and ensuring the global convergence
                 of the projected gradient scheme. The computation of
                 the projected gradient involves a constrained
                 optimization problem; we show that the optimal solution
                 to such a problem can be obtained via solving an
                 unconstrained optimization subproblem and a Euclidean
                 projection subproblem. We also present two projected
                 gradient algorithms and analyze their rates of
                 convergence in detail. In addition, we illustrate the
                 use of the presented projected gradient algorithms for
                 the proposed multitask learning formulation using the
                 least squares loss. Experimental results on a
                 collection of real-world data sets demonstrate the
                 effectiveness of the proposed multitask learning
                 formulation and the efficiency of the proposed
                 projected gradient algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
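
%%% One step of the scheme, sketched for a least-squares loss with W
%%% split as P + Q (the l1 penalty and nuclear-norm ball below are the
%%% usual convex surrogates for cardinality and rank; the step size and
%%% splitting are assumptions): gradient step, then soft-threshold P
%%% and project Q's singular values onto an l1 ball.
%%%
%%%     import numpy as np
%%%
%%%     def project_l1_ball(v, z):
%%%         # Euclidean projection of nonnegative v onto {sum <= z}
%%%         if v.sum() <= z:
%%%             return v
%%%         u = np.sort(v)[::-1]
%%%         css = np.cumsum(u)
%%%         rho = np.nonzero(
%%%             u - (css - z) / np.arange(1, len(u) + 1) > 0)[0][-1]
%%%         return np.maximum(v - (css[rho] - z) / (rho + 1), 0)
%%%
%%%     def step(P, Q, X, Y, lam, tau, eta):
%%%         G = 2 * X.T @ (X @ (P + Q) - Y)    # shared smooth gradient
%%%         A = P - eta * G
%%%         P = np.sign(A) * np.maximum(np.abs(A) - eta * lam, 0)
%%%         U, s, Vt = np.linalg.svd(Q - eta * G, full_matrices=False)
%%%         Q = (U * project_l1_ball(s, tau)) @ Vt   # nuclear-ball proj.
%%%         return P, Q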

@Article{Yu:2012:LLC,
  author =       "Hsiang-Fu Yu and Cho-Jui Hsieh and Kai-Wei Chang and
                 Chih-Jen Lin",
  title =        "Large Linear Classification When Data Cannot Fit in
                 Memory",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "23:1--23:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086737.2086743",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recent advances in linear classification have shown
                 that for applications such as document classification,
                 the training process can be extremely efficient.
                 However, most of the existing training methods are
                 designed by assuming that data can be stored in the
                 computer memory. These methods cannot be easily applied
                  to data larger than the memory capacity because of the
                  random disk accesses involved. We propose and analyze a
                 block minimization framework for data larger than the
                 memory size. At each step a block of data is loaded
                 from the disk and handled by certain learning methods.
                 We investigate two implementations of the proposed
                 framework for primal and dual SVMs, respectively.
                 Because data cannot fit in memory, many design
                 considerations are very different from those for
                  traditional algorithms. We discuss and compare our
                  framework with existing approaches that are able to
                  handle data larger
                 than memory. Experiments using data sets 20 times
                 larger than the memory demonstrate the effectiveness of
                 the proposed method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
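
A minimal sketch (not the authors' code) of the block-minimization idea
described above, assuming blocks are stored on disk as .npz files with
arrays X and y, and using a linear SGD learner with the hinge loss as a
stand-in for the article's primal-SVM solver.

import numpy as np
from sklearn.linear_model import SGDClassifier

def train_out_of_core(block_files, classes, epochs=5):
    # One block resides in memory at a time; each pass sweeps all blocks.
    clf = SGDClassifier(loss="hinge", alpha=1e-4)  # linear-SVM objective
    for _ in range(epochs):
        for path in block_files:
            block = np.load(path)                  # assumed .npz with X, y
            clf.partial_fit(block["X"], block["y"], classes=classes)
    return clf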

@Article{Shahaf:2012:CTL,
  author =       "Dafna Shahaf and Carlos Guestrin",
  title =        "Connecting Two (or Less) Dots: Discovering Structure
                 in News Articles",
  journal =      j-TKDD,
  volume =       "5",
  number =       "4",
  pages =        "24:1--24:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2086737.2086744",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 16 15:19:57 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Finding information is becoming a major part of our
                 daily life. Entire sectors, from Web users to
                 scientists and intelligence analysts, are increasingly
                 struggling to keep up with the larger and larger
                 amounts of content published every day. With this much
                 data, it is often easy to miss the big picture. In this
                 article, we investigate methods for automatically
                 connecting the dots---providing a structured, easy way
                 to navigate within a new topic and discover hidden
                 connections. We focus on the news domain: given two
                 news articles, our system automatically finds a
                 coherent chain linking them together. For example, it
                 can recover the chain of events starting with the
                 decline of home prices (January 2007), and ending with
                 the health care debate (2009). We formalize the
                 characteristics of a good chain and provide a fast
                 search-driven algorithm to connect two fixed endpoints.
                 We incorporate user feedback into our framework,
                 allowing the stories to be refined and personalized. We
                 also provide a method to handle partially-specified
                 endpoints, for users who do not know both ends of a
                 story. Finally, we evaluate our algorithm over real
                 news data. Our user studies demonstrate that the
                 objective we propose captures the users' intuitive
                 notion of coherence, and that our algorithm effectively
                 helps users understand the news.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Ienco:2012:CDL,
  author =       "Dino Ienco and Ruggero G. Pensa and Rosa Meo",
  title =        "From Context to Distance: Learning Dissimilarity for
                 Categorical Data Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133360.2133361",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering data described by categorical attributes is
                 a challenging task in data mining applications. Unlike
                 numerical attributes, it is difficult to define a
                 distance between pairs of values of a categorical
                 attribute, since the values are not ordered. In this
                 article, we propose a framework to learn a
                 context-based distance for categorical attributes. The
                 key intuition of this work is that the distance between
                  two values of a categorical attribute $A_i$ can be
                  determined by the way in which the values of the other
                  attributes $A_j$ are distributed in the dataset
                  objects: if they are similarly distributed in the
                  groups of objects corresponding to the distinct values
                  of $A_i$, a low distance is obtained. We also propose a
                  solution to the critical issue of choosing the
                  attributes $A_j$. We validate our approach by embedding
                  our distance learning framework in a hierarchical
                  clustering algorithm. We applied it to various
                  real-world and synthetic datasets, both low- and
                  high-dimensional. Experimental results show that
                 our method is competitive with respect to the state of
                 the art of categorical data clustering approaches. We
                 also show that our approach is scalable and has a low
                 impact on the overall computational time of a
                 clustering task.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
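
For illustration (not from the article): a small pandas sketch of the
context-based intuition above, in which the distance between two values
of attribute a_i is measured by comparing how a context attribute a_j is
distributed in the corresponding groups of rows. All names are
illustrative assumptions.

import numpy as np
import pandas as pd

def context_distance(df, a_i, a_j, v1, v2):
    # Distance between values v1 and v2 of categorical attribute a_i, from
    # the distributions of context attribute a_j in the two groups of rows:
    # similar distributions give a low distance.
    p1 = df.loc[df[a_i] == v1, a_j].value_counts(normalize=True)
    p2 = df.loc[df[a_i] == v2, a_j].value_counts(normalize=True)
    p1, p2 = p1.align(p2, fill_value=0.0)  # common support over a_j's values
    return float(np.sqrt(((p1 - p2) ** 2).sum()))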

@Article{Li:2012:EMG,
  author =       "Chun Li and Qingyan Yang and Jianyong Wang and Ming
                 Li",
  title =        "Efficient Mining of Gap-Constrained Subsequences and
                 Its Various Applications",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133360.2133362",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining frequent subsequence patterns is a typical
                 data-mining problem and various efficient sequential
                 pattern mining algorithms have been proposed. In many
                 application domains (e.g., biology), the frequent
                 subsequences confined by the predefined gap
                 requirements are more meaningful than the general
                 sequential patterns. In this article, we propose two
                 algorithms, Gap-BIDE for mining closed gap-constrained
                 subsequences from a set of input sequences, and
                 Gap-Connect for mining repetitive gap-constrained
                 subsequences from a single input sequence. Inspired by
                 some state-of-the-art closed or constrained sequential
                 pattern mining algorithms, the Gap-BIDE algorithm
                 adopts an efficient approach to finding the complete
                 set of closed sequential patterns with gap constraints,
                 while the Gap-Connect algorithm efficiently mines an
                 approximate set of long patterns by connecting short
                 patterns. We also present several methods for feature
                 selection from the set of gap-constrained patterns for
                 the purpose of classification and clustering. Our
                 extensive performance study shows that our approaches
                 are very efficient in mining frequent subsequences with
                 gap constraints, and the gap-constrained pattern based
                 classification/clustering approaches can achieve
                 high-quality results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
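
A toy sketch of checking one gap-constrained occurrence (not Gap-BIDE or
Gap-Connect themselves): whether a pattern occurs in a sequence with
between min_gap and max_gap symbols allowed between consecutive matched
positions, plus a support count over a toy sequence database. All names
and parameters are illustrative assumptions.

def occurs_with_gaps(seq, pattern, min_gap=0, max_gap=2):
    # True if `pattern` occurs in `seq` with between min_gap and max_gap
    # extra symbols allowed between consecutive matched positions.
    ends = {i for i, s in enumerate(seq) if s == pattern[0]}
    for sym in pattern[1:]:
        nxt = set()
        for e in ends:
            for j in range(e + 1 + min_gap, min(e + 2 + max_gap, len(seq))):
                if seq[j] == sym:
                    nxt.add(j)
        ends = nxt
        if not ends:
            return False
    return True

# support of pattern (a, b, c) in a toy sequence database
db = [("a", "x", "b", "c"), ("b", "a", "c"), ("a", "b", "x", "x", "c")]
support = sum(occurs_with_gaps(s, ("a", "b", "c")) for s in db)  # == 2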

@Article{Liu:2012:IBA,
  author =       "Fei Tony Liu and Kai Ming Ting and Zhi-Hua Zhou",
  title =        "Isolation-Based Anomaly Detection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133360.2133363",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Anomalies are data points that are few and different.
                  As a result of these properties, we show that
                 anomalies are susceptible to a mechanism called
                 isolation. This article proposes a method called
                  Isolation Forest ($i$Forest), which detects anomalies
                 purely based on the concept of isolation without
                 employing any distance or density
                 measure---fundamentally different from all existing
                  methods. As a result, $i$Forest is able to exploit
                  subsampling (i) to achieve a low linear time complexity
                  and a small memory requirement and (ii) to deal with
                 the effects of swamping and masking effectively. Our
                  empirical evaluation shows that $i$Forest outperforms
                  ORCA, one-class SVM, LOF, and Random Forests in terms
                  of AUC and processing time, and that it is robust
                  against masking and swamping effects. $i$Forest also
                  works well in high-dimensional problems containing a
                  large number of irrelevant attributes, and when
                  anomalies are not available in the training sample.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
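
The isolation mechanism described above is available off the shelf in
scikit-learn, whose IsolationForest follows this method. A minimal usage
sketch on synthetic data; the data and parameters are illustrative
(max_samples=256 mirrors the subsampling idea in the abstract).

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 1, (500, 2)),    # a normal cluster
               rng.uniform(-6, 6, (10, 2))])  # a few scattered anomalies
forest = IsolationForest(n_estimators=100, max_samples=256, random_state=0)
labels = forest.fit_predict(X)    # -1 = anomaly, 1 = normal
scores = forest.score_samples(X)  # lower = more anomalous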

@Article{Jin:2012:MML,
  author =       "Yu Jin and Nick Duffield and Jeffrey Erman and Patrick
                 Haffner and Subhabrata Sen and Zhi-Li Zhang",
  title =        "A Modular Machine Learning System for Flow-Level
                 Traffic Classification in Large Networks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2133360.2133364",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The ability to accurately and scalably classify
                 network traffic is of critical importance to a wide
                 range of management tasks of large networks, such as
                 tier-1 ISP networks and global enterprise networks.
                 Guided by the practical constraints and requirements of
                 traffic classification in large networks, in this
                 article, we explore the design of an accurate and
                 scalable machine learning based flow-level traffic
                 classification system, which is trained on a dataset of
                 flow-level data that has been annotated with
                 application protocol labels by a packet-level
                 classifier. Our system employs a lightweight modular
                 architecture, which combines a series of simple linear
                 binary classifiers, each of which can be efficiently
                 implemented and trained on vast amounts of flow data in
                 parallel, and embraces three key innovative mechanisms,
                 weighted threshold sampling, logistic calibration, and
                 intelligent data partitioning, to achieve scalability
                 while attaining high accuracy. Evaluations using real
                 traffic data from multiple locations in a large ISP
                 show that our system accurately reproduces the labels
                  of the packet-level classifier when run on (unlabeled)
                 flow records, while meeting the scalability and
                 stability requirements of large ISP networks. Using
                 training and test datasets that are two months apart
                 and collected from two different locations, the flow
                 error rates are only 3\% for TCP flows and 0.4\% for
                 UDP flows. We further show that such error rates can be
                 reduced by combining the information of spatial
                 distributions of flows, or collective traffic
                 statistics, during classification. We propose a novel
                 two-step model, which seamlessly integrates these
                 collective traffic statistics into the existing traffic
                 classification system. Experimental results display
                 performance improvement on all traffic classes and an
                  overall error rate reduction of 15\%. In addition to
                  high accuracy, our implementation easily scales at
                  runtime to classify traffic on 10 Gbps links.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
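
A minimal sketch of the modular design described above: one lightweight
linear binary classifier per application class, each wrapped in a sigmoid
(logistic) calibration so the per-class outputs are comparable
probabilities. The features, class labels, and parameters are
illustrative assumptions, not the authors' system.

import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier

def train_modular(X, y, classes):
    # One independent binary module per class; each could be trained in
    # parallel on its own share of the flow data.
    models = {}
    for c in classes:
        base = SGDClassifier(loss="hinge")  # simple linear binary classifier
        models[c] = CalibratedClassifierCV(base, method="sigmoid", cv=3)
        models[c].fit(X, (y == c).astype(int))  # logistic (Platt) calibration
    return models

def classify(models, X):
    # Pick the class whose calibrated module is most confident.
    proba = np.column_stack([m.predict_proba(X)[:, 1] for m in models.values()])
    return np.asarray(list(models))[proba.argmax(axis=1)]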

@Article{Mavroeidis:2012:SSF,
  author =       "Dimitrios Mavroeidis and Panagis Magdalinos",
  title =        "A Sequential Sampling Framework for Spectral $k$-Means
                 Based on Efficient Bootstrap Accuracy Estimations:
                 Application to Distributed Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2297456.2297457",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The scalability of learning algorithms has always been
                 a central concern for data mining researchers, and
                 nowadays, with the rapid increase in data storage
                 capacities and availability, its importance has
                 increased. To this end, sampling has been studied by
                 several researchers in an effort to derive sufficiently
                 accurate models using only small data fractions. In
                 this article we focus on spectral $k$-means, that is,
                 the $k$-means approximation as derived by the spectral
                 relaxation, and propose a sequential sampling framework
                 that iteratively enlarges the sample size until the
                 $k$-means results (objective function and cluster
                 structure) become indistinguishable from the asymptotic
                 (infinite-data) output. In the proposed framework we
                 adopt a commonly applied principle in data mining
                 research that considers the use of minimal assumptions
                 concerning the data generating distribution. This
                 restriction imposes several challenges, mainly related
                 to the efficiency of the sequential sampling procedure.
                 These challenges are addressed using elements of matrix
                 perturbation theory and statistics. Moreover, although
                 the main focus is on spectral $k$-means, we also
                 demonstrate that the proposed framework can be
                 generalized to handle spectral clustering. The proposed
                  sequential sampling framework is subsequently employed
                 for addressing the distributed clustering problem,
                 where the task is to construct a global model for data
                 that resides in distributed network nodes. The main
                 challenge in this context is related to the bandwidth
                 constraints that are commonly imposed, thus requiring
                 that the distributed clustering algorithm consumes a
                 minimal amount of network load. This illustrates the
                 applicability of the proposed approach, as it enables
                 the determination of a minimal sample size that can be
                 used for constructing an accurate clustering model that
                 entails the distributional characteristics of the data.
                 As opposed to the relevant distributed $k$-means
                 approaches, our framework takes into account the fact
                 that the choice of the number of clusters has a crucial
                 effect on the required amount of communication. More
                 precisely, the proposed algorithm is able to derive a
                 statistical estimation of the required relative sizes
                 for all possible values of $k$. This unique feature of
                 our distributed clustering framework enables a network
                  administrator to choose an economical solution that
                  identifies the crude cluster structure of a dataset
                  without devoting excessive network resources to
                  identifying all the ``correct'' detailed clusters.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Das:2012:MIG,
  author =       "Sanmay Das and Malik Magdon-Ismail",
  title =        "A Model for Information Growth in Collective Wisdom
                 Processes",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2297456.2297458",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Collaborative media such as wikis have become
                 enormously successful venues for information creation.
                 Articles accrue information through the asynchronous
                 editing of users who arrive both seeking information
                 and possibly able to contribute information. Most
                 articles stabilize to high-quality, trusted sources of
                 information representing the collective wisdom of all
                 the users who edited the article. We propose a model
                 for information growth which relies on two main
                 observations: (i) as an article's quality improves, it
                 attracts visitors at a faster rate (a rich-get-richer
                 phenomenon); and, simultaneously, (ii) the chances that
                  a new visitor will improve the article drop (there is
                 only so much that can be said about a particular
                 topic). Our model is able to reproduce many features of
                 the edit dynamics observed on Wikipedia; in particular,
                 it captures the observed rise in the edit rate,
                 followed by $ 1 / t $ decay. Despite differences in the
                 media, we also document similar features in the comment
                 rates for a segment of the LiveJournal blogosphere.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
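
A toy simulation of the two observations the model rests on: arrivals
speed up as quality grows, while each visitor's chance of improving the
article shrinks. All functional forms and constants here are illustrative
assumptions, not the authors' fitted model.

import numpy as np

rng = np.random.default_rng(1)
q, t, edit_times = 1.0, 0.0, []
for _ in range(5000):                    # 5000 visitors
    t += rng.exponential(1.0 / q)        # higher quality -> faster arrivals
    if rng.random() < 1.0 / q:           # less left to say as q grows
        q += 1.0
        edit_times.append(t)
rate, edges = np.histogram(edit_times, bins=50)  # edit-rate profile over time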

@Article{Xu:2012:GME,
  author =       "Tianbing Xu and Zhongfei Zhang and Philip S. Yu and Bo
                 Long",
  title =        "Generative Models for Evolutionary Clustering",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2297456.2297459",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article studies evolutionary clustering, a
                 recently emerged hot topic with many important
                 applications, noticeably in dynamic social network
                 analysis. In this article, based on the recent
                 literature on nonparametric Bayesian models, we have
                 developed two generative models: DPChain and HDP-HTM.
                 DPChain is derived from the Dirichlet process mixture
                 (DPM) model, with an exponential decaying component
                 along with the time. HDP-HTM combines the hierarchical
                  Dirichlet process (HDP) with a hierarchical transition
                  matrix (HTM) based on the proposed infinite
                 hierarchical Markov state model (iHMS). Both models
                 substantially advance the literature on evolutionary
                 clustering, in the sense that not only do they both
                 perform better than those in the existing literature,
                 but more importantly, they are capable of automatically
                 learning the cluster numbers and explicitly addressing
                 the corresponding issues. Extensive evaluations have
                 demonstrated the effectiveness and the promise of these
                 two solutions compared to the state-of-the-art
                 literature.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2012:LME,
  author =       "Shaojun Wang and Dale Schuurmans and Yunxin Zhao",
  title =        "The Latent Maximum Entropy Principle",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2297456.2297460",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present an extension to Jaynes' maximum entropy
                 principle that incorporates latent variables. The
                 principle of latent maximum entropy we propose is
                 different from both Jaynes' maximum entropy principle
                 and maximum likelihood estimation, but can yield better
                 estimates in the presence of hidden variables and
                 limited training data. We first show that solving for a
                 latent maximum entropy model poses a hard nonlinear
                 constrained optimization problem in general. However,
                 we then show that feasible solutions to this problem
                 can be obtained efficiently for the special case of
                 log-linear models---which forms the basis for an
                 efficient approximation to the latent maximum entropy
                 principle. We derive an algorithm that combines
                 expectation-maximization with iterative scaling to
                 produce feasible log-linear solutions. This algorithm
                 can be interpreted as an alternating minimization
                 algorithm in the information divergence, and reveals an
                 intimate connection between the latent maximum entropy
                 and maximum likelihood principles. To select a final
                 model, we generate a series of feasible candidates,
                 calculate the entropy of each, and choose the model
                 that attains the highest entropy. Our experimental
                 results show that estimation based on the latent
                 maximum entropy principle generally gives better
                 results than maximum likelihood when estimating latent
                 variable models on small observed data samples.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Bhattacharya:2012:CGC,
  author =       "Indrajit Bhattacharya and Shantanu Godbole and
                 Sachindra Joshi and Ashish Verma",
  title =        "Cross-Guided Clustering: Transfer of Relevant
                 Supervision across Tasks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2297456.2297461",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:38 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Lack of supervision in clustering algorithms often
                 leads to clusters that are not useful or interesting to
                 human reviewers. We investigate if supervision can be
                 automatically transferred for clustering a target task,
                 by providing a relevant supervised partitioning of a
                 dataset from a different source task. The target
                 clustering is made more meaningful for the human user
                 by trading-off intrinsic clustering goodness on the
                 target task for alignment with relevant supervised
                 partitions in the source task, wherever possible. We
                 propose a cross-guided clustering algorithm that builds
                  on traditional $k$-means by aligning the target clusters
                 with source partitions. The alignment process makes use
                 of a cross-task similarity measure that discovers
                 hidden relationships across tasks. When the source and
                 target tasks correspond to different domains with
                 potentially different vocabularies, we propose a
                 projection approach using pivot vocabularies for the
                 cross-domain similarity measure. Using multiple
                 real-world and synthetic datasets, we show that our
                 approach improves clustering accuracy significantly
                  over traditional $k$-means and state-of-the-art
                 semi-supervised clustering baselines, over a wide range
                 of data characteristics and parameter settings.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2012:LBN,
  author =       "Zhenxing Wang and Laiwan Chan",
  title =        "Learning {Bayesian} networks from {Markov} random
                 fields: an efficient algorithm for linear models",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362383.2362384",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Dependency analysis is a typical approach for Bayesian
                 network learning, which infers the structures of
                 Bayesian networks by the results of a series of
                 conditional independence (CI) tests. In practice,
                 testing independence conditioning on large sets hampers
                 the performance of dependency analysis algorithms in
                 terms of accuracy and running time for the following
                 reasons. First, testing independence on large sets of
                 variables with limited samples is not stable. Second,
                 for most dependency analysis algorithms, the number of
                 CI tests grows at an exponential rate with the sizes of
                 conditioning sets, and the running time grows of the
                 same rate. Therefore, determining how to reduce the
                 number of CI tests and the sizes of conditioning sets
                 becomes a critical step in dependency analysis
                  algorithms. In this article, we present a two-phase
                 algorithm based on the observation that the structures
                 of Markov random fields are similar to those of
                 Bayesian networks. The first phase of the algorithm
                 constructs a Markov random field from data, which
                 provides a close approximation to the structure of the
                 true Bayesian network; the second phase of the
                 algorithm removes redundant edges according to CI tests
                 to get the true Bayesian network. Both phases use
                 Markov blanket information to reduce the sizes of
                 conditioning sets and the number of CI tests without
                 sacrificing accuracy. An empirical study shows that the
                 two-phase algorithm performs well in terms of accuracy
                 and efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chan:2012:CID,
  author =       "Jeffrey Chan and James Bailey and Christopher Leckie
                 and Michael Houle",
  title =        "{ciForager}: Incrementally discovering regions of
                 correlated change in evolving graphs",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362383.2362385",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data mining techniques for understanding how graphs
                 evolve over time have become increasingly important.
                 Evolving graphs arise naturally in diverse applications
                 such as computer network topologies, multiplayer games
                 and medical imaging. A natural and interesting problem
                 in evolving graph analysis is the discovery of compact
                 subgraphs that change in a similar manner. Such
                 subgraphs are known as regions of correlated change and
                 they can both summarise change patterns in graphs and
                 help identify the underlying events causing these
                 changes. However, previous techniques for discovering
                 regions of correlated change suffer from limited
                 scalability, making them unsuitable for analysing the
                 evolution of very large graphs. In this paper, we
                  introduce a new algorithm called ciForager, which
                 addresses this scalability challenge and offers
                 considerable improvements. The efficiency of ciForager
                 is based on the use of new incremental techniques for
                 detecting change, as well as the use of Voronoi
                 representations for efficiently determining distance.
                 We experimentally show that ciForager can achieve
                 speedups of up to 1000 times over previous approaches.
                 As a result, it becomes feasible for the first time to
                 discover regions of correlated change in extremely
                 large graphs, such as the entire BGP routing topology
                 of the Internet.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2012:CDS,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong
                 Gong",
  title =        "Comparative document summarization via discriminative
                 sentence selection",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362383.2362386",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of document groups, a natural
                 question is to identify the differences among them.
                 Although traditional document summarization techniques
                 can summarize the content of the document groups one by
                  one, there is a great need to generate a
                 summary of the differences among the document groups.
                 In this article, we study a novel problem, that of
                 summarizing the differences between document groups. A
                 discriminative sentence selection method is proposed to
                 extract the most discriminative sentences which
                 represent the specific characteristics of each document
                 group. Experiments and case studies on real-world data
                 sets demonstrate the effectiveness of our proposed
                 method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
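
For illustration (not the article's method): a naive log-odds scoring
that prefers sentences whose terms are frequent in one document group and
rare in the others, which is the intuition behind discriminative sentence
selection. Tokenization and smoothing are illustrative assumptions.

from collections import Counter
from math import log

def discriminative_sentences(group_sents, other_sents, k=3):
    # Average per-word log-odds of belonging to the group vs. the rest
    # (add-one smoothing); high scores mark group-specific sentences.
    g = Counter(w for s in group_sents for w in s.lower().split())
    o = Counter(w for s in other_sents for w in s.lower().split())
    G, O = sum(g.values()), sum(o.values())
    def score(sent):
        words = sent.lower().split()
        return sum(log((g[w] + 1) / (G + 1)) - log((o[w] + 1) / (O + 1))
                   for w in words) / max(len(words), 1)
    return sorted(group_sents, key=score, reverse=True)[:k]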

@Article{deMelo:2012:FNO,
  author =       "Pedro O. S. {Vaz de Melo} and Virgilio A. F. Almeida
                 and Antonio A. F. Loureiro and Christos Faloutsos",
  title =        "Forecasting in the {NBA} and other team sports:
                 Network effects in action",
  journal =      j-TKDD,
  volume =       "6",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2362383.2362387",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Nov 6 18:30:40 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The multi-million sports-betting market is based on
                 the fact that the task of predicting the outcome of a
                  sports event is very hard. Even with the aid of
                  countless descriptive statistics and
                 background information, only a few can correctly guess
                 the outcome of a game or a league. In this work, our
                 approach is to move away from the traditional way of
                 predicting sports events, and instead to model sports
                 leagues as networks of players and teams where the only
                 information available is the work relationships among
                 them. We propose two network-based models to predict
                 the behavior of teams in sports leagues. These models
                 are parameter-free, that is, they do not have a single
                 parameter, and moreover are sport-agnostic: they can be
                 applied directly to any team sports league. First, we
                 view a sports league as a network in evolution, and we
                 infer the implicit feedback behind network changes and
                 properties over the years. Then, we use this knowledge
                 to construct the network-based prediction models, which
                 can, with a significantly high probability, indicate
                 how well a team will perform over a season. We compare
                 our proposed models with other prediction models in two
                 of the most popular sports leagues: the National
                 Basketball Association (NBA) and the Major League
                 Baseball (MLB). Our model shows consistently good
                 results in comparison with the other models and,
                 relying upon the network properties of the teams, we
                 achieved a $ \approx 14 \% $ rank prediction accuracy
                 improvement over our best competitor.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Ghosh:2012:SIB,
  author =       "Joydeep Ghosh and Padhraic Smyth and Andrew Tomkins
                 and Rich Caruana",
  title =        "Special issue on best of {SIGKDD 2011}",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2382577.2382578",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Kaufman:2012:LDM,
  author =       "Shachar Kaufman and Saharon Rosset and Claudia Perlich
                 and Ori Stitelman",
  title =        "Leakage in data mining: Formulation, detection, and
                 avoidance",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2382577.2382579",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Deemed ``one of the top ten data mining mistakes'',
                 leakage is the introduction of information about the
                 data mining target that should not be legitimately
                 available to mine from. In addition to our own industry
                 experience with real-life projects, controversies
                 around several major public data mining competitions
                 held recently such as the INFORMS 2010 Data Mining
                 Challenge and the IJCNN 2011 Social Network Challenge
                 are evidence that this issue is as relevant today as it
                 has ever been. While acknowledging the importance and
                 prevalence of leakage in both synthetic competitions
                 and real-life data mining projects, existing literature
                 has largely left this idea unexplored. What little has
                 been said turns out not to be broad enough to cover
                 more complex cases of leakage, such as those where the
                 classical independently and identically distributed
                  (i.i.d.) assumption is violated, cases that have
                  recently been documented. In our new approach, these cases
                 and others are explained by explicitly defining
                 modeling goals and analyzing the broader framework of
                 the data mining problem. The resulting definition
                 enables us to derive general methodology for dealing
                 with the issue. We show that it is possible to avoid
                 leakage with a simple specific approach to data
                 management followed by what we call a learn-predict
                 separation, and present several ways of detecting
                 leakage when the modeler has no control over how the
                 data have been collected. We also offer an alternative
                 point of view on leakage that is based on causal graph
                 modeling concepts.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
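
A minimal pandas sketch of the learn-predict separation idea described
above: everything available to the learner is drawn strictly from before
a prediction cutoff, so no information from the target period can leak
into training. The column names are illustrative assumptions.

import pandas as pd

def learn_predict_split(df, time_col, cutoff):
    # Only rows from strictly before the cutoff are legitimately available
    # to mine from; the rest form the prediction period.
    train = df[df[time_col] < cutoff]
    test = df[df[time_col] >= cutoff]
    return train, test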

@Article{Mampaey:2012:SDS,
  author =       "Michael Mampaey and Jilles Vreeken and Nikolaj Tatti",
  title =        "Summarizing data succinctly with the most informative
                 itemsets",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2382577.2382580",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Knowledge discovery from data is an inherently
                 iterative process. That is, what we know about the data
                 greatly determines our expectations, and therefore,
                 what results we would find interesting and/or
                 surprising. Given new knowledge about the data, our
                 expectations will change. Hence, in order to avoid
                 redundant results, knowledge discovery algorithms
                 ideally should follow such an iterative updating
                 procedure. With this in mind, we introduce a
                 well-founded approach for succinctly summarizing data
                 with the most informative itemsets; using a
                 probabilistic maximum entropy model, we iteratively
                 find the itemset that provides us the most novel
                  information---that is, for which the frequency in the
                  data surprises us the most---and in turn we update our
                 model accordingly. As we use the maximum entropy
                 principle to obtain unbiased probabilistic models, and
                 only include those itemsets that are most informative
                 with regard to the current model, the summaries we
                 construct are guaranteed to be both descriptive and
                 nonredundant. The algorithm that we present, called
                 mtv, can either discover the top-$k$ most informative
                  itemsets, or employ either the Bayesian
                  Information Criterion (BIC) or the Minimum Description
                  Length (MDL) principle to automatically identify the
                 set of itemsets that together summarize the data well.
                 In other words, our method will ``tell you what you
                 need to know'' about the data. Importantly, it is a
                 one-phase algorithm: rather than picking itemsets from
                 a user-provided candidate set, itemsets and their
                 supports are mined on-the-fly. To further its
                 applicability, we provide an efficient method to
                 compute the maximum entropy distribution using Quick
                 Inclusion-Exclusion. Experiments on our method, using
                 synthetic, benchmark, and real data, show that the
                 discovered summaries are succinct, and correctly
                 identify the key patterns in the data. The models they
                 form attain high likelihoods, and inspection shows that
                 they summarize the data well with increasingly
                 specific, yet nonredundant itemsets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chu:2012:TLM,
  author =       "Shumo Chu and James Cheng",
  title =        "Triangle listing in massive networks",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2382577.2382581",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Triangle listing is one of the fundamental algorithmic
                 problems whose solution has numerous applications
                 especially in the analysis of complex networks, such as
                 the computation of clustering coefficients,
                 transitivity, triangular connectivity, trusses, etc.
                 Existing algorithms for triangle listing are mainly
                 in-memory algorithms, whose performance cannot scale
                  with the massive volume of today's fast-growing
                 networks. When the input graph cannot fit in main
                 memory, triangle listing requires random disk accesses
                 that can incur prohibitively huge I/O cost. Some
                 streaming, semistreaming, and sampling algorithms have
                 been proposed but these are approximation algorithms.
                 We propose an I/O-efficient algorithm for triangle
                 listing. Our algorithm is exact and avoids random disk
                 access. Our results show that our algorithm is scalable
                 and outperforms the state-of-the-art in-memory and
                 local triangle estimation algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
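
For contrast with the I/O-efficient method above, a minimal exact
in-memory triangle listing in the classic degree-ordered "forward" style
(not the article's algorithm); `edges` is an iterable of undirected
pairs.

from collections import defaultdict

def list_triangles(edges):
    adj = defaultdict(set)
    for u, v in edges:
        if u != v:
            adj[u].add(v)
            adj[v].add(u)
    # Orient each edge toward the higher (degree, id) endpoint so that
    # every triangle is emitted exactly once.
    rank = {u: (len(adj[u]), u) for u in adj}
    out = {u: {v for v in adj[u] if rank[v] > rank[u]} for u in adj}
    for u in out:
        for v in out[u]:
            for w in out[u] & out[v]:
                yield (u, v, w)

print(sorted(list_triangles([(1, 2), (2, 3), (1, 3), (3, 4)])))  # [(1, 2, 3)]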

@Article{Chattopadhyay:2012:MDA,
  author =       "Rita Chattopadhyay and Qian Sun and Wei Fan and Ian
                 Davidson and Sethuraman Panchanathan and Jieping Ye",
  title =        "Multisource domain adaptation and its application to
                 early detection of fatigue",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2382577.2382582",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We consider the characterization of muscle fatigue
                 through a noninvasive sensing mechanism such as Surface
                 ElectroMyoGraphy (SEMG). While changes in the
                 properties of SEMG signals with respect to muscle
                 fatigue have been reported in the literature, the large
                 variation in these signals across different individuals
                 makes the task of modeling and classification of SEMG
                 signals challenging. Indeed, the variation in SEMG
                 parameters from subject to subject creates differences
                 in the data distribution. In this article, we propose
                 two transfer learning frameworks based on the
                 multisource domain adaptation methodology for detecting
                  different stages of fatigue using SEMG signals, which
                  address the distribution differences. In the proposed
                 frameworks, the SEMG data of a subject represent a
                 domain; data from multiple subjects in the training set
                 form the multiple source domains and the test subject
                 data form the target domain. SEMG signals are
                 predominantly different in conditional probability
                 distribution across subjects. The key feature of the
                 first framework is a novel weighting scheme that
                 addresses the conditional probability distribution
                 differences across multiple domains (subjects) and the
                 key feature of the second framework is a two-stage
                 domain adaptation methodology which combines weighted
                 data from multiple sources based on marginal
                 probability differences (first stage) as well as
                 conditional probability differences (second stage),
                 with the target domain data. The weights for minimizing
                 the marginal probability differences are estimated
                 independently, while the weights for minimizing
                 conditional probability differences are computed
                 simultaneously by exploiting the potential interaction
                 among multiple sources. We also provide a theoretical
                 analysis on the generalization performance of the
                 proposed multisource domain adaptation formulation
                 using the weighted Rademacher complexity measure. We
                 have validated the proposed frameworks on Surface
                 ElectroMyoGram signals collected from 8 people during a
                 fatigue-causing repetitive gripping activity.
                 Comprehensive experiments on the SEMG dataset
                 demonstrate that the proposed method improves the
                 classification accuracy by 20\% to 30\% over the cases
                 without any domain adaptation method and by 13\% to
                 30\% over existing state-of-the-art domain adaptation
                 methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wilkinson:2012:SIS,
  author =       "Leland Wilkinson and Anushka Anand and Tuan Nhon
                 Dang",
  title =        "Substantial improvements in the set-covering
                 projection classifier {CHIRP} (composite hypercubes on
                 iterated random projections)",
  journal =      j-TKDD,
  volume =       "6",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2382577.2382583",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:40 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In Wilkinson et al. [2011] we introduced a new
                 set-covering random projection classifier that achieved
                 average error lower than that of other classifiers in
                 the Weka platform. This classifier was based on an $
                 L^\infty $ norm distance function and exploited an
                 iterative sequence of three stages (projecting,
                 binning, and covering) to deal with the curse of
                 dimensionality, computational complexity, and nonlinear
                 separability. We now present substantial changes that
                 improve robustness and reduce training and testing time
                 by almost an order of magnitude without jeopardizing
                 CHIRP's outstanding error performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Angiulli:2013:NNB,
  author =       "Fabrizio Angiulli and Fabio Fassetti",
  title =        "Nearest Neighbor-Based Classification of Uncertain
                 Data",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435209.2435210",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This work deals with the problem of classifying
                 uncertain data. With this aim we introduce the
                 Uncertain Nearest Neighbor (UNN) rule, which represents
                 the generalization of the deterministic nearest
                 neighbor rule to the case in which uncertain objects
                 are available. The UNN rule relies on the concept of
                 nearest neighbor class, rather than on that of nearest
                 neighbor object. The nearest neighbor class of a test
                 object is the class that maximizes the probability of
                  providing its nearest neighbor. We show that the former
                  concept is much more powerful than the latter in the
                  presence of uncertainty, in that it correctly captures the
                  semantics of the nearest neighbor decision rule when
                  applied to the uncertain scenario. An effective and
                  efficient algorithm to
                 perform uncertain nearest neighbor classification of a
                 generic (un)certain test object is designed, based on
                 properties that greatly reduce the temporal cost
                 associated with nearest neighbor class probability
                 computation. Experimental results are presented,
                 showing that the UNN rule is effective and efficient in
                 classifying uncertain data.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
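
%%% The UNN rule above replaces the nearest neighbor object with the
%%% nearest neighbor class: the class most likely to supply the test
%%% object's nearest neighbor. A minimal Monte Carlo sketch of that idea,
%%% assuming Gaussian per-object uncertainty; the names and the sampling
%%% scheme are illustrative assumptions, not the article's exact algorithm,
%%% which uses properties that avoid brute-force sampling:
%%%
%%%     import numpy as np
%%%
%%%     def unn_classify(means, stds, labels, query, trials=1000, seed=0):
%%%         """Estimate the nearest neighbor class of `query` by sampling."""
%%%         rng = np.random.default_rng(seed)
%%%         votes = {}
%%%         for _ in range(trials):
%%%             # Draw one realization of every uncertain training object.
%%%             sample = rng.normal(means, stds)
%%%             nn = int(np.argmin(np.linalg.norm(sample - query, axis=1)))
%%%             votes[labels[nn]] = votes.get(labels[nn], 0) + 1
%%%         # Predict the class with the highest estimated probability of
%%%         # providing the nearest neighbor.
%%%         return max(votes, key=votes.get)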

@Article{Wang:2013:CDS,
  author =       "Dingding Wang and Shenghuo Zhu and Tao Li and Yihong
                 Gong",
  title =        "Comparative Document Summarization via Discriminative
                 Sentence Selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435209.2435211",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of document groups, a natural
                  question is how to identify the differences among these
                 groups. Although traditional document summarization
                 techniques can summarize the content of the document
                  groups one by one, there is a great need to generate a
                  summary of the differences among the
                 document groups. In this article, we study a novel
                 problem of summarizing the differences between document
                 groups. A discriminative sentence selection method is
                 proposed to extract the most discriminative sentences
                 that represent the specific characteristics of each
                 document group. Experiments and case studies on
                 real-world data sets demonstrate the effectiveness of
                 our proposed method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Bayati:2013:MPA,
  author =       "Mohsen Bayati and David F. Gleich and Amin Saberi and
                 Ying Wang",
  title =        "Message-Passing Algorithms for Sparse Network
                 Alignment",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435209.2435212",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Network alignment generalizes and unifies several
                 approaches for forming a matching or alignment between
                 the vertices of two graphs. We study a mathematical
                  programming framework for the network alignment problem
                  and
                 a sparse variation of it where only a small number of
                 matches between the vertices of the two graphs are
                 possible. We propose a new message passing algorithm
                 that allows us to compute, very efficiently,
                 approximate solutions to the sparse network alignment
                 problems with graph sizes as large as hundreds of
                 thousands of vertices. We also provide extensive
                 simulations comparing our algorithms with two of the
                 best solvers for network alignment problems on two
                 synthetic matching problems, two bioinformatics
                 problems, and three large ontology alignment problems
                 including a multilingual problem with a known labeled
                 alignment.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Li:2013:CWM,
  author =       "Bin Li and Steven C. H. Hoi and Peilin Zhao and
                 Vivekanand Gopalkrishnan",
  title =        "Confidence Weighted Mean Reversion Strategy for Online
                 Portfolio Selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435209.2435213",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jun 24 13:02:44 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Online portfolio selection has been attracting
                 increasing attention from the data mining and machine
                 learning communities. All existing online portfolio
                 selection strategies focus on the first order
                 information of a portfolio vector, though the second
                 order information may also be beneficial to a strategy.
                 Moreover, empirical evidence shows that relative stock
                 prices may follow the mean reversion property, which
                 has not been fully exploited by existing strategies.
                 This article proposes a novel online portfolio
                 selection strategy named Confidence Weighted Mean
                 Reversion (CWMR). Inspired by the mean reversion
                  principle in finance and the confidence weighted online
                  learning technique in machine learning, CWMR models the
                 portfolio vector as a Gaussian distribution, and
                 sequentially updates the distribution by following the
                 mean reversion trading principle. CWMR's closed-form
                 updates clearly reflect the mean reversion trading
                 idea. We also present several variants of CWMR
                  algorithms, including a CWMR mixture algorithm that is
                  theoretically universal. Empirically, the CWMR strategy is
                  able to effectively exploit the power of mean reversion
                  for online portfolio selection. Extensive experiments
                  on various real markets show that the proposed strategy
                  is superior to state-of-the-art techniques. The
                  experimental testbed, including source code and data
                  sets, is available online.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Lou:2013:LPR,
  author =       "Tiancheng Lou and Jie Tang and John Hopcroft and
                 Zhanpeng Fang and Xiaowen Ding",
  title =        "Learning to predict reciprocity and triadic closure in
                 social networks",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499907.2499908",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study how links are formed in social networks. In
                 particular, we focus on investigating how a reciprocal
                 (two-way) link, the basic relationship in social
                 networks, is developed from a parasocial (one-way)
                 relationship and how the relationships further develop
                 into triadic closure, one of the fundamental processes
                 of link formation. We first investigate how geographic
                 distance and interactions between users influence the
                 formation of link structure among users. Then we study
                 how social theories including homophily, social
                 balance, and social status are satisfied over networks
                 with parasocial and reciprocal relationships. The study
                 unveils several interesting phenomena. For example,
                 ``friend's friend is a friend'' indeed exists in the
                 reciprocal relationship network, but does not hold in
                 the parasocial relationship network. We propose a
                 learning framework to formulate the problems of
                 predicting reciprocity and triadic closure into a
                 graphical model. We demonstrate that it is possible to
                 accurately infer 90\% of reciprocal relationships in a
                 Twitter network. The proposed model also achieves
                 better performance (+20--30\% in terms of F1-measure)
                  than several alternative methods for predicting triadic
                  closure formation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yang:2013:EOL,
  author =       "Haiqin Yang and Michael R. Lyu and Irwin King",
  title =        "Efficient online learning for multitask feature
                 selection",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499907.2499909",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Learning explanatory features across multiple related
                 tasks, or MultiTask Feature Selection (MTFS), is an
                 important problem in the applications of data mining,
                 machine learning, and bioinformatics. Previous MTFS
                 methods fulfill this task by batch-mode training. This
                 makes them inefficient when data come sequentially or
                  when the training data are too large to be loaded into
                  memory all at once. In
                 order to tackle these problems, we propose a novel
                 online learning framework to solve the MTFS problem. A
                 main advantage of the online algorithm is its
                 efficiency in both time complexity and memory cost. The
                 weights of the MTFS models at each iteration can be
                 updated by closed-form solutions based on the average
                 of previous subgradients. This yields the worst-case
                 bounds of the time complexity and memory cost at each
                 iteration, both in the order of $ O(d \times Q) $,
                 where $d$ is the number of feature dimensions and $Q$
                 is the number of tasks. Moreover, we provide
                 theoretical analysis for the average regret of the
                 online learning algorithms, which also guarantees the
                 convergence rate of the algorithms. Finally, we conduct
                 detailed experiments to show the characteristics and
                 merits of the online learning algorithms in solving
                 several MTFS problems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
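
%%% The closed-form per-iteration update described above (average the past
%%% subgradients, then solve a simple proximal problem) has the shape of a
%%% dual-averaging method. A minimal sketch, assuming an l2,1 group-sparse
%%% regularizer across tasks and a 1/sqrt(t) proximal term; the article's
%%% exact regularizer and step schedule may differ:
%%%
%%%     import numpy as np
%%%
%%%     def dual_averaging_step(gbar, t, lam=0.1, gamma=1.0):
%%%         """Closed-form minimizer of
%%%         <gbar, W> + lam*||W||_{2,1} + gamma/(2*sqrt(t))*||W||_F**2.
%%%
%%%         gbar is the d-by-Q average of all subgradients seen so far;
%%%         each of the d rows couples one feature across the Q tasks, so
%%%         one step costs O(d * Q) time and memory, matching the stated
%%%         worst-case bounds.
%%%         """
%%%         norms = np.linalg.norm(gbar, axis=1, keepdims=True)
%%%         shrink = np.maximum(0.0, 1.0 - lam / np.maximum(norms, 1e-12))
%%%         # Rows with a small average gradient are zeroed out: joint
%%%         # feature selection across tasks.
%%%         return -(np.sqrt(t) / gamma) * shrink * gbar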

@Article{Zhang:2013:MRL,
  author =       "Yu Zhang and Dit-Yan Yeung",
  title =        "Multilabel relationship learning",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499907.2499910",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multilabel learning problems are commonly found in
                 many applications. A characteristic shared by many
                 multilabel learning problems is that some labels have
                 significant correlations between them. In this article,
                 we propose a novel multilabel learning method, called
                 MultiLabel Relationship Learning (MLRL), which extends
                 the conventional support vector machine by explicitly
                 learning and utilizing the relationships between
                 labels. Specifically, we model the label relationships
                 using a label covariance matrix and use it to define a
                 new regularization term for the optimization problem.
                 MLRL learns the model parameters and the label
                 covariance matrix simultaneously based on a unified
                 convex formulation. To solve the convex optimization
                 problem, we use an alternating method in which each
                 subproblem can be solved efficiently. The relationship
                 between MLRL and two widely used maximum margin methods
                 for multilabel learning is investigated. Moreover, we
                 also propose a semisupervised extension of MLRL, called
                 SSMLRL, to demonstrate how to make use of unlabeled
                 data to help learn the label covariance matrix. Through
                 experiments conducted on some multilabel applications,
                 we find that MLRL not only gives higher classification
                 accuracy but also has better interpretability as
                 revealed by the label covariance matrix.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Peng:2013:EFF,
  author =       "Jing Peng and Guna Seetharaman and Wei Fan and Aparna
                 Varde",
  title =        "Exploiting {Fisher} and {Fukunaga--Koontz} transforms
                 in {Chernoff} dimensionality reduction",
  journal =      j-TKDD,
  volume =       "7",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2499907.2499911",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:06 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Knowledge discovery from big data demands effective
                 representation of data. However, big data are often
                 characterized by high dimensionality, which makes
                 knowledge discovery more difficult. Many techniques for
                  dimensionality reduction have been proposed, including the
                  well-known Fisher Linear Discriminant Analysis (LDA).
                 However, the Fisher criterion is incapable of dealing
                 with heteroscedasticity in the data. A technique based
                 on the Chernoff criterion for linear dimensionality
                 reduction has been proposed that is capable of
                 exploiting heteroscedastic information in the data.
                 While the Chernoff criterion has been shown to
                  outperform the Fisher criterion, a clear understanding of
                  its
                 exact behavior is lacking. In this article, we show
                 precisely what can be expected from the Chernoff
                 criterion. In particular, we show that the Chernoff
                 criterion exploits the Fisher and Fukunaga-Koontz
                 transforms in computing its linear discriminants.
                 Furthermore, we show that a recently proposed
                 decomposition of the data space into four subspaces is
                 incomplete. We provide arguments on how to best enrich
                 the decomposition of the data space in order to account
                 for heteroscedasticity in the data. Finally, we provide
                 experimental results validating our theoretical
                 analysis.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Agarwal:2013:ISI,
  author =       "Deepak Agarwal and Rich Caruana and Jian Pei and Ke
                 Wang",
  title =        "Introduction to the {Special Issue ACM SIGKDD 2012}",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2513092.2513093",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Rakthanmanon:2013:ABD,
  author =       "Thanawin Rakthanmanon and Bilson Campana and Abdullah
                 Mueen and Gustavo Batista and Brandon Westover and
                 Qiang Zhu and Jesin Zakaria and Eamonn Keogh",
  title =        "Addressing Big Data Time Series: Mining Trillions of
                 Time Series Subsequences Under Dynamic Time Warping",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2500489",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Most time series data mining algorithms use similarity
                 search as a core subroutine, and thus the time taken
                 for similarity search is the bottleneck for virtually
                 all time series data mining algorithms, including
                 classification, clustering, motif discovery, anomaly
                  detection, and so on. The difficulty of scaling search to
                  large datasets explains to a great extent why most
                  academic work on time series data mining has plateaued at
                  considering a few million time series
                 objects, while much of industry and science sits on
                 billions of time series objects waiting to be explored.
                 In this work we show that by using a combination of
                 four novel ideas we can search and mine massive time
                 series for the first time. We demonstrate the following
                 unintuitive fact: in large datasets we can exactly
                 search under Dynamic Time Warping (DTW) much more
                 quickly than the current state-of-the-art Euclidean
                 distance search algorithms. We demonstrate our work on
                 the largest set of time series experiments ever
                 attempted. In particular, the largest dataset we
                 consider is larger than the combined size of all of the
                 time series datasets considered in all data mining
                 papers ever published. We explain how our ideas allow
                 us to solve higher-level time series data mining
                 problems such as motif discovery and clustering at
                 scales that would otherwise be untenable. Moreover, we
                 show how our ideas allow us to efficiently support the
                 uniform scaling distance measure, a measure whose
                 utility seems to be underappreciated, but which we
                 demonstrate here. In addition to mining massive
                 datasets with up to one trillion datapoints, we will
                 show that our ideas also have implications for
                 real-time monitoring of data streams, allowing us to
                 handle much faster arrival rates and/or use cheaper and
                 lower powered devices than are currently possible.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
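
%%% One classic ingredient behind fast subsequence search under DTW is to
%%% guard the expensive exact computation with a cheap lower bound and a
%%% best-so-far threshold. The sketch below shows an LB_Keogh-style
%%% envelope bound; it illustrates the pruning pattern only, not the
%%% particular combination of optimizations introduced in the article:
%%%
%%%     import numpy as np
%%%
%%%     def lb_keogh(query, candidate, r):
%%%         """Lower bound on DTW(query, candidate) with band radius r."""
%%%         n, lb = len(query), 0.0
%%%         for i, c in enumerate(candidate):
%%%             lo, hi = max(0, i - r), min(n, i + r + 1)
%%%             u, l = query[lo:hi].max(), query[lo:hi].min()
%%%             if c > u:
%%%                 lb += (c - u) ** 2   # point above the query envelope
%%%             elif c < l:
%%%                 lb += (l - c) ** 2   # point below the query envelope
%%%         return lb
%%%
%%% In a search loop, exact DTW runs only when lb_keogh(...) falls below
%%% the best distance found so far, pruning most candidates cheaply.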

@Article{Sun:2013:PIM,
  author =       "Yizhou Sun and Brandon Norick and Jiawei Han and
                 Xifeng Yan and Philip S. Yu and Xiao Yu",
  title =        "{PathSelClus}: Integrating Meta-Path Selection with
                 User-Guided Object Clustering in Heterogeneous
                 Information Networks",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2500492",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Real-world, multiple-typed objects are often
                 interconnected, forming heterogeneous information
                 networks. A major challenge for link-based clustering
                 in such networks is their potential to generate many
                 different results, carrying rather diverse semantic
                  meanings. In order to generate the desired clustering, we
                  propose to use a meta-path, a path that connects object
                 types via a sequence of relations, to control
                 clustering with distinct semantics. Nevertheless, it is
                 easier for a user to provide a few examples (seeds)
                 than a weighted combination of sophisticated meta-paths
                 to specify her clustering preference. Thus, we propose
                 to integrate meta-path selection with user-guided
                 clustering to cluster objects in networks, where a user
                 first provides a small set of object seeds for each
                 cluster as guidance. Then the system learns the weight
                 for each meta-path that is consistent with the
                 clustering result implied by the guidance, and
                 generates clusters under the learned weights of
                 meta-paths. A probabilistic approach is proposed to
                 solve the problem, and an effective and efficient
                 iterative algorithm, PathSelClus, is proposed to learn
                 the model, where the clustering quality and the
                 meta-path weights mutually enhance each other. Our
                 experiments with several clustering tasks in two real
                 networks and one synthetic network demonstrate the
                 power of the algorithm in comparison with the
                 baselines.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Bellare:2013:ASE,
  author =       "Kedar Bellare and Suresh Iyengar and Aditya
                 Parameswaran and Vibhor Rastogi",
  title =        "Active Sampling for Entity Matching with Guarantees",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2500490",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In entity matching, a fundamental issue while training
                 a classifier to label pairs of entities as either
                  duplicates or nonduplicates is that of selecting
                 informative training examples. Although active learning
                 presents an attractive solution to this problem,
                 previous approaches minimize the misclassification rate
                 (0--1 loss) of the classifier, which is an unsuitable
                 metric for entity matching due to class imbalance
                 (i.e., many more nonduplicate pairs than duplicate
                 pairs). To address this, a recent paper [Arasu et al.
                 2010] proposes to maximize recall of the classifier
                 under the constraint that its precision should be
                 greater than a specified threshold. However, the
                  proposed technique requires the labels of all $n$ input
                 pairs in the worst case. Our main result is an active
                 learning algorithm that approximately maximizes recall
                 of the classifier while respecting a precision
                 constraint with provably sublinear label complexity
                 (under certain distributional assumptions). Our
                 algorithm uses as a black box any active learning
                  module that minimizes 0--1 loss. We show that the label
                  complexity of our algorithm is at most $ \log n $ times
                  the label complexity of the black box, and also bound the
                  difference in the recall of the classifier learned by our
                 algorithm and the recall of the optimal classifier
                 satisfying the precision constraint. We provide an
                 empirical evaluation of our algorithm on several
                 real-world matching data sets that demonstrates the
                 effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chattopadhyay:2013:BMA,
  author =       "Rita Chattopadhyay and Zheng Wang and Wei Fan and Ian
                 Davidson and Sethuraman Panchanathan and Jieping Ye",
  title =        "Batch Mode Active Sampling Based on Marginal
                 Probability Distribution Matching",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2513092.2513094",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Active Learning is a machine learning and data mining
                 technique that selects the most informative samples for
                 labeling and uses them as training data; it is
                  especially useful when there are large amounts of
                 unlabeled data and labeling them is expensive.
                 Recently, batch-mode active learning, where a set of
                 samples are selected concurrently for labeling, based
                 on their collective merit, has attracted a lot of
                 attention. The objective of batch-mode active learning
                 is to select a set of informative samples so that a
                 classifier learned on these samples has good
                 generalization performance on the unlabeled data. Most
                 of the existing batch-mode active learning
                 methodologies try to achieve this by selecting samples
                 based on certain criteria. In this article we propose a
                 novel criterion which achieves good generalization
                 performance of a classifier by specifically selecting a
                 set of query samples that minimize the difference in
                 distribution between the labeled and the unlabeled
                 data, after annotation. We explicitly measure this
                 difference based on all candidate subsets of the
                 unlabeled data and select the best subset. The proposed
                 objective is an NP-hard integer programming
                 optimization problem. We provide two optimization
                 techniques to solve this problem. In the first one, the
                 problem is transformed into a convex quadratic
                 programming problem and in the second method the
                 problem is transformed into a linear programming
                 problem. Our empirical studies using publicly available
                 UCI datasets and two biomedical image databases
                 demonstrate the effectiveness of the proposed approach
                 in comparison with the state-of-the-art batch-mode
                 active learning methods. We also present two extensions
                 of the proposed approach, which incorporate uncertainty
                 of the predicted labels of the unlabeled data and
                 transfer learning in the proposed formulation. In
                 addition, we present a joint optimization framework for
                 performing both transfer and active learning
                  simultaneously, unlike existing approaches that learn in
                  two separate stages, typically transfer learning followed
                  by active learning. We
                 specifically minimize a common objective of reducing
                 distribution difference between the domain adapted
                 source, the queried and labeled samples and the rest of
                 the unlabeled target domain data. Our empirical studies
                 on two biomedical image databases and on a publicly
                 available 20 Newsgroups dataset show that incorporation
                 of uncertainty information and transfer learning
                 further improves the performance of the proposed active
                 learning based classifier. Our empirical studies also
                 show that the proposed transfer-active method based on
                 the joint optimization framework performs significantly
                 better than a framework which implements transfer and
                 active learning in two separate stages.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
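
%%% The selection criterion above is distribution matching: choose the
%%% batch whose annotation makes the labeled pool look most like the
%%% unlabeled pool. The article casts this as an integer program solved
%%% via QP/LP relaxations; the greedy sketch below, using an RBF-kernel
%%% maximum mean discrepancy (MMD) as an assumed discrepancy measure, only
%%% illustrates the objective:
%%%
%%%     import numpy as np
%%%
%%%     def rbf(a, b, gamma=1.0):
%%%         d = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
%%%         return np.exp(-gamma * d)
%%%
%%%     def mmd2(x, y, gamma=1.0):
%%%         """Squared MMD between samples x and y (biased estimate)."""
%%%         return (rbf(x, x, gamma).mean() + rbf(y, y, gamma).mean()
%%%                 - 2.0 * rbf(x, y, gamma).mean())
%%%
%%%     def greedy_batch(labeled, unlabeled, k, gamma=1.0):
%%%         """Pick k points minimizing MMD(labeled + batch, unlabeled)."""
%%%         chosen, pool = [], list(range(len(unlabeled)))
%%%         for _ in range(k):
%%%             best = min(pool, key=lambda i: mmd2(
%%%                 np.vstack([labeled, unlabeled[chosen + [i]]]),
%%%                 unlabeled, gamma))
%%%             chosen.append(best)
%%%             pool.remove(best)
%%%         return chosen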

@Article{Briggs:2013:IAM,
  author =       "Forrest Briggs and Xiaoli Z. Fern and Raviv Raich and
                 Qi Lou",
  title =        "Instance Annotation for Multi-Instance Multi-Label
                 Learning",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2500491",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-instance multi-label learning (MIML) is a
                 framework for supervised classification where the
                 objects to be classified are bags of instances
                 associated with multiple labels. For example, an image
                 can be represented as a bag of segments and associated
                 with a list of objects it contains. Prior work on MIML
                 has focused on predicting label sets for previously
                 unseen bags. We instead consider the problem of
                 predicting instance labels while learning from data
                 labeled only at the bag level. We propose a regularized
                 rank-loss objective designed for instance annotation,
                 which can be instantiated with different aggregation
                 models connecting instance-level labels with bag-level
                 label sets. The aggregation models that we consider can
                 be factored as a linear function of a ``support
                 instance'' for each class, which is a single feature
                 vector representing a whole bag. Hence we name our
                 proposed methods rank-loss Support Instance Machines
                 (SIM). We propose two optimization methods for the
                 rank-loss objective, which is nonconvex. One is a
                 heuristic method that alternates between updating
                 support instances, and solving a convex problem in
                 which the support instances are treated as constant.
                 The other is to apply the constrained concave-convex
                 procedure (CCCP), which can also be interpreted as
                 iteratively updating support instances and solving a
                 convex problem. To solve the convex problem, we employ
                 the Pegasos framework of primal subgradient descent,
                 and prove that it finds an $ \epsilon $-suboptimal
                 solution in runtime that is linear in the number of
                 bags, instances, and $ 1 / \epsilon $. Additionally, we
                 suggest a method of extending the linear learning
                 algorithm to nonlinear classification, without
                 increasing the runtime asymptotically. Experiments on
                 artificial and real-world datasets including images and
                 audio show that the proposed methods achieve higher
                 accuracy than other loss functions used in prior work,
                 e.g., Hamming loss, and recent work in ambiguous label
                 classification.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Ji:2013:PFR,
  author =       "Ming Ji and Binbin Lin and Xiaofei He and Deng Cai and
                 Jiawei Han",
  title =        "Parallel Field Ranking",
  journal =      j-TKDD,
  volume =       "7",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2513092.2513096",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:07 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recently, ranking data with respect to the intrinsic
                 geometric structure (manifold ranking) has received
                  considerable attention, with encouraging performance
                 in many applications in pattern recognition,
                 information retrieval and recommendation systems. Most
                 of the existing manifold ranking methods focus on
                 learning a ranking function that varies smoothly along
                 the data manifold. However, beyond smoothness, a
                 desirable ranking function should vary monotonically
                 along the geodesics of the data manifold, such that the
                 ranking order along the geodesics is preserved. In this
                 article, we aim to learn a ranking function that varies
                 linearly and therefore monotonically along the
                 geodesics of the data manifold. Recent theoretical work
                 shows that the gradient field of a linear function on
                 the manifold has to be a parallel vector field.
                 Therefore, we propose a novel ranking algorithm on the
                 data manifolds, called Parallel Field Ranking.
                 Specifically, we try to learn a ranking function and a
                 vector field simultaneously. We require the vector
                 field to be close to the gradient field of the ranking
                 function, and the vector field to be as parallel as
                 possible. Moreover, we require the value of the ranking
                 function at the query point to be the highest, and then
                 decrease linearly along the manifold. Experimental
                 results on both synthetic data and real data
                 demonstrate the effectiveness of our proposed
                 algorithm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Adali:2013:IPR,
  author =       "Sibel Adali and Malik Magdon-Ismail and Xiaohui Lu",
  title =        "{iHypR}: Prominence ranking in networks of
                 collaborations with hyperedges 1",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541268.2541269",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present a new algorithm called iHypR for computing
                 prominence of actors in social networks of
                 collaborations. Our algorithm builds on the assumption
                 that prominent actors collaborate on prominent objects,
                 and prominent objects are naturally grouped into
                 prominent clusters or groups (hyperedges in a graph).
                 iHypR makes use of the relationships between actors,
                 objects, and hyperedges to compute a global prominence
                 score for the actors in the network. We do not assume
                 the hyperedges are given in advance. Hyperedges
                 computed by our method can perform as well or even
                 better than ``true'' hyperedges. Our algorithm is
                 customized for networks of collaborations, but it is
                 generally applicable without further tuning. We show,
                 through extensive experimentation with three real-life
                 data sets and multiple external measures of prominence,
                 that our algorithm outperforms existing well-known
                 algorithms. Our work is the first to offer such an
                 extensive evaluation. We show that unlike most existing
                 algorithms, the performance is robust across multiple
                 measures of performance. Further, we give a detailed
                 study of the sensitivity of our algorithm to different
                 data sets and the design choices within the algorithm
                 that a user may wish to change. Our article illustrates
                 the various trade-offs that must be considered in
                 computing prominence in collaborative social
                 networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Huang:2013:STP,
  author =       "Jin Huang and Feiping Nie and Heng Huang and Yi-Cheng
                 Tu and Yu Lei",
  title =        "Social trust prediction using heterogeneous networks",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541268.2541270",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Along with increasing popularity of social websites,
                  online users rely more on trustworthiness
                 information to make decisions, extract and filter
                 information, and tag and build connections with other
                 users. However, such social network data often suffer
                 from severe data sparsity and are not able to provide
                 users with enough information. Therefore, trust
                 prediction has emerged as an important topic in social
                 network research. Traditional approaches are primarily
                 based on exploring trust graph topology itself.
                 However, research in sociology and our life experience
                 suggest that people who are in the same social circle
                 often exhibit similar behaviors and tastes. To take
                 advantage of the ancillary information for trust
                 prediction, the challenge then becomes what to transfer
                 and how to transfer. In this article, we address this
                 problem by aggregating heterogeneous social networks
                 and propose a novel joint social networks mining (JSNM)
                 method. Our new joint learning model explores the
                 user-group-level similarity between correlated graphs
                 and simultaneously learns the individual graph
                 structure; therefore, the shared structures and
                 patterns from multiple social networks can be utilized
                 to enhance the prediction tasks. As a result, we not
                 only improve the trust prediction in the target graph
                 but also facilitate other information retrieval tasks
                 in the auxiliary graphs. To optimize the proposed
                  objective function, we use an alternating technique to
                  break the objective function down into several manageable
                  subproblems. We further introduce an auxiliary function to
                  solve the optimization problems with rigorously proven
                  convergence. Extensive experiments have been conducted on
                  both synthetic and real-world data. All empirical results
                  demonstrate the effectiveness of our method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Guzzo:2013:SIF,
  author =       "Antonella Guzzo and Luigi Moccia and Domenico
                 Sacc{\`a} and Edoardo Serra",
  title =        "Solving inverse frequent itemset mining with
                 infrequency constraints via large-scale linear
                 programs",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "18:1--18:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541268.2541271",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Inverse frequent set mining (IFM) is the problem of
                  computing a transaction database $D$ satisfying given
                 support constraints for some itemsets, which are
                 typically the frequent ones. This article proposes a
                 new formulation of IFM, called IFM$_I$ (IFM with
                 infrequency constraints), where the itemsets that are
                 not listed as frequent are constrained to be
                 infrequent; that is, they must have a support less than
                 or equal to a specified unique threshold. An instance
                 of IFM$_I$ can be seen as an instance of the original
                 IFM by making explicit the infrequency constraints for
                 the minimal infrequent itemsets, corresponding to the
                 so-called negative generator border defined in the
                 literature. The complexity increase from PSPACE
                 (complexity of IFM) to NEXP (complexity of IFM$_I$) is
                 caused by the cardinality of the negative generator
                 border, which can be exponential in the original input
                 size. Therefore, the article introduces a specific
                 problem parameter $ \kappa $ that computes an upper
                 bound to this cardinality using a hypergraph
                 interpretation for which minimal infrequent itemsets
                  correspond to minimal transversals. By fixing a constant
                  $k$, the article formulates a $k$-bounded definition of
                  the problem, called $k$-IFM$_I$, that collects all
                  instances for which the value of the parameter $ \kappa $
                  is less than or equal to $k$; its complexity is in PSPACE,
                  as for IFM. The bounded problem is encoded as an integer
                  linear program with a large number of variables (actually
                  exponential w.r.t. the number of constraints), which is
                  thereafter approximated by relaxing the integer
                  constraints; the decision problem of solving the linear
                  program is proven to be in NP. In order to solve the
                  linear
                 program, a column generation technique is used that is
                 a variation of the simplex method designed to solve
                 large-scale linear programs, in particular with a huge
                 number of variables. The method at each step requires
                 the solution of an auxiliary integer linear program,
                 which is proven to be NP hard in this case and for
                 which a greedy heuristic is presented. The resulting
                 overall column generation solution algorithm enjoys
                 very good scaling as evidenced by the intensive
                 experimentation, thereby paving the way for its
                 application in real-life scenarios.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Balcazar:2013:FCP,
  author =       "Jos{\'e} L. Balc{\'a}zar",
  title =        "Formal and computational properties of the confidence
                 boost of association rules",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "19:1--19:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541268.2541272",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Some existing notions of redundancy among association
                 rules allow for a logical-style characterization and
                 lead to irredundant bases of absolutely minimum size.
                 We push the intuition of redundancy further to find an
                 intuitive notion of novelty of an association rule,
                 with respect to other rules. Namely, an irredundant
                 rule is so because its confidence is higher than what
                 the rest of the rules would suggest; then, one can ask:
                 how much higher? We propose to measure such a sort of
                 novelty through the confidence boost of a rule. Acting
                 as a complement to confidence and support, the
                 confidence boost helps to obtain small and crisp sets
                 of mined association rules and solves the well-known
                 problem that, in certain cases, rules of negative
                 correlation may pass the confidence bound. We analyze
                 the properties of two versions of the notion of
                 confidence boost, one of them a natural generalization
                 of the other. We develop algorithms to filter rules
                 according to their confidence boost, compare the
                 concept to some similar notions in the literature, and
                 describe the results of some experimentation employing
                 the new notions on standard benchmark datasets. We
                 describe an open source association mining tool that
                 embodies one of our variants of confidence boost in
                 such a way that the data mining process does not
                 require the user to select any value for any
                 parameter.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
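
%%% The confidence boost measures how far a rule's confidence exceeds what
%%% related rules already suggest. A toy sketch in which the "related
%%% rules" are those with a proper subset of the antecedent and the same
%%% consequent (a simplification of the article's exact definition):
%%%
%%%     from itertools import combinations
%%%
%%%     def support(db, items):
%%%         return sum(items <= t for t in db) / len(db)
%%%
%%%     def confidence(db, antecedent, consequent):
%%%         return (support(db, antecedent | consequent)
%%%                 / support(db, antecedent))
%%%
%%%     def confidence_boost(db, antecedent, consequent):
%%%         # Best confidence achievable by a strictly more general rule.
%%%         base = max(confidence(db, frozenset(s), consequent)
%%%                    for k in range(len(antecedent))
%%%                    for s in combinations(antecedent, k)
%%%                    if support(db, frozenset(s)) > 0)
%%%         return confidence(db, antecedent, consequent) / base
%%%
%%%     db = [frozenset(t) for t in (["a","b","c"], ["a","b"], ["a","c"],
%%%                                  ["b","c"], ["a","b","c"])]
%%%     print(confidence_boost(db, frozenset("ab"), frozenset("c")))  # ~0.83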

@Article{Ang:2013:CPN,
  author =       "Hock Hee Ang and Vivekanand Gopalkrishnan and Steven
                 C. H. Hoi and Wee Keong Ng",
  title =        "Classification in {P2P} networks with cascade support
                 vector machines",
  journal =      j-TKDD,
  volume =       "7",
  number =       "4",
  pages =        "20:1--20:??",
  month =        nov,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2541268.2541273",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:09 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Classification in Peer-to-Peer (P2P) networks is
                 important to many real applications, such as
                 distributed intrusion detection, distributed
                 recommendation systems, and distributed antispam
                 detection. However, it is very challenging to perform
                 classification in P2P networks due to many practical
                 issues, such as scalability, peer dynamism, and
                 asynchronism. This article investigates the practical
                 techniques of constructing Support Vector Machine (SVM)
                 classifiers in the P2P networks. In particular, we
                 demonstrate how to efficiently cascade SVM in a P2P
                 network with the use of reduced SVM. In addition, we
                 propose to fuse the concept of cascade SVM with
                 bootstrap aggregation to effectively balance the
                 trade-off between classification accuracy, model
                 construction, and prediction cost. We provide
                 theoretical insights for the proposed solutions and
                 conduct an extensive set of empirical studies on a
                 number of large-scale datasets. Encouraging results
                 validate the efficacy of the proposed approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chen:2014:ISI,
  author =       "Wei Chen and Jie Tang",
  title =        "Introduction to special issue on computational aspects
                 of social and information networks: Theory,
                 methodologies, and applications {(TKDD-CASIN)}",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2556608",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yang:2014:USN,
  author =       "Zhi Yang and Christo Wilson and Xiao Wang and Tingting
                 Gao and Ben Y. Zhao and Yafei Dai",
  title =        "Uncovering social network {Sybils} in the wild",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2556609",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Sybil accounts are fake identities created to unfairly
                 increase the power or resources of a single malicious
                 user. Researchers have long known about the existence
                 of Sybil accounts in online communities such as
                 file-sharing systems, but they have not been able to
                 perform large-scale measurements to detect them or
                 measure their activities. In this article, we describe
                 our efforts to detect, characterize, and understand
                 Sybil account activity in the Renren Online Social
                 Network (OSN). We use ground truth provided by Renren
                 Inc. to build measurement-based Sybil detectors and
                 deploy them on Renren to detect more than 100,000 Sybil
                 accounts. Using our full dataset of 650,000 Sybils, we
                 examine several aspects of Sybil behavior. First, we
                 study their link creation behavior and find that
                 contrary to prior conjecture, Sybils in OSNs do not
                 form tight-knit communities. Next, we examine the
                 fine-grained behaviors of Sybils on Renren using
                 clickstream data. Third, we investigate
                 behind-the-scenes collusion between large groups of
                 Sybils. Our results reveal that Sybils with no explicit
                 social ties still act in concert to launch attacks.
                 Finally, we investigate enhanced techniques to identify
                 stealthy Sybils. In summary, our study advances the
                 understanding of Sybil behavior on OSNs and shows that
                 Sybils can effectively avoid existing community-based
                 Sybil detectors. We hope that our results will foster
                 new research on Sybil detection that is based on novel
                 types of Sybil features.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Jin:2014:SAR,
  author =       "Ruoming Jin and Victor E. Lee and Longjie Li",
  title =        "Scalable and axiomatic ranking of network role
                 similarity",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2518176",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A key task in analyzing social networks and other
                 complex networks is role analysis: describing and
                 categorizing nodes according to how they interact with
                 other nodes. Two nodes have the same role if they
                 interact with equivalent sets of neighbors. The most
                 fundamental role equivalence is automorphic
                 equivalence. Unfortunately, the fastest algorithms
                 known for graph automorphism are nonpolynomial.
                 Moreover, since exact equivalence is rare, a more
                 meaningful task is measuring the role similarity
                 between any two nodes. This task is closely related to
                 the structural or link-based similarity problem that
                 SimRank addresses. However, SimRank and other existing
                 similarity measures are not sufficient because they do
                 not guarantee to recognize automorphically or
                 structurally equivalent nodes. This article makes two
                 contributions. First, we present and justify several
                 axiomatic properties necessary for a role similarity
                 measure or metric. Second, we present RoleSim, a new
                 similarity metric that satisfies these axioms and can
                 be computed with a simple iterative algorithm. We
                 rigorously prove that RoleSim satisfies all of these
                 axiomatic properties. We also introduce Iceberg
                 RoleSim, a scalable algorithm that discovers all pairs
                 with RoleSim scores above a user-defined threshold $
                 \theta $. We demonstrate the interpretative power of
                 RoleSim on both synthetic and real datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
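
The iterative algorithm mentioned above admits a compact sketch. The
update below follows the RoleSim recurrence (an averaged matching of
neighbor similarities, damped by a decay factor beta), except that it
greedily approximates the optimal weighted matching, so it illustrates
the recurrence rather than reproducing the paper's exact procedure.

import numpy as np

def rolesim(adj, beta=0.15, iters=10):
    """RoleSim-style iterative similarity (sketch). `adj` is a list of
    neighbor lists for nodes 0..n-1. The paper matches neighbors with
    an optimal weighted matching; this sketch matches greedily."""
    n = len(adj)
    sim = np.ones((n, n))
    for _ in range(iters):
        new = np.empty_like(sim)
        for u in range(n):
            for v in range(n):
                Nu, Nv = adj[u], adj[v]
                if not Nu or not Nv:
                    new[u, v] = beta
                    continue
                # Greedy matching of neighbor pairs by similarity.
                pairs = sorted(((sim[x, y], x, y) for x in Nu for y in Nv),
                               reverse=True)
                used_x, used_y, total = set(), set(), 0.0
                for s, x, y in pairs:
                    if x not in used_x and y not in used_y:
                        used_x.add(x); used_y.add(y)
                        total += s
                new[u, v] = (1 - beta) * total / max(len(Nu), len(Nv)) + beta
        sim = new
    return sim

# Two disjoint 3-node paths: automorphically equivalent nodes (e.g.,
# the two path midpoints 1 and 4) end up with identical similarity rows.
adj = [[1], [0, 2], [1], [4], [3, 5], [4]]
print(np.round(rolesim(adj, iters=5), 3))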
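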

@Article{Mcauley:2014:DSC,
  author =       "Julian Mcauley and Jure Leskovec",
  title =        "Discovering social circles in ego networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2556612",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "People's personal social networks are big and
                 cluttered, and currently there is no good way to
                 automatically organize them. Social networking sites
                 allow users to manually categorize their friends into
                 social circles (e.g., ``circles'' on Google+, and
                 ``lists'' on Facebook and Twitter). However, circles
                 are laborious to construct and must be manually updated
                 whenever a user's network grows. In this article, we
                 study the novel task of automatically identifying
                 users' social circles. We pose this task as a
                 multimembership node clustering problem on a user's ego
                 network, a network of connections between her friends.
                 We develop a model for detecting circles that combines
                 network structure as well as user profile information.
                 For each circle, we learn its members and the
                 circle-specific user profile similarity metric.
                 Modeling node membership to multiple circles allows us
                 to detect overlapping as well as hierarchically nested
                 circles. Experiments show that our model accurately
                 identifies circles on a diverse set of data from
                 Facebook, Google+, and Twitter, for all of which we
                 obtain hand-labeled ground truth.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Abrahao:2014:SFA,
  author =       "Bruno Abrahao and Sucheta Soundarajan and John
                 Hopcroft and Robert Kleinberg",
  title =        "A separability framework for analyzing community
                 structure",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2527231",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Four major factors govern the intricacies of community
                 extraction in networks: (1) the literature offers a
                 multitude of disparate community detection algorithms
                 whose output exhibits high structural variability
                 across the collection, (2) communities identified by
                 algorithms may differ structurally from real
                 communities that arise in practice, (3) there is no
                 consensus characterizing how to discriminate
                 communities from noncommunities, and (4) the
                 application domain includes a wide variety of networks
                 of fundamentally different natures. In this article, we
                 present a class separability framework to tackle these
                 challenges through a comprehensive analysis of
                 community properties. Our approach enables the
                 assessment of the structural dissimilarity among the
                 output of multiple community detection algorithms and
                 between the output of algorithms and communities that
                 arise in practice. In addition, our method provides us
                 with a way to organize the vast collection of community
                 detection algorithms by grouping those that behave
                 similarly. Finally, we identify the most discriminative
                 graph-theoretical properties of community signature and
                 the small subset of properties that account for most of
                 the biases of the different community detection
                 algorithms. We illustrate our approach with an
                 experimental analysis, which reveals nuances of the
                 structure of real and extracted communities. In our
                 experiments, we furnish our framework with the output
                 of 10 different community detection procedures,
                 representative of categories of popular algorithms
                 available in the literature, applied to a diverse
                 collection of large-scale real network datasets whose
                 domains span biology, online shopping, and social
                 systems. We also analyze communities identified by
                 annotations that accompany the data, which reflect
                 exemplar communities in various domains. We characterize
                 these communities using a broad spectrum of community
                 properties to produce the different structural classes.
                 As our experiments show that community structure is not
                 a universal concept, our framework enables an informed
                 choice of the most suitable community detection method
                 for identifying communities of a specific type in a
                 given network and allows for a comparison of existing
                 community detection algorithms while guiding the design
                 of new ones.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zhong:2014:UBL,
  author =       "Erheng Zhong and Wei Fan and Qiang Yang",
  title =        "User behavior learning and transfer in composite
                 social networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2556613",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Mar 13 09:16:11 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Accurate prediction of user behaviors is important for
                 many social media applications, including social
                 marketing, personalization, and recommendation. A major
                 challenge lies in that although many previous works
                 model user behavior from only historical behavior logs,
                 the available user behavior data or interactions
                 between users and items in a given social network are
                 usually very limited and sparse (e.g., $ \geq 99.9 \% $
                 empty), which makes models overfit the rare
                 observations and fail to provide accurate predictions.
                 We observe that many people are members of several
                 social networks at the same time, such as Facebook,
                 Twitter, and Tencent's QQ. Importantly, users'
                 behaviors and interests in different networks influence
                 one another. This provides an opportunity to leverage
                 the knowledge of user behaviors in different networks
                 by considering the overlapping users in different
                 networks as bridges, in order to alleviate the data
                 sparsity problem, and enhance the predictive
                 performance of user behavior modeling. Combining
                 different networks ``simply and naively'' does not work
                 well. In this article, we formulate the problem to
                 model multiple networks as ``adaptive composite
                 transfer'' and propose a framework called ComSoc.
                 ComSoc first selects the most suitable networks inside
                 a composite social network via a hierarchical Bayesian
                 model, parameterized for individual users. It then
                 builds topic models for user behavior prediction using
                 both the relationships in the selected networks and
                 related behavior data. With different relational
                 regularization, we introduce different implementations,
                 corresponding to different ways to transfer knowledge
                 from composite social relations. To handle big data, we
                 have implemented the algorithm using Map/Reduce. We
                 demonstrate that the proposed composite network-based
                 user behavior models significantly improve the
                 predictive accuracy over a number of existing
                 approaches on several real-world applications,
                 including a very large social networking dataset from
                 Tencent Inc.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Ahmed:2014:NSS,
  author =       "Nesreen K. Ahmed and Jennifer Neville and Ramana
                 Kompella",
  title =        "Network Sampling: From Static to Streaming Graphs",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2601438",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Network sampling is integral to the analysis of
                 social, information, and biological networks. Since
                 many real-world networks are massive in size,
                 continuously evolving, and/or distributed in nature,
                 the network structure is often sampled in order to
                 facilitate study. For these reasons, a more thorough
                 and complete understanding of network sampling is
                 critical to support the field of network science. In
                 this paper, we outline a framework for the general
                 problem of network sampling by highlighting the
                 different objectives, population and units of interest,
                 and classes of network sampling methods. In addition,
                 we propose a spectrum of computational models for
                 network sampling methods, ranging from the
                 traditionally studied model based on the assumption of
                 a static domain to a more challenging model that is
                 appropriate for streaming domains. We design a family
                 of sampling methods based on the concept of graph
                 induction that generalize across the full spectrum of
                 computational models (from static to streaming) while
                 efficiently preserving many of the topological
                 properties of the input graphs. Furthermore, we
                 demonstrate how traditional static sampling algorithms
                 can be modified for graph streams for each of the three
                 main classes of sampling methods: node, edge, and
                 topology-based sampling. Experimental results indicate
                 that our proposed family of sampling methods more
                 accurately preserve the underlying properties of the
                 graph in both static and streaming domains. Finally, we
                 study the impact of network sampling algorithms on the
                 parameter estimation and performance evaluation of
                 relational classification algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
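
One way to picture the streaming end of this spectrum is a reservoir of
edges augmented with a graph-induction rule, in the spirit of the paper's
partially induced edge sampling (this sketch is ours; among other
simplifications it never retires nodes whose edges leave the reservoir).

import random

def streaming_edge_sample(edge_stream, k):
    """Reservoir-style edge sampling with a partial induction step:
    keep a uniform reservoir of k edges, and additionally admit any
    streamed edge whose endpoints were both already sampled, so the
    subgraph stays closer to the induced one."""
    reservoir = []        # uniform sample of edges
    induced = set()       # extra edges closing pairs of sampled nodes
    nodes = set()
    for i, (u, v) in enumerate(edge_stream):
        if len(reservoir) < k:
            reservoir.append((u, v))
            nodes.update((u, v))
        else:
            j = random.randint(0, i)
            if j < k:                  # classic reservoir replacement
                reservoir[j] = (u, v)
                nodes.update((u, v))
            elif u in nodes and v in nodes:
                induced.add((u, v))    # graph-induction step
    return reservoir, induced

random.seed(0)
stream = [(random.randrange(50), random.randrange(50)) for _ in range(2000)]
sample, extra = streaming_edge_sample(stream, k=100)
print(len(sample), len(extra))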

@Article{Ge:2014:RMA,
  author =       "Yong Ge and Guofei Jiang and Min Ding and Hui Xiong",
  title =        "Ranking Metric Anomaly in Invariant Networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2601436",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The management of large-scale distributed information
                 systems relies on the effective use and modeling of
                 monitoring data collected at various points in the
                 distributed information systems. A traditional approach
                 to model monitoring data is to discover invariant
                 relationships among the monitoring data. Indeed, we can
                 discover all invariant relationships among all pairs of
                 monitoring data and generate invariant networks, where
                 a node is a monitoring data source (metric) and a link
                 indicates an invariant relationship between two
                 monitoring data. Such an invariant network
                 representation can help system experts to localize and
                 diagnose the system faults by examining those broken
                 invariant relationships and their related metrics,
                 since system faults usually propagate among the
                 monitoring data and eventually lead to some broken
                 invariant relationships. However, at any given time,
                 there are usually many broken links (invariant
                 relationships) within an invariant network. Without
                 proper guidance, it is difficult for system experts to
                 manually inspect this large number of broken links. To
                 this end, in this article, we propose the problem of
                 ranking metrics according to the anomaly levels for a
                 given invariant network, while this is a nontrivial
                 task due to the uncertainties and the complex nature of
                 invariant networks. Specifically, we propose two types
                 of algorithms for ranking metric anomaly by link
                 analysis in invariant networks. Along this line, we
                 first define two measurements to quantify the anomaly
                 level of each metric, and introduce the mRank
                 algorithm. Also, we provide a weighted score mechanism
                 and develop the gRank algorithm, which involves an
                 iterative process to obtain a score to measure the
                 anomaly levels. In addition, some extended algorithms
                 based on the mRank and gRank algorithms are developed by
                 taking into account the probability of being broken as
                 well as noisy links. Finally, we validate all the
                 proposed algorithms on a large number of real-world and
                 synthetic data sets to illustrate the effectiveness and
                 efficiency of different algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
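
A much-simplified stand-in for the ranking task may help build
intuition: score each metric by the fraction of its invariant links
that are currently broken, and rank by that score. The paper's mRank
and gRank algorithms are considerably more refined (two measurements,
weighted scores, iteration, broken-probability handling); this sketch
only conveys the shape of the problem.

from collections import defaultdict

def rank_metric_anomaly(invariants, broken):
    """Rank metrics in an invariant network by a naive anomaly score:
    the fraction of a metric's invariant links observed broken.
    invariants: iterable of (a, b) metric pairs (the network's links).
    broken:     collection of links violated at fault time."""
    degree = defaultdict(int)
    broken_degree = defaultdict(int)
    broken = {frozenset(e) for e in broken}
    for a, b in invariants:
        degree[a] += 1; degree[b] += 1
        if frozenset((a, b)) in broken:
            broken_degree[a] += 1; broken_degree[b] += 1
    scores = {m: broken_degree[m] / degree[m] for m in degree}
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

links = [("cpu", "io"), ("cpu", "net"), ("io", "disk"), ("net", "disk")]
violated = {("cpu", "io"), ("cpu", "net")}
print(rank_metric_anomaly(links, violated))  # "cpu" ranks first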

@Article{Zhang:2014:DGP,
  author =       "Gensheng Zhang and Xiao Jiang and Ping Luo and Min
                 Wang and Chengkai Li",
  title =        "Discovering General Prominent Streaks in Sequence
                 Data",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2601439",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article studies the problem of prominent streak
                 discovery in sequence data. Given a sequence of values,
                 a prominent streak is a long consecutive subsequence
                 consisting of only large (small) values, such as
                 consecutive games of outstanding performance in sports,
                 consecutive hours of heavy network traffic, and
                 consecutive days of frequent mentioning of a person in
                 social media. Prominent streak discovery provides
                 insightful data patterns for data analysis in many
                 real-world applications and is an enabling technique
                 for computational journalism. Given its real-world
                 usefulness and complexity, the research on prominent
                 streaks in sequence data opens a spectrum of
                 challenging problems. A baseline approach to finding
                 prominent streaks is a quadratic algorithm that
                 exhaustively enumerates all possible streaks and
                 performs pairwise streak dominance comparison. For more
                 efficient methods, we make the observation that
                 prominent streaks are in fact skyline points in two
                 dimensions: streak interval length and minimum value in
                 the interval. Our solution thus hinges on the idea to
                 separate the two steps in prominent streak discovery:
                 candidate streak generation and skyline operation over
                 candidate streaks. For candidate generation, we propose
                 the concept of local prominent streak (LPS). We prove
                 that prominent streaks are a subset of LPSs and the
                 number of LPSs is less than the length of a data
                 sequence, in comparison with the quadratic number of
                 candidates produced by the brute-force baseline method.
                 We develop efficient algorithms based on the concept of
                 LPS. The nonlinear local prominent streak (NLPS)-based
                 method considers a superset of LPSs as candidates, and
                 the linear local prominent streak (LLPS)-based method
                 further guarantees to consider only LPSs. The proposed
                 properties and algorithms are also extended for
                 discovering general top-$k$, multisequence, and
                 multidimensional prominent streaks. The results of
                 experiments using multiple real datasets verified the
                 effectiveness of the proposed methods and showed orders
                 of magnitude performance improvement against the
                 baseline method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
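
The two-step structure described above, candidate generation followed by
a skyline pass, can be sketched compactly. The candidate generator below
uses a monotonic stack to emit, for each value, the maximal interval in
which that value is the minimum, which is one way to realize the
local-prominent-streak idea; it is an illustration, not the paper's
NLPS or LLPS algorithms.

def local_prominent_streaks(seq):
    """Emit (length, min_value) candidates: for each value, the maximal
    interval in which it is the minimum, found with an increasing stack
    of (start_index, value) pairs."""
    stack, cands = [], []
    for i, v in enumerate(seq + [float("-inf")]):  # sentinel flushes stack
        start = i
        while stack and stack[-1][1] >= v:
            s, val = stack.pop()
            cands.append((i - s, val))   # streak [s, i-1] with min = val
            start = s
        stack.append((start, v))
    return cands

def skyline(cands):
    """Keep streaks not dominated in both length and minimum value."""
    cands = sorted(cands, key=lambda c: (-c[0], -c[1]))
    result, best_min = [], float("-inf")
    for length, mn in cands:
        if mn > best_min:
            result.append((length, mn))
            best_min = mn
    return result

values = [3, 1, 7, 7, 2, 5, 4, 6, 7, 3]
print(skyline(local_prominent_streaks(values)))
# -> [(10, 1), (8, 2), (5, 3), (4, 4), (2, 7)]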

@Article{Schifanella:2014:MTD,
  author =       "Claudio Schifanella and K. Sel{\c{c}}uk Candan and
                 Maria Luisa Sapino",
  title =        "Multiresolution Tensor Decompositions with Mode
                 Hierarchies",
  journal =      j-TKDD,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2532169",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:22 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Tensors (multidimensional arrays) are widely used for
                 representing high-order dimensional data in
                 applications ranging from social networks to sensor
                 data and Internet traffic. Multiway data analysis
                 techniques, in particular tensor decompositions, allow
                 extraction of hidden correlations among multiway data
                 and thus are key components of many data analysis
                 frameworks. Intuitively, these algorithms can be
                 thought of as multiway clustering schemes, which
                 consider multiple facets of the data in identifying
                 clusters, their weights, and contributions of each data
                 element. Unfortunately, algorithms for fitting multiway
                 models are, in general, iterative and very time
                 consuming. In this article, we observe that, in many
                 applications, there is a priori background knowledge
                 (or metadata) about one or more domain dimensions. This
                 metadata is often in the form of a hierarchy that
                 clusters the elements of a given data facet (or mode).
                 We investigate whether such single-mode data
                 hierarchies can be used to boost the efficiency of
                 tensor decomposition process, without significant
                 impact on the final decomposition quality. We consider
                 each domain hierarchy as a guide to help provide
                 higher- or lower-resolution views of the data in the
                 tensor on demand and we rely on these metadata-induced
                 multiresolution tensor representations to develop a
                 multiresolution approach to tensor decomposition. In
                 this article, we focus on an alternating least squares
                 (ALS)--based implementation of the two most important
                 decomposition models: the PARAllel FACtors model
                 (PARAFAC, which decomposes a tensor into a diagonal
                 tensor and a set of factor matrices) and the Tucker
                 model (which produces as its result a core tensor and a
                 set of dimension-subspace matrices). Experimental results show
                 that, when the available metadata is used as a rough
                 guide, the proposed multiresolution method helps fit
                 both PARAFAC and Tucker models with consistent (under
                 different parameters settings) savings in execution
                 time and memory consumption, while preserving the
                 quality of the decomposition.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Huang:2014:RMN,
  author =       "Jin Huang and Feiping Nie and Heng Huang and Chris
                 Ding",
  title =        "Robust Manifold Nonnegative Matrix Factorization",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2601434",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Nonnegative Matrix Factorization (NMF) has been one of
                 the most widely used clustering techniques for
                 exploratory data analysis. However, since each data
                 point enters the objective function with squared
                 residue error, a few outliers with large errors easily
                 dominate the objective function. In this article, we
                 propose a Robust Manifold Nonnegative Matrix
                 Factorization (RMNMF) method using the $l_{2,1}$-norm and
                 integrating NMF and spectral clustering under the same
                 clustering framework. We also point out the solution
                 uniqueness issue for the existing NMF methods and
                 propose an additional orthonormal constraint to address
                 this problem. With the new constraint, the conventional
                 auxiliary function approach no longer works. We tackle
                 this difficult optimization problem via a novel
                 Augmented Lagrangian Method (ALM)--based algorithm and
                 convert the original constrained optimization problem
                 on one variable into a multivariate constrained
                 problem. The new objective function then can be
                 decomposed into several subproblems that each has a
                 closed-form solution. More importantly, we reveal the
                 connection of our method with robust $K$-means and
                 spectral clustering, and we demonstrate its theoretical
                 significance. Extensive experiments have been conducted
                 on nine benchmark datasets, and all empirical results
                 show the effectiveness of our method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
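
The role of the $l_{2,1}$-norm in this formulation is easy to show
numerically: each row (data point) contributes its unsquared residual
norm, so an outlier row dominates far less than under a squared
Frobenius objective. The snippet also computes the diagonal reweighting
quantity used by the standard iteratively-reweighted trick for
$l_{2,1}$ objectives; this is a generic building block, not the paper's
full ALM-based algorithm.

import numpy as np

def l21_norm(E):
    """l2,1 norm: the sum of the Euclidean norms of the rows."""
    return np.linalg.norm(E, axis=1).sum()

def l21_row_weights(E, eps=1e-8):
    """Weights 1 / (2 * ||e_i||) from the iteratively-reweighted
    formulation of an l2,1 objective; outlier rows get small weights."""
    return 1.0 / (2.0 * np.maximum(np.linalg.norm(E, axis=1), eps))

E = np.array([[0.1, 0.1], [0.2, -0.1], [5.0, 4.0]])  # last row = outlier
row_norms = np.linalg.norm(E, axis=1)
print("outlier share, squared loss:", row_norms[2]**2 / (row_norms**2).sum())
print("outlier share, l2,1 loss:   ", row_norms[2] / row_norms.sum())
print("reweighting:", l21_row_weights(E))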

@Article{Zhang:2014:RAL,
  author =       "Yu Zhang and Dit-Yan Yeung",
  title =        "A Regularization Approach to Learning Task
                 Relationships in Multitask Learning",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2538028",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multitask learning is a learning paradigm that seeks
                 to improve the generalization performance of a learning
                 task with the help of some other related tasks. In this
                 article, we propose a regularization approach to
                 learning the relationships between tasks in multitask
                 learning. This approach can be viewed as a novel
                 generalization of the regularized formulation for
                 single-task learning. Besides modeling positive task
                 correlation, our approach, multitask relationship
                 learning (MTRL), can also describe negative task
                 correlation and identify outlier tasks based on the
                 same underlying principle. By utilizing a
                 matrix-variate normal distribution as a prior on the
                 model parameters of all tasks, our MTRL method has a
                 jointly convex objective function. For efficiency, we
                 use an alternating method to learn the optimal model
                 parameters for each task as well as the relationships
                 between tasks. We study MTRL in the symmetric multitask
                 learning setting and then generalize it to the
                 asymmetric setting as well. We also discuss some
                 variants of the regularization approach to demonstrate
                 the use of other matrix-variate priors for learning
                 task relationships. Moreover, to gain more insight into
                 our model, we also study the relationships between MTRL
                 and some existing multitask learning methods.
                 Experiments conducted on a toy problem as well as
                 several benchmark datasets demonstrate the
                 effectiveness of MTRL as well as its high
                 interpretability revealed by the task covariance
                 matrix.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
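
The alternating step for the task-relationship matrix has a closed form
often quoted for MTRL-style formulations; the sketch below (our
rendering, on illustrative data) computes it from the stacked task
parameter matrix and shows an anticorrelated task surfacing in the
learned covariance.

import numpy as np
from scipy.linalg import sqrtm

def task_covariance_update(W):
    """Closed-form update of the task-relationship matrix in an
    MTRL-style alternating scheme: with task parameter matrix W
    (d features x m tasks),
        Omega = (W^T W)^{1/2} / trace((W^T W)^{1/2}),
    the trace-one PSD minimizer of tr(W Omega^{-1} W^T). One step of
    the alternation only, not the full algorithm."""
    S = np.real(sqrtm(W.T.dot(W)))
    return S / np.trace(S)

# Two positively correlated tasks plus one anticorrelated outlier task.
rng = np.random.default_rng(1)
base = rng.normal(size=(10, 1))
W = np.hstack([base + 0.1 * rng.normal(size=(10, 1)),
               base + 0.1 * rng.normal(size=(10, 1)),
               -base])
Omega = task_covariance_update(W)
corr = Omega / np.sqrt(np.outer(np.diag(Omega), np.diag(Omega)))
print(np.round(corr, 2))  # third task shows negative correlation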

@Article{Lin:2014:SCR,
  author =       "Ming Lin and Shifeng Weng and Changshui Zhang",
  title =        "On the Sample Complexity of Random {Fourier} Features
                 for Online Learning: How Many Random {Fourier} Features
                 Do We Need?",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2611378",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study the sample complexity of random Fourier
                 features for online kernel learning, that is, the
                 number of random Fourier features required to achieve
                 good generalization performance. We show that when the
                 loss function is strongly convex and smooth, online
                 kernel learning with random Fourier features can
                 achieve an $O(\log T / T)$ bound for the excess risk
                 with only $O(1 / \lambda^2)$ random Fourier features,
                 where $T$ is the number of training examples and
                 $\lambda$ is the modulus of strong convexity. This is a
                 significant improvement compared to the existing result
                 for batch kernel learning that requires $O(T)$ random
                 Fourier features to achieve a generalization bound of
                 $O(1 / \sqrt{T})$. Our empirical study verifies that online
                 kernel learning with a limited number of random Fourier
                 features can achieve similar generalization performance
                 as online learning using full kernel matrix. We also
                 present an enhanced online learning algorithm with
                 random Fourier features that improves the
                 classification performance by multiple passes of
                 training examples and a partial average.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
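
For readers unfamiliar with the feature map under discussion, this is
the standard Rahimi--Recht construction that sample-complexity results
of this kind concern; the kernel choice, gamma, and feature count below
are illustrative assumptions, not taken from the article.

import numpy as np

def rff_features(X, n_features, gamma=1.0, seed=0):
    """Random Fourier feature map approximating the RBF kernel
    k(x, y) = exp(-gamma * ||x - y||^2):
        z(x) = sqrt(2 / D) * cos(W x + b),
    with rows of W drawn from N(0, 2*gamma*I) and b ~ Uniform[0, 2*pi],
    so that z(x) . z(y) approximates k(x, y)."""
    rng = np.random.default_rng(seed)
    d = X.shape[1]
    W = rng.normal(scale=np.sqrt(2 * gamma), size=(n_features, d))
    b = rng.uniform(0, 2 * np.pi, size=n_features)
    return np.sqrt(2.0 / n_features) * np.cos(X.dot(W.T) + b)

X = np.random.default_rng(2).normal(size=(5, 3))
Z = rff_features(X, n_features=2000, gamma=0.5)
approx = Z.dot(Z.T)
exact = np.exp(-0.5 * ((X[:, None] - X[None]) ** 2).sum(-1))
print(np.abs(approx - exact).max())  # small for D in the thousands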

@Article{Eyal:2014:PIM,
  author =       "Ron Eyal and Avi Rosenfeld and Sigal Sina and Sarit
                 Kraus",
  title =        "Predicting and Identifying Missing Node Information in
                 Social Networks",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "14:1--14:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2536775",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:23 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In recent years, social networks have surged in
                 popularity. One key aspect of social network research
                 is identifying important missing information that is
                 not explicitly represented in the network, or is not
                 visible to all. To date, this line of research
                 typically focused on finding the connections that are
                 missing between nodes, a challenge typically termed as
                 the link prediction problem. This article introduces
                 the missing node identification problem, where missing
                 members in the social network structure must be
                 identified. In this problem, indications of missing
                 nodes are assumed to exist. Given these indications and
                 a partial network, we must assess which indications
                 originate from the same missing node and determine the
                 full network structure. Toward solving this problem, we
                 present the missing node identification by spectral
                 clustering algorithm (MISC), an approach based on a
                 spectral clustering algorithm, combined with nodes'
                 pairwise affinity measures that were adopted from link
                 prediction research. We evaluate the performance of our
                 approach in different problem settings and scenarios,
                 using real-life data from Facebook. The results show
                 that our approach has beneficial results and can be
                 effective in solving the missing node identification
                 problem. In addition, this article also presents
                 R-MISC, which uses a sparse matrix representation,
                 efficient algorithms for calculating the nodes'
                 pairwise affinity, and a proprietary dimension
                 reduction technique to enable scaling the MISC
                 algorithm to large networks of more than 100,000 nodes.
                 Last, we consider problem settings where some of the
                 indications are unknown. Two algorithms are suggested
                 for this problem: speculative MISC, based on MISC, and
                 missing link completion, based on classical link
                 prediction literature. We show that speculative MISC
                 outperforms missing link completion.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
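
The clustering core of MISC can be pictured with an off-the-shelf
spectral step: given a pairwise affinity matrix over missing-node
indications (in the paper, built from link-prediction-style measures on
the placeholders' known neighbors), cluster the indications and read
each cluster as one presumed missing node. The affinity values and
cluster count below are toy assumptions.

import numpy as np
from sklearn.cluster import SpectralClustering

def group_indications(affinity, n_missing):
    """Spectral clustering of a precomputed indication-affinity matrix;
    indications with the same label are presumed to originate from the
    same missing node. A sketch of the MISC idea, not the full
    pipeline (e.g., no sparse R-MISC machinery)."""
    model = SpectralClustering(n_clusters=n_missing,
                               affinity="precomputed", random_state=0)
    return model.fit_predict(affinity)

# Toy affinity: indications 0-2 are mutually similar, as are 3-4.
A = np.array([[1.0, 0.9, 0.8, 0.1, 0.0],
              [0.9, 1.0, 0.7, 0.0, 0.1],
              [0.8, 0.7, 1.0, 0.1, 0.0],
              [0.1, 0.0, 0.1, 1.0, 0.9],
              [0.0, 0.1, 0.0, 0.9, 1.0]])
print(group_indications(A, n_missing=2))  # e.g., [0 0 0 1 1]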

@Article{Webb:2014:EDM,
  author =       "Geoffrey I. Webb and Jilles Vreeken",
  title =        "Efficient Discovery of the Most Interesting
                 Associations",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "15:1--15:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2601433",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Jun 26 05:48:23 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Self-sufficient itemsets have been proposed as an
                 effective approach to summarizing the key associations
                 in data. However, their computation appears highly
                 demanding, as assessing whether an itemset is
                 self-sufficient requires consideration of all pairwise
                 partitions of the itemset into pairs of subsets as well
                 as consideration of all supersets. This article
                 presents the first published algorithm for efficiently
                 discovering self-sufficient itemsets. This
                 branch-and-bound algorithm deploys two powerful pruning
                 mechanisms based on upper bounds on itemset value and
                 statistical significance level. It demonstrates that
                 finding top-$k$ productive and nonredundant itemsets,
                 with postprocessing to identify those that are not
                 independently productive, can efficiently identify
                 small sets of key associations. We present extensive
                 evaluation of the strengths and limitations of the
                 technique, including comparisons with alternative
                 approaches to finding the most interesting
                 associations.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Shabtai:2014:ODM,
  author =       "Asaf Shabtai and Maya Bercovitch and Lior Rokach and
                 Yuval Elovici",
  title =        "Optimizing Data Misuse Detection",
  journal =      j-TKDD,
  volume =       "8",
  number =       "3",
  pages =        "16:1--16:??",
  month =        jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2611520",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jun 3 13:50:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data misuse may be performed by entities such as an
                 organization's employees and business partners who are
                 granted access to sensitive information and misuse
                 their privileges. We assume that users can be either
                 trusted or untrusted. The access of untrusted parties
                 to data objects (e.g., client and patient records)
                 should be monitored in an attempt to detect misuse.
                 However, monitoring data objects is resource intensive
                 and time-consuming and may also cause disturbance or
                 inconvenience to the involved employees. Therefore, the
                 monitored data objects should be carefully selected. In
                 this article, we present two optimization problems
                 carefully designed for selecting specific data objects
                 for monitoring, such that the detection rate is
                 maximized and the monitoring effort is minimized. In
                 the first optimization problem, the goal is to select
                 data objects for monitoring that are accessed by at
                 most $c$ trusted agents while ensuring access to at
                 least $k$ monitored objects by each untrusted agent
                 (both $c$ and $k$ are integer variables). As opposed to the first
                 optimization problem, the goal of the second
                 optimization problem is to select monitored data
                 objects that maximize the number of monitored data
                 objects accessed by untrusted agents while ensuring
                 that each trusted agent does not access more than $d$
                 monitored data objects ($d$ is an integer variable as
                 well). Two efficient heuristic algorithms for solving
                 these optimization problems are proposed, and
                 experiments were conducted simulating different
                 scenarios to evaluate the algorithms' performance.
                 Moreover, we compared the heuristic algorithms'
                 performance to the optimal solution and conducted
                 sensitivity analysis on the three parameters ($c$, $k$,
                 and $d$) and on the ratio between the trusted and untrusted
                 agents.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
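
For the first optimization problem, a natural greedy heuristic (ours,
for illustration; the paper proposes its own two algorithms) repeatedly
monitors the eligible object that helps the most untrusted agents still
short of k monitored objects.

def greedy_monitor_selection(access, trusted, untrusted, c, k):
    """Greedy sketch of the first problem: restrict to objects read by
    at most c trusted agents, then repeatedly pick the object covering
    the most untrusted agents whose demand of k monitored objects is
    not yet met.
    access: dict object -> set of agents that access it."""
    eligible = {o for o, agents in access.items()
                if len(agents & trusted) <= c}
    need = {a: k for a in untrusted}
    chosen = []
    while any(n > 0 for n in need.values()) and eligible:
        best = max(eligible,
                   key=lambda o: sum(1 for a in access[o] & untrusted
                                     if need[a] > 0))
        gain = sum(1 for a in access[best] & untrusted if need[a] > 0)
        if gain == 0:
            break  # remaining demand cannot be met by eligible objects
        chosen.append(best)
        eligible.discard(best)
        for a in access[best] & untrusted:
            need[a] = max(0, need[a] - 1)
    return chosen

access = {"o1": {"t1", "u1", "u2"}, "o2": {"t1", "t2", "u1"},
          "o3": {"u2"}, "o4": {"t1", "u1"}}
print(greedy_monitor_selection(access, trusted={"t1", "t2"},
                               untrusted={"u1", "u2"}, c=1, k=2))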

@Article{Hernandez-Orallo:2014:PRC,
  author =       "Jos{\'e} Hern{\'a}ndez-Orallo",
  title =        "Probabilistic Reframing for Cost-Sensitive
                 Regression",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "17:1--17:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641758",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:02 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Common-day applications of predictive models usually
                 involve the full use of the available contextual
                 information. When the operating context changes, one
                 may fine-tune the by-default (incontextual) prediction
                 or may even abstain from predicting a value (a reject).
                 Global reframing solutions, where the same function is
                 applied to adapt the estimated outputs to a new cost
                 context, are possible solutions here. An alternative
                 approach, which has not been studied in a comprehensive
                 way for regression in the knowledge discovery and data
                 mining literature, is the use of a local (e.g.,
                 probabilistic) reframing approach, where decisions are
                 made according to the estimated output and a
                 reliability, confidence, or probability estimation. In
                 this article, we advocate for a simple two-parameter
                 (mean and variance) approach, working with a normal
                 conditional probability density. Given the conditional
                 mean produced by any regression technique, we develop
                 lightweight ``enrichment'' methods that produce good
                 estimates of the conditional variance, which are used
                 by the probabilistic (local) reframing methods. We
                 apply these methods to some very common families of
                 cost-sensitive problems, such as optimal predictions in
                 (auction) bids, asymmetric loss scenarios, and
                 rejection rules.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
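
A worked instance of the local reframing idea: under a linear asymmetric
(lin-lin) loss and a normal conditional density, the optimal point
prediction is a quantile of that density, so only the conditional mean
and variance are needed. This textbook case (our choice of loss, not the
article's full catalogue of cost contexts) shows why a two-parameter
approach can suffice.

from scipy.stats import norm

def reframed_prediction(mu, sigma, alpha, beta):
    """Optimal point prediction under lin-lin loss, paying alpha per
    unit of underprediction and beta per unit of overprediction, when
    the conditional distribution is N(mu, sigma^2): the
    alpha / (alpha + beta) quantile of that distribution."""
    return norm.ppf(alpha / (alpha + beta), loc=mu, scale=sigma)

# Underprediction three times as costly as overprediction: shift upward.
print(reframed_prediction(mu=100.0, sigma=10.0, alpha=3.0, beta=1.0))
# -> ~106.74, versus the incontextual prediction 100.0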

@Article{Miettinen:2014:MMD,
  author =       "Pauli Miettinen and Jilles Vreeken",
  title =        "{MDL4BMF}: Minimum Description Length for {Boolean}
                 Matrix Factorization",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2601437",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Matrix factorizations-where a given data matrix is
                 approximated by a product of two or more factor
                 matrices-are powerful data mining tools. Among other
                 tasks, matrix factorizations are often used to separate
                 global structure from noise. This, however, requires
                 solving the ``model order selection problem'' of
                 determining the proper rank of the factorization, that
                 is, to answer where fine-grained structure stops, and
                 where noise starts. Boolean Matrix Factorization
                 (BMF), where data, factors, and matrix product are
                 Boolean, has in recent years received increased
                 attention from the data mining community. The technique
                 has desirable properties, such as high interpretability
                 and natural sparsity. Yet, so far no method for
                 selecting the correct model order for BMF has been
                 available. In this article, we propose the use of the
                 Minimum Description Length (MDL) principle for this
                 task. Besides solving the problem, this well-founded
                 approach has numerous benefits; for example, it is
                 automatic, does not require a likelihood function, is
                 fast, and, as experiments show, is highly accurate. We
                 formulate the description length function for BMF in
                 general-making it applicable for any BMF algorithm. We
                 discuss how to construct an appropriate encoding:
                 starting from a simple and intuitive approach, we
                 arrive at a highly efficient data-to-model--based
                 encoding for BMF. We extend an existing algorithm for
                 BMF to use MDL to identify the best Boolean matrix
                 factorization, analyze the complexity of the problem,
                 and perform an extensive experimental evaluation to
                 study its behavior.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
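
A deliberately naive two-part MDL score makes the model-order idea
concrete: total bits for the Boolean factor matrices plus bits for the
XOR error matrix, each under a simple Bernoulli code. The paper's
data-to-model encoding is substantially more careful; this sketch only
shows how a description length can compare candidate ranks.

import numpy as np

def naive_bmf_description_length(X, B, C):
    """Naive two-part MDL score for a Boolean factorization X ~ B o C
    (Boolean matrix product): L(B) + L(C) + L(X xor (B o C)), with
    L(M) = |M| * H(density) for each Boolean matrix M."""
    def bern_bits(M):
        p = M.mean()
        if p in (0.0, 1.0):
            return 0.0
        h = -(p * np.log2(p) + (1 - p) * np.log2(1 - p))
        return M.size * h
    recon = B.astype(int).dot(C.astype(int)) > 0   # Boolean product
    error = recon ^ (X > 0)                        # XOR residual
    return bern_bits(B) + bern_bits(C) + bern_bits(error)

rng = np.random.default_rng(3)
B_true = rng.random((30, 2)) < 0.4
C_true = rng.random((2, 20)) < 0.4
X = B_true.astype(int).dot(C_true.astype(int)) > 0
# Compare this score across factorizations of different ranks to pick
# the model order with the shortest total description.
print(naive_bmf_description_length(X, B_true, C_true))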

@Article{Tang:2014:FSS,
  author =       "Jiliang Tang and Huan Liu",
  title =        "Feature Selection for Social Media Data",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629587",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Feature selection is widely used in preparing
                 high-dimensional data for effective data mining. The
                 explosive popularity of social media produces massive
                 and high-dimensional data at an unprecedented rate,
                 presenting new challenges to feature selection. Social
                 media data consists of (1) traditional
                 high-dimensional, attribute-value data such as posts,
                 tweets, comments, and images, and (2) linked data that
                 provides social context for posts and describes the
                 relationships between social media users as well as who
                 generates the posts, and so on. The nature of social
                 media also determines that its data is massive, noisy,
                 and incomplete, which exacerbates the already
                 challenging problem of feature selection. In this
                 article, we study a novel feature selection problem of
                 selecting features for social media data with its
                 social context. In detail, we illustrate the
                 differences between attribute-value data and social
                 media data, and investigate whether linked data can be exploited
                 in a new feature selection framework by taking
                 advantage of social science theories. We design and
                 conduct experiments on datasets from real-world social
                 media Web sites, and the empirical results demonstrate
                 that the proposed framework can significantly improve
                 the performance of feature selection. Further
                 experiments are conducted to evaluate the effects of
                 user--user and user--post relationships manifested in
                 linked data on feature selection, and research issues
                 for future work will be discussed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Riondato:2014:EDA,
  author =       "Matteo Riondato and Eli Upfal",
  title =        "Efficient Discovery of Association Rules and Frequent
                 Itemsets through Sampling with Tight Performance
                 Guarantees",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629586",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The tasks of extracting (top-$K$) Frequent Itemsets
                 (FIs) and Association Rules (ARs) are fundamental
                 primitives in data mining and database applications.
                 Exact algorithms for these problems exist and are
                 widely used, but their running time is hindered by the
                 need of scanning the entire dataset, possibly multiple
                 times. High-quality approximations of FIs and ARs are
                 sufficient for most practical uses. Sampling techniques
                 can be used for fast discovery of approximate
                 solutions, but works exploring this technique did not
                 provide satisfactory performance guarantees on the
                 quality of the approximation due to the difficulty of
                 bounding the probability of under- or oversampling any
                 one of an unknown number of frequent itemsets. We
                 circumvent this issue by applying the statistical
                 concept of Vapnik--Chervonenkis (VC) dimension to
                 develop a novel technique for providing tight bounds on
                 the sample size that guarantees approximation of the
                 (top-$K$) FIs and ARs within user-specified parameters.
                 The resulting sample size is linearly dependent on the
                 VC-dimension of a range space associated with the
                 dataset. We analyze the VC-dimension of this range
                 space and show that it is upper bounded by an
                  easy-to-compute characteristic quantity of the
                  dataset, the $d$-index, namely, the maximum integer
                  $d$ such that the dataset contains at least $d$
                  transactions of length at least $d$, none of which
                  is a superset of or equal to another. We show that
                  this bound is tight for
                 a large class of datasets. The resulting sample size is
                 a significant improvement over previous known results.
                 We present an extensive experimental evaluation of our
                 technique on real and artificial datasets,
                 demonstrating the practicality of our methods, and
                 showing that they achieve even higher quality
                 approximations than what is guaranteed by the
                 analysis.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
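
%%% The $d$-index defined in the abstract above lends itself to a
%%% direct computation.  The following Python sketch (not the
%%% authors' implementation) computes the h-index-style bound from
%%% the transaction lengths alone; the exact $d$-index additionally
%%% requires that none of the chosen transactions is a superset of,
%%% or equal to, another, so the sketch returns an upper bound.
%%%
%%%     def d_index_upper_bound(transactions):
%%%         # Largest d with at least d transactions of length >= d.
%%%         lengths = sorted((len(t) for t in transactions),
%%%                          reverse=True)
%%%         d = 0
%%%         for i, ell in enumerate(lengths, start=1):
%%%             if ell >= i:
%%%                 d = i
%%%             else:
%%%                 break
%%%         return d
%%%
%%%     # Three of the four transactions have length >= 3, and none
%%%     # of those three contains another, so the bound is tight.
%%%     print(d_index_upper_bound([{1, 2, 3}, {2, 3, 4},
%%%                                {1, 4, 5, 6}, {7}]))   # -> 3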

@Article{Burton:2014:DSC,
  author =       "Scott H. Burton and Christophe G. Giraud-Carrier",
  title =        "Discovering Social Circles in Directed Graphs",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "21:1--21:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641759",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:02 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We examine the problem of identifying social circles,
                 or sets of cohesive and mutually aware nodes
                 surrounding an initial query set, in directed graphs
                 where the complete graph is not known beforehand. This
                 problem differs from local community mining, in that
                 the query set defines the circle of interest. We
                 explicitly handle edge direction, as in many cases
                 relationships are not symmetric, and focus on the local
                 context because many real-world graphs cannot be
                 feasibly known. We outline several issues that are
                 unique to this context, introduce a quality function to
                 measure the value of including a particular node in an
                 emerging social circle, and describe a greedy social
                 circle discovery algorithm. We demonstrate the
                 effectiveness of this approach on artificial
                 benchmarks, large networks with topical community
                 labels, and several real-world case studies.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Paul:2014:RPL,
  author =       "Saurabh Paul and Christos Boutsidis and Malik
                 Magdon-Ismail and Petros Drineas",
  title =        "Random Projections for Linear Support Vector
                 Machines",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "22:1--22:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641760",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:45:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Let $X$ be a data matrix of rank $ \rho $, whose rows
                 represent $n$ points in $d$-dimensional space. The
                 linear support vector machine constructs a hyperplane
                 separator that maximizes the 1-norm soft margin. We
                 develop a new oblivious dimension reduction technique
                 that is precomputed and can be applied to any input
                 matrix $X$. We prove that, with high probability, the
                 margin and minimum enclosing ball in the feature space
                 are preserved to within $ \epsilon $-relative error,
                 ensuring comparable generalization as in the original
                 space in the case of classification. For regression, we
                 show that the margin is preserved to $ \epsilon
                 $-relative error with high probability. We present
                 extensive experiments with real and synthetic data to
                 support our theory.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
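
%%% The pipeline in the abstract above (project the rows of $X$ with
%%% a data-independent random matrix, then train a linear SVM in the
%%% reduced space) can be sketched with off-the-shelf tools.  The
%%% dense Gaussian projection and the toy data below are
%%% illustrative assumptions; the paper analyzes specific
%%% constructions and proves the stated error bounds.
%%%
%%%     import numpy as np
%%%     from sklearn.random_projection import GaussianRandomProjection
%%%     from sklearn.svm import LinearSVC
%%%
%%%     rng = np.random.default_rng(0)
%%%     X = rng.normal(size=(500, 2000))   # n = 500 points, d = 2000
%%%     y = (X[:, :10].sum(axis=1) > 0).astype(int)
%%%
%%%     # Oblivious dimension reduction: the projection is drawn
%%%     # without looking at X and could be precomputed.
%%%     proj = GaussianRandomProjection(n_components=128,
%%%                                     random_state=0)
%%%     X_low = proj.fit_transform(X)
%%%
%%%     clf = LinearSVC(C=1.0).fit(X_low, y)   # 1-norm soft margin
%%%     print("training accuracy:", clf.score(X_low, y))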

@Article{Erdo:2014:RGN,
  author =       "D{\'o}ra Erd{\H{o}}s and Rainer Gemulla and Evimaria
                 Terzi",
  title =        "Reconstructing Graphs from Neighborhood Data",
  journal =      j-TKDD,
  volume =       "8",
  number =       "4",
  pages =        "23:1--23:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641761",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:02 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Consider a social network and suppose that we are only
                 given the number of common friends between each pair of
                 users. Can we reconstruct the underlying network?
                 Similarly, consider a set of documents and the words
                 that appear in them. If we only know the number of
                 common words for every pair of documents, as well as
                 the number of common documents for every pair of words,
                 can we infer which words appear in which documents? In
                 this article, we develop a general methodology for
                 answering questions like these. We formalize these
                 questions in what we call the {\em R}econstruct
                 problem: given information about the common neighbors
                 of nodes in a network, our goal is to reconstruct the
                 hidden binary matrix that indicates the presence or
                 absence of relationships between individual nodes. In
                 fact, we propose two different variants of this
                 problem: one where the number of connections of every
                 node (i.e., the degree of every node) is known and a
                 second one where it is unknown. We call these variants
                 the degree-aware and the degree-oblivious versions of
                 the Reconstruct problem, respectively. Our algorithms
                 for both variants exploit the properties of the
                 singular value decomposition of the hidden binary
                 matrix. More specifically, we show that using the
                 available neighborhood information, we can reconstruct
                 the hidden matrix by finding the components of its
                 singular value decomposition and then combining them
                 appropriately. Our extensive experimental study
                 suggests that our methods are able to reconstruct
                 binary matrices of different characteristics with up to
                 100\% accuracy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
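
%%% In the degree-aware variant described above, the common-neighbor
%%% counts (off-diagonal) plus the node degrees (diagonal) of an
%%% undirected, unweighted graph form exactly the matrix $ A^2 $, so
%%% reconstruction amounts to taking a square root of $ A^2 $ in its
%%% eigenbasis; choosing the sign of each root is the crux that the
%%% paper's algorithms address.  A naive all-positive-sign sketch
%%% (NumPy only; not the authors' method) shows the structure:
%%%
%%%     import numpy as np
%%%
%%%     rng = np.random.default_rng(1)
%%%     n = 30
%%%     A = (rng.random((n, n)) < 0.2).astype(float)
%%%     A = np.triu(A, 1); A = A + A.T     # hidden 0/1 adjacency
%%%
%%%     M = A @ A   # off-diagonal: common neighbors; diag: degrees
%%%
%%%     w, V = np.linalg.eigh(M)           # M = V diag(w) V^T, w >= 0
%%%     w = np.clip(w, 0.0, None)          # clip numerical noise
%%%     # Every valid reconstruction is V diag(s * sqrt(w)) V^T for
%%%     # signs s in {-1, +1}; we naively take all +1 and round.
%%%     A_hat = V @ np.diag(np.sqrt(w)) @ V.T
%%%     A_bin = (A_hat > 0.5).astype(float)
%%%     np.fill_diagonal(A_bin, 0)
%%%     print("entrywise agreement:", (A_bin == A).mean())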

@Article{Acharya:2014:OFC,
  author =       "Ayan Acharya and Eduardo R. Hruschka and Joydeep Ghosh
                 and Sreangsu Acharyya",
  title =        "An Optimization Framework for Combining Ensembles of
                 Classifiers and Clusterers with Applications to
                 Nontransductive Semisupervised Learning and Transfer
                 Learning",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2601435",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Unsupervised models can provide supplementary soft
                 constraints to help classify new ``target'' data
                 because similar instances in the target set are more
                 likely to share the same class label. Such models can
                 also help detect possible differences between training
                 and target distributions, which is useful in
                 applications where concept drift may take place, as in
                 transfer learning settings. This article describes a
                 general optimization framework that takes as input
                 class membership estimates from existing classifiers
                 learned on previously encountered ``source'' (or
                 training) data, as well as a similarity matrix from a
                 cluster ensemble operating solely on the target (or
                 test) data to be classified, and yields a consensus
                 labeling of the target data. More precisely, the
                 application settings considered are nontransductive
                 semisupervised and transfer learning scenarios where
                 the training data are used only to build an ensemble of
                 classifiers and are subsequently discarded before
                 classifying the target data. The framework admits a
                 wide range of loss functions and
                 classification/clustering methods. It exploits
                 properties of Bregman divergences in conjunction with
                 Legendre duality to yield a principled and scalable
                 approach. A variety of experiments show that the
                 proposed framework can yield results substantially
                 superior to those provided by na{\"\i}vely applying
                 classifiers learned on the original task to the target
                  data. In addition, we show that the proposed
                  approach, even though it is not conceptually
                  transductive, can provide better results than some
                  popular transductive learning techniques.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Boedihardjo:2014:FEL,
  author =       "Arnold P. Boedihardjo and Chang-Tien Lu and Bingsheng
                 Wang",
  title =        "A Framework for Exploiting Local Information to
                 Enhance Density Estimation of Data Streams",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629618",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The Probability Density Function (PDF) is the
                 fundamental data model for a variety of stream mining
                 algorithms. Existing works apply the standard
                 nonparametric Kernel Density Estimator (KDE) to
                 approximate the PDF of data streams. As a result, the
                 stream-based KDEs cannot accurately capture complex
                 local density features. In this article, we propose the
                 use of Local Region (LRs) to model local density
                 information in univariate data streams. In-depth
                 theoretical analyses are presented to justify the
                 effectiveness of the LR-based KDE. Based on the
                 analyses, we develop the General Local rEgion AlgorithM
                 (GLEAM) to enhance the estimation quality of
                 structurally complex univariate distributions for
                 existing stream-based KDEs. A set of algorithmic
                 optimizations is designed to improve the query
                 throughput of GLEAM and to achieve its linear order
                 computation. Additionally, a comprehensive suite of
                 experiments was conducted to test the effectiveness and
                 efficiency of GLEAM.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
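
%%% The baseline that GLEAM refines is a standard nonparametric KDE
%%% maintained over recent stream elements.  A minimal
%%% sliding-window Gaussian KDE for a univariate stream (window
%%% size, bandwidth h, and query grid are illustrative assumptions;
%%% the local-region machinery of the paper is not shown):
%%%
%%%     import numpy as np
%%%
%%%     def kde_pdf(window, x, h):
%%%         # Gaussian kernel density estimate at points x, using
%%%         # the current window of stream values and bandwidth h.
%%%         window = np.asarray(window, dtype=float)
%%%         z = (np.asarray(x) - window[:, None]) / h
%%%         k = np.exp(-0.5 * z * z)
%%%         return k.sum(axis=0) / (len(window) * h
%%%                                 * np.sqrt(2.0 * np.pi))
%%%
%%%     window = np.random.default_rng(0).normal(size=500)
%%%     print(kde_pdf(window, np.linspace(-3, 3, 7), h=0.3))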

@Article{Ordonez:2014:BVS,
  author =       "Carlos Ordonez and Carlos Garcia-Alvarado and
                 Veerabhadaran Baladandayuthapani",
  title =        "{Bayesian} Variable Selection in Linear Regression in
                 One Pass for Large Datasets",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629617",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Bayesian models are generally computed with Markov
                 Chain Monte Carlo (MCMC) methods. The main disadvantage
                 of MCMC methods is the large number of iterations they
                 need to sample the posterior distributions of model
                 parameters, especially for large datasets. On the other
                 hand, variable selection remains a challenging problem
                 due to its combinatorial search space, where Bayesian
                 models are a promising solution. In this work, we study
                 how to accelerate Bayesian model computation for
                 variable selection in linear regression. We propose a
                 fast Gibbs sampler algorithm, a widely used MCMC method
                 that incorporates several optimizations. We use a
                 Zellner prior for the regression coefficients, an
                 improper prior on variance, and a conjugate prior
                 Gaussian distribution, which enable dataset
                 summarization in one pass, thus exploiting an augmented
                 set of sufficient statistics. Thereafter, the algorithm
                 iterates in main memory. Sufficient statistics are
                 indexed with a sparse binary vector to efficiently
                 compute matrix projections based on selected variables.
                  The probabilities of discovered variable subsets,
                  selecting and discarding each variable, are stored
                  in a hash table for fast retrieval in future
                  iterations. We study
                 how to integrate our algorithm into a Database
                 Management System (DBMS), exploiting aggregate
                 User-Defined Functions for parallel data summarization
                 and stored procedures to manipulate matrices with
                 arrays. An experimental evaluation with real datasets
                 evaluates accuracy and time performance, comparing our
                 DBMS-based algorithm with the R package. Our algorithm
                 is shown to produce accurate results, scale linearly on
                 dataset size, and run orders of magnitude faster than
                 the R package.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
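
%%% The one-pass design described above works because the Gibbs
%%% sampler only ever needs sufficient statistics, gathered in a
%%% single scan, after which each variable subset corresponds to an
%%% index selection in memory.  A plain-Python sketch of the
%%% summarization pass (the paper implements it as aggregate UDFs
%%% inside the DBMS):
%%%
%%%     import numpy as np
%%%
%%%     def summarize(stream, d):
%%%         # One pass over (x, y) records; returns n, X'X, X'y, y'y.
%%%         n, yty = 0, 0.0
%%%         XtX = np.zeros((d, d))
%%%         Xty = np.zeros(d)
%%%         for x, y in stream:
%%%             x = np.asarray(x, dtype=float)
%%%             n += 1
%%%             XtX += np.outer(x, x)
%%%             Xty += y * x
%%%             yty += y * y
%%%         return n, XtX, Xty, yty
%%%
%%%     # Inside the sampler, a subset g (boolean mask) needs only
%%%     # X_g'X_g == XtX[np.ix_(g, g)] and X_g'y == Xty[g].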

@Article{Fei:2014:SSB,
  author =       "Hongliang Fei and Jun Huan",
  title =        "Structured Sparse Boosting for Graph Classification",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629328",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Boosting is a highly effective algorithm that produces
                 a linear combination of weak classifiers (a.k.a. base
                 learners) to obtain high-quality classification models.
                 In this article, we propose a generalized logit boost
                 algorithm in which base learners have structural
                 relationships in the functional space. Although such
                 relationships are generic, our work is particularly
                 motivated by the emerging topic of pattern-based
                 classification for semistructured data including
                 graphs. Toward an efficient incorporation of the
                 structure information, we have designed a general model
                 in which we use an undirected graph to capture the
                 relationship of subgraph-based base learners. In our
                  method, we employ both $L_1$ and Laplacian-based $L_2$
                 regularization to logit boosting to achieve model
                 sparsity and smoothness in the functional space spanned
                 by the base learners. We have derived efficient
                 optimization algorithms based on coordinate descent for
                 the new boosting formulation and theoretically prove
                 that it exhibits a natural grouping effect for nearby
                 spatial or overlapping base learners and that the
                 resulting estimator is consistent. Additionally,
                 motivated by the connection between logit boosting and
                 logistic regression, we extend our structured sparse
                 regularization framework to logistic regression for
                  vectorial data in which features are structured.
                  Using a comprehensive experimental study and
                  comparing our work
                 with the state-of-the-art, we have demonstrated the
                 effectiveness of the proposed learning method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
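
%%% The objective described above combines the logistic loss of a
%%% linear combination of base learners with an $L_1$ sparsity term
%%% and a graph-Laplacian smoothness term.  A minimal evaluation of
%%% that objective (H holds base-learner outputs, L the Laplacian of
%%% the base-learner relation graph, y in {-1, +1}; the paper
%%% minimizes it by coordinate descent inside a logit-boosting
%%% loop):
%%%
%%%     import numpy as np
%%%
%%%     def penalized_objective(beta, H, y, L, lam1, lam2):
%%%         # Mean logistic loss of the ensemble score H @ beta,
%%%         # plus l1 sparsity and Laplacian smoothness penalties.
%%%         margin = y * (H @ beta)
%%%         loss = np.logaddexp(0.0, -margin).mean()
%%%         return (loss + lam1 * np.abs(beta).sum()
%%%                 + lam2 * beta @ L @ beta)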

@Article{Xu:2014:GGB,
  author =       "Zhiqiang Xu and Yiping Ke and Yi Wang and Hong Cheng
                 and James Cheng",
  title =        "{GBAGC}: a General {Bayesian} Framework for Attributed
                 Graph Clustering",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629616",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Graph clustering, also known as community detection,
                 is a long-standing problem in data mining. In recent
                 years, with the proliferation of rich attribute
                 information available for objects in real-world graphs,
                 how to leverage not only structural but also attribute
                 information for clustering attributed graphs becomes a
                 new challenge. Most existing works took a
                 distance-based approach. They proposed various distance
                 measures to fuse structural and attribute information
                 and then applied standard techniques for graph
                 clustering based on these distance measures. In this
                 article, we take an alternative view and propose a
                 novel Bayesian framework for attributed graph
                 clustering. Our framework provides a general and
                 principled solution to modeling both the structural and
                 the attribute aspects of a graph. It avoids the
                 artificial design of a distance measure in existing
                 methods and, furthermore, can seamlessly handle graphs
                 with different types of edges and vertex attributes. We
                 develop an efficient variational method for graph
                 clustering under this framework and derive two concrete
                 algorithms for clustering unweighted and weighted
                 attributed graphs. Experimental results on large
                 real-world datasets show that our algorithms
                 significantly outperform the state-of-the-art
                 distance-based method, in terms of both effectiveness
                 and efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Coscia:2014:UHO,
  author =       "Michele Coscia and Giulio Rossetti and Fosca Giannotti
                 and Dino Pedreschi",
  title =        "Uncovering Hierarchical and Overlapping Communities
                 with a Local-First Approach",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629511",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Aug 26 17:49:05 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Community discovery in complex networks is the task of
                 organizing a network's structure by grouping together
                 nodes related to each other. Traditional approaches are
                 based on the assumption that there is a global-level
                 organization in the network. However, in many
                 scenarios, each node is the bearer of complex
                 information and cannot be classified in disjoint
                 clusters. The top-down global view of the partition
                 approach is not designed for this. Here, we represent
                 this complex information as multiple latent labels, and
                 we postulate that edges in the networks are created
                 among nodes carrying similar labels. The latent labels
                 are the communities a node belongs to and we discover
                 them with a simple local-first approach to community
                 discovery. This is achieved by democratically letting
                 each node vote for the communities it sees surrounding
                 it in its limited view of the global system, its ego
                 neighborhood, using a label propagation algorithm,
                 assuming that each node is aware of the label it shares
                 with each of its connections. The local communities are
                 merged hierarchically, unveiling the modular
                 organization of the network at the global level and
                 identifying overlapping groups and groups of groups. We
                  tested this intuition against state-of-the-art
                  overlapping community discovery methods and found
                  that our new method improves the quality of the
                  obtained communities in the chosen scenarios. We
                  perform a test on
                 benchmark and on real-world networks, evaluating the
                 quality of the community coverage by using the
                 extracted communities to predict the metadata attached
                 to the nodes, which we consider external information
                  about the latent labels. We also explain why
                  real-world networks contain overlapping communities
                  and how our logic is able to capture them. Finally,
                  we show that our method is deterministic and
                  incremental and has limited time complexity, so
                  that it can be used on real-world scale networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
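
%%% The voting step at the heart of the local-first approach is
%%% plain label propagation, run inside each node's ego
%%% neighborhood and followed by hierarchical merging of the local
%%% communities.  A sketch of the propagation step alone (pure
%%% Python, adjacency as a dict; ego-network extraction and merging
%%% are omitted):
%%%
%%%     import random
%%%
%%%     def label_propagation(adj, iters=10, seed=0):
%%%         # Each node repeatedly adopts the most frequent label
%%%         # among its neighbors (ties broken at random).
%%%         rnd = random.Random(seed)
%%%         labels = {v: v for v in adj}
%%%         for _ in range(iters):
%%%             order = list(adj)
%%%             rnd.shuffle(order)
%%%             for v in order:
%%%                 counts = {}
%%%                 for u in adj[v]:
%%%                     counts[labels[u]] = counts.get(labels[u], 0) + 1
%%%                 if counts:
%%%                     best = max(counts.values())
%%%                     labels[v] = rnd.choice(
%%%                         [l for l, c in counts.items() if c == best])
%%%         return labels
%%%
%%%     # Two triangles bridged by one edge; propagation typically
%%%     # settles on one label per triangle.
%%%     adj = {0: [1, 2], 1: [0, 2], 2: [0, 1, 3],
%%%            3: [2, 4, 5], 4: [3, 5], 5: [3, 4]}
%%%     print(label_propagation(adj))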

@Article{Wang:2014:GML,
  author =       "Guangtao Wang and Qinbao Song and Xueying Zhang and
                 Kaiyuan Zhang",
  title =        "A Generic Multilabel Learning-Based Classification
                 Algorithm Recommendation Method",
  journal =      j-TKDD,
  volume =       "9",
  number =       "1",
  pages =        "7:1--7:??",
  month =        oct,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629474",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Oct 10 17:19:10 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "As more and more classification algorithms continue to
                 be developed, recommending appropriate algorithms to a
                 given classification problem is increasingly important.
                 This article first distinguishes the algorithm
                 recommendation methods by two dimensions: (1)
                 meta-features, which are a set of measures used to
                 characterize the learning problems, and (2)
                 meta-target, which represents the relative performance
                 of the classification algorithms on the learning
                 problem. In contrast to the existing algorithm
                 recommendation methods whose meta-target is usually in
                 the form of either the ranking of candidate algorithms
                 or a single algorithm, this article proposes a new and
                 natural multilabel form to describe the meta-target.
                  This reflects the fact that, in practice, multiple
                  algorithms may be appropriate for a given problem.
                  Furthermore, a novel multilabel
                 learning-based generic algorithm recommendation method
                 is proposed, which views the algorithm recommendation
                 as a multilabel learning problem and solves the problem
                 by the mature multilabel learning algorithms. To
                 evaluate the proposed multilabel learning-based
                 recommendation method, extensive experiments with 13
                  well-known classification algorithms, two kinds of
                  meta-targets (algorithm ranking and single
                  algorithm), and five different kinds of meta-features
                 are conducted on 1,090 benchmark learning problems. The
                 results show the effectiveness of our proposed
                 multilabel learning-based recommendation method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2014:EEM,
  author =       "Pinghui Wang and John C. S. Lui and Bruno Ribeiro and
                 Don Towsley and Junzhou Zhao and Xiaohong Guan",
  title =        "Efficiently Estimating Motif Statistics of Large
                 Networks",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629564",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Exploring statistics of locally connected subgraph
                 patterns (also known as network motifs) has helped
                 researchers better understand the structure and
                 function of biological and Online Social Networks
                 (OSNs). Nowadays, the massive size of some critical
                 networks-often stored in already overloaded relational
                 databases-effectively limits the rate at which nodes
                 and edges can be explored, making it a challenge to
                 accurately discover subgraph statistics. In this work,
                 we propose sampling methods to accurately estimate
                 subgraph statistics from as few queried nodes as
                 possible. We present sampling algorithms that
                 efficiently and accurately estimate subgraph properties
                 of massive networks. Our algorithms require no
                 precomputation or complete network topology
                 information. At the same time, we provide theoretical
                 guarantees of convergence. We perform experiments using
                 widely known datasets and show that, for the same
                 accuracy, our algorithms require an order of magnitude
                 less queries (samples) than the current
                 state-of-the-art algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zheng:2014:FHE,
  author =       "Li Zheng and Tao Li and Chris Ding",
  title =        "A Framework for Hierarchical Ensemble Clustering",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2611380",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Ensemble clustering, as an important extension of the
                 clustering problem, refers to the problem of combining
                 different (input) clusterings of a given dataset to
                 generate a final (consensus) clustering that is a
                 better fit in some sense than existing clusterings.
                 Over the past few years, many ensemble clustering
                 approaches have been developed. However, most of them
                 are designed for partitional clustering methods, and
                 few research efforts have been reported for ensemble
                 hierarchical clustering methods. In this article, a
                 hierarchical ensemble clustering framework that can
                 naturally combine both partitional clustering and
                 hierarchical clustering results is proposed. In
                 addition, a novel method for learning the ultra-metric
                 distance from the aggregated distance matrices and
                 generating final hierarchical clustering with enhanced
                 cluster separation is developed based on the
                 ultra-metric distance for hierarchical clustering. We
                 study three important problems: dendrogram description,
                 dendrogram combination, and dendrogram selection. We
                 develop two approaches for dendrogram selection based
                 on tree distances, and we investigate various
                 dendrogram distances for representing dendrograms. We
                 provide a systematic empirical study of the ensemble
                 hierarchical clustering problem. Experimental results
                 demonstrate the effectiveness of our proposed
                 approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Huai:2014:TPC,
  author =       "Baoxing Huai and Enhong Chen and Hengshu Zhu and Hui
                 Xiong and Tengfei Bao and Qi Liu and Jilei Tian",
  title =        "Toward Personalized Context Recognition for Mobile
                 Users: a Semisupervised {Bayesian} {HMM} Approach",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629504",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The problem of mobile context recognition targets the
                  identification of the semantic meaning of context
                  in a mobile environment. This plays an important
                  role in
                 understanding mobile user behaviors and thus provides
                 the opportunity for the development of better
                 intelligent context-aware services. A key step of
                 context recognition is to model the personalized
                 contextual information of mobile users. Although many
                 studies have been devoted to mobile context modeling,
                 limited efforts have been made on the exploitation of
                 the sequential and dependency characteristics of mobile
                 contextual information. Also, the latent semantics
                 behind mobile context are often ambiguous and poorly
                 understood. Indeed, a promising direction is to
                 incorporate some domain knowledge of common contexts,
                 such as ``waiting for a bus'' or ``having dinner,'' by
                 modeling both labeled and unlabeled context data from
                 mobile users because there are often few labeled
                 contexts available in practice. To this end, in this
                 article, we propose a sequence-based semisupervised
                 approach to modeling personalized context for mobile
                 users. Specifically, we first exploit the Bayesian
                 Hidden Markov Model (B-HMM) for modeling context in the
                 form of probabilistic distributions and transitions of
                 raw context data. Also, we propose a sequential model
                 by extending B-HMM with the prior knowledge of
                 contextual features to model context more accurately.
                 Then, to efficiently learn the parameters and initial
                 values of the proposed models, we develop a novel
                 approach for parameter estimation by integrating the
                 Dirichlet Process Mixture (DPM) model and the Mixture
                 Unigram (MU) model. Furthermore, by incorporating both
                 user-labeled and unlabeled data, we propose a
                 semisupervised learning-based algorithm to identify and
                 model the latent semantics of context. Finally,
                 experimental results on real-world data clearly
                 validate both the efficiency and effectiveness of the
                 proposed approaches for recognizing personalized
                 context of mobile users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
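
%%% The backbone of the approach above is an HMM over context
%%% sequences; the Bayesian priors, the DPM/MU initialization, and
%%% the semisupervision are layered on top of it.  A sketch of the
%%% scaled forward recursion that scores a discrete observation
%%% sequence under fixed, assumed-known parameters (pi, A, B):
%%%
%%%     import numpy as np
%%%
%%%     def forward_loglik(pi, A, B, obs):
%%%         # Scaled forward algorithm: log P(obs | pi, A, B) for a
%%%         # discrete-emission HMM (states = latent contexts).
%%%         alpha = pi * B[:, obs[0]]
%%%         ll = 0.0
%%%         for o in obs[1:]:
%%%             c = alpha.sum(); ll += np.log(c); alpha = alpha / c
%%%             alpha = (alpha @ A) * B[:, o]
%%%         return ll + np.log(alpha.sum())
%%%
%%%     pi = np.array([0.6, 0.4])
%%%     A = np.array([[0.7, 0.3], [0.2, 0.8]])   # state transitions
%%%     B = np.array([[0.9, 0.1], [0.3, 0.7]])   # emission probs
%%%     print(forward_loglik(pi, A, B, [0, 0, 1, 1]))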

@Article{Liu:2014:ADI,
  author =       "Siyuan Liu and Lei Chen and Lionel M. Ni",
  title =        "Anomaly Detection from Incomplete Data",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629668",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Anomaly detection (a.k.a., outlier or burst detection)
                 is a well-motivated problem and a major data mining and
                 knowledge discovery task. In this article, we study the
                 problem of population anomaly detection, one of the key
                 issues related to event monitoring and population
                 management within a city. Through studying detected
                 population anomalies, we can trace and analyze these
                  anomalies, which could help model city traffic
                  design and support event impact analysis and
                  prediction.
                 Although a significant and interesting issue, it is
                 very hard to detect population anomalies and retrieve
                 anomaly trajectories, especially given that it is
                 difficult to get actual and sufficient population data.
                 To address the difficulties of a lack of real
                 population data, we take advantage of mobile phone
                 networks, which offer enormous spatial and temporal
                 communication data on persons. More importantly, we
                 claim that we can utilize these mobile phone data to
                 infer and approximate population data. Thus, we can
                 study the population anomaly detection problem by
                  taking advantage of unique features hidden in mobile
                 phone data. In this article, we present a system to
                 conduct Population Anomaly Detection (PAD). First, we
                 propose an effective clustering method,
                 correlation-based clustering, to cluster the incomplete
                 location information from mobile phone data (i.e., from
                 mobile call volume distribution to population density
                 distribution). Then, we design an adaptive
                 parameter-free detection method, R-scan, to capture the
                 distributed dynamic anomalies. Finally, we devise an
                 efficient algorithm, BT-miner, to retrieve anomaly
                 trajectories. The experimental results from real-life
                 mobile phone data confirm the effectiveness and
                 efficiency of the proposed algorithms. Finally, the
                 proposed methods are realized as a pilot system in a
                 city in China.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Gundecha:2014:UVR,
  author =       "Pritam Gundecha and Geoffrey Barbier and Jiliang Tang
                 and Huan Liu",
  title =        "User Vulnerability and Its Reduction on a Social
                 Networking Site",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2630421",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Privacy and security are major concerns for many users
                 of social media. When users share information (e.g.,
                 data and photos) with friends, they can make their
                 friends vulnerable to security and privacy breaches
                 with dire consequences. With the continuous expansion
                 of a user's social network, privacy settings alone are
                 often inadequate to protect a user's profile. In this
                 research, we aim to address some critical issues
                 related to privacy protection: (1) How can we measure
                 and assess individual users' vulnerability? (2) With
                 the diversity of one's social network friends, how can
                 one figure out an effective approach to maintaining
                 balance between vulnerability and social utility? In
                 this work, first we present a novel way to define
                 vulnerable friends from an individual user's
                 perspective. User vulnerability is dependent on whether
                 or not the user's friends' privacy settings protect the
                 friend and the individual's network of friends (which
                 includes the user). We show that it is feasible to
                 measure and assess user vulnerability and reduce one's
                 vulnerability without changing the structure of a
                 social networking site. The approach is to unfriend
                 one's most vulnerable friends. However, when such a
                 vulnerable friend is also socially important,
                 unfriending him or her would significantly reduce one's
                 own social status. We formulate this novel problem as
                 vulnerability minimization with social utility
                 constraints. We formally define the optimization
                 problem and provide an approximation algorithm with a
                 proven bound. Finally, we conduct a large-scale
                 evaluation of a new framework using a Facebook dataset.
                  We conduct experiments and observe how much an
                  individual user's vulnerability can be decreased by
                  unfriending a vulnerable friend. We compare the
                  performance
                 of different unfriending strategies and discuss the
                 security risk of new friend requests. Additionally, by
                 employing different forms of social utility, we confirm
                 that the balance between user vulnerability and social
                 utility can be practically achieved.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Duan:2014:SRC,
  author =       "Lian Duan and W. Nick Street and Yanchi Liu and
                 Songhua Xu and Brook Wu",
  title =        "Selecting the Right Correlation Measure for Binary
                 Data",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2637484",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Finding the most interesting correlations among items
                 is essential for problems in many commercial, medical,
                 and scientific domains. Although there are numerous
                 measures available for evaluating correlations,
                 different correlation measures provide drastically
                 different results. Piatetsky-Shapiro provided three
                 mandatory properties for any reasonable correlation
                 measure, and Tan et al. proposed several properties to
                 categorize correlation measures; however, it is still
                  hard for users to choose a desirable correlation
                  measure according to their needs. In order to solve
                 this problem, we explore the effectiveness problem in
                 three ways. First, we propose two desirable properties
                 and two optional properties for correlation measure
                 selection and study the property satisfaction for
                 different correlation measures. Second, we study
                 different techniques to adjust correlation measures and
                 propose two new correlation measures: the Simplified $
                 \chi^2 $ with Continuity Correction and the Simplified
                 $ \chi^2 $ with Support. Third, we analyze the upper
                 and lower bounds of different measures and categorize
                 them by the bound differences. Combining these three
                 directions, we provide guidelines for users to choose
                 the proper measure according to their needs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
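
%%% As a concrete reference point for the adjustments discussed
%%% above, the standard chi-squared statistic with Yates's
%%% continuity correction for a 2x2 binary contingency table is
%%% shown below; the article's Simplified $ \chi^2 $ variants
%%% modify this starting point, so the sketch is not the proposed
%%% measures themselves.
%%%
%%%     def yates_chi2(n11, n10, n01, n00):
%%%         # Chi-squared with continuity correction for the 2x2
%%%         # table of two binary items (counts of co-occurrence).
%%%         n = n11 + n10 + n01 + n00
%%%         r1, r0 = n11 + n10, n01 + n00
%%%         c1, c0 = n11 + n01, n10 + n00
%%%         chi2 = 0.0
%%%         for obs, row, col in ((n11, r1, c1), (n10, r1, c0),
%%%                               (n01, r0, c1), (n00, r0, c0)):
%%%             exp = row * col / n
%%%             chi2 += (abs(obs - exp) - 0.5) ** 2 / exp
%%%         return chi2
%%%
%%%     print(yates_chi2(30, 10, 10, 50))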

@Article{Huang:2014:PBA,
  author =       "Hao Huang and Hong Qin and Shinjae Yoo and Dantong
                 Yu",
  title =        "Physics-Based Anomaly Detection Defined on Manifold
                 Space",
  journal =      j-TKDD,
  volume =       "9",
  number =       "2",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2641574",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Oct 7 18:49:26 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Current popular anomaly detection algorithms are
                 capable of detecting global anomalies but often fail to
                 distinguish local anomalies from normal instances.
                 Inspired by contemporary physics theory (i.e., heat
                 diffusion and quantum mechanics), we propose two
                 unsupervised anomaly detection algorithms. Building on
                 the embedding manifold derived from heat diffusion, we
                 devise Local Anomaly Descriptor (LAD), which faithfully
                 reveals the intrinsic neighborhood density. It uses a
                 scale-dependent umbrella operator to bridge global and
                 local properties, which makes LAD more informative
                 within an adaptive scope of neighborhood. To offer more
                 stability of local density measurement on scaling
                 parameter tuning, we formulate Fermi Density Descriptor
                 (FDD), which measures the probability of a fermion
                 particle being at a specific location. By choosing the
                 stable energy distribution function, FDD steadily
                 distinguishes anomalies from normal instances with any
                 scaling parameter setting. To further enhance the
                 efficacy of our proposed algorithms, we explore the
                 utility of anisotropic Gaussian kernel (AGK), which
                 offers better manifold-aware affinity information. We
                 also quantify and examine the effect of different
                 Laplacian normalizations for anomaly detection.
                 Comprehensive experiments on both synthetic and
                 benchmark datasets verify that our proposed algorithms
                 outperform the existing anomaly detection algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Gionis:2015:ISI,
  author =       "Aristides Gionis and Hang Li",
  title =        "Introduction to the Special Issue {ACM SIGKDD} 2013",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "15:1--15:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700993",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15e",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Jha:2015:SES,
  author =       "Madhav Jha and C. Seshadhri and Ali Pinar",
  title =        "A Space-Efficient Streaming Algorithm for Estimating
                 Transitivity and Triangle Counts Using the Birthday
                 Paradox",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "15:1--15:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700395",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 6 09:34:37 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We design a space-efficient algorithm that
                 approximates the transitivity (global clustering
                 coefficient) and total triangle count with only a
                 single pass through a graph given as a stream of edges.
                 Our procedure is based on the classic probabilistic
                 result, the birthday paradox. When the transitivity is
                 constant and there are more edges than wedges (common
                 properties for social networks), we can prove that our
                 algorithm requires $ O(\sqrt n) $ space ($n$ is the
                 number of vertices) to provide accurate estimates. We
                 run a detailed set of experiments on a variety of real
                 graphs and demonstrate that the memory requirement of
                 the algorithm is a tiny fraction of the graph. For
                 example, even for a graph with 200 million edges, our
                 algorithm stores just 40,000 edges to give accurate
                  results. Being a single-pass streaming algorithm, our
                 procedure also maintains a real-time estimate of the
                 transitivity/number of triangles of a graph by storing
                 a minuscule fraction of edges.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
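
%%% The single-pass, small-memory principle above can be seen in a
%%% much-simplified cousin of the algorithm: reservoir-sample s
%%% edges, count the triangles that land entirely inside the
%%% sample, and scale by (m/s)^3.  This crude estimator is only
%%% approximately unbiased and has far higher variance than the
%%% paper's wedge-based birthday-paradox scheme, which also tracks
%%% transitivity, but it shows the skeleton:
%%%
%%%     import random
%%%
%%%     def estimate_triangles(edge_stream, s=5000, seed=0):
%%%         rnd = random.Random(seed)
%%%         res, m = [], 0
%%%         for u, v in edge_stream:        # single pass
%%%             m += 1
%%%             if len(res) < s:
%%%                 res.append((u, v))
%%%             else:
%%%                 j = rnd.randrange(m)    # reservoir sampling
%%%                 if j < s:
%%%                     res[j] = (u, v)
%%%         adj = {}
%%%         for u, v in res:
%%%             adj.setdefault(u, set()).add(v)
%%%             adj.setdefault(v, set()).add(u)
%%%         # Each sampled triangle is seen once per sampled edge.
%%%         tri = sum(len(adj[u] & adj[v]) for u, v in res) // 3
%%%         return tri * (m / len(res)) ** 3 if res else 0.0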

@Article{Tang:2015:FMT,
  author =       "Lu-An Tang and Xiao Yu and Quanquan Gu and Jiawei Han
                 and Guofei Jiang and Alice Leung and Thomas {La
                 Porta}",
  title =        "A Framework of Mining Trajectories from Untrustworthy
                 Data in Cyber-Physical System",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "16:1--16:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700394",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 6 09:34:37 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A cyber-physical system (CPS) integrates physical
                 (i.e., sensor) devices with cyber (i.e., informational)
                 components to form a context-sensitive system that
                 responds intelligently to dynamic changes in real-world
                 situations. The CPS has wide applications in scenarios
                 such as environment monitoring, battlefield
                 surveillance, and traffic control. One key research
                 problem of CPS is called mining lines in the sand. With
                 a large number of sensors (sand) deployed in a
                 designated area, the CPS is required to discover all
                 trajectories (lines) of passing intruders in real time.
                 There are two crucial challenges that need to be
                 addressed: (1) the collected sensor data are not
                 trustworthy, and (2) the intruders do not send out any
                 identification information. The system needs to
                 distinguish multiple intruders and track their
                 movements. This study proposes a method called LiSM
                 (Line-in-the-Sand Miner) to discover trajectories from
                 untrustworthy sensor data. LiSM constructs a watching
                 network from sensor data and computes the locations of
                 intruder appearances based on the link information of
                 the network. The system retrieves a cone model from the
                 historical trajectories to track multiple intruders.
                 Finally, the system validates the mining results and
                 updates sensors' reliability scores in a feedback
                 process. In addition, LoRM (Line-on-the-Road Miner) is
                 proposed for trajectory discovery on road networks:
                 mining lines on the roads. LoRM employs a
                 filtering-and-refinement framework to reduce the
                 distance computation overhead on road networks and
                 uses a shortest-path measure to track intruders. The
                 proposed methods are evaluated with extensive
                 experiments on big datasets. The experimental results
                 show that the proposed methods achieve higher accuracy
                 and efficiency in trajectory mining tasks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2015:QDR,
  author =       "Zheng Wang and Jieping Ye",
  title =        "Querying Discriminative and Representative Samples for
                 Batch Mode Active Learning",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "17:1--17:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700408",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Fri Mar 6 09:34:37 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Empirical risk minimization (ERM) provides a
                 principled guideline for many machine learning and data
                 mining algorithms. Under the ERM principle, one
                 minimizes an upper bound of the true risk, which is
                 approximated by the summation of empirical risk and the
                 complexity of the candidate classifier class. To
                 guarantee a satisfactory learning performance, ERM
                 requires that the training data are i.i.d. sampled from
                 the unknown source distribution. However, this may not
                 be the case in active learning, where one selects the
                 most informative samples to label, and these data may
                 not follow the source distribution. In this article, we
                 generalize the ERM principle to the active learning
                 setting. We derive a novel form of upper bound for the
                 true risk in the active learning setting; by minimizing
                 this upper bound, we develop a practical batch mode
                 active learning method. The proposed formulation
                 involves a nonconvex integer programming optimization
                 problem. We solve it efficiently by an alternating
                 optimization method. Our method is shown to query the
                 most informative samples while preserving the source
                 distribution as much as possible, thus identifying the
                 most uncertain and representative queries. We further
                 extend our method to multiclass active learning by
                 introducing novel pseudolabels in the multiclass case
                 and developing an efficient algorithm. Experiments on
                 benchmark datasets and real-world applications
                 demonstrate the superior performance of our proposed
                 method compared to state-of-the-art methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
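
%%% The querying criterion above (discriminative yet representative) can
%%% be pictured with a small greedy selector.  This is a hedged sketch
%%% of the general idea, not the article's bound-minimizing integer
%%% program; the margin/similarity scores and the parameter alpha are
%%% assumptions.
%%%
%%%   import numpy as np
%%%
%%%   def select_batch(X_pool, proba, k, alpha=0.5):
%%%       # Uncertainty: small margin between the two class
%%%       # probabilities (binary case).
%%%       margin = 1.0 - np.abs(proba[:, 1] - proba[:, 0])
%%%       # Representativeness: average RBF similarity to the pool,
%%%       # so chosen points also track the source distribution.
%%%       d2 = ((X_pool[:, None, :] - X_pool[None, :, :]) ** 2).sum(-1)
%%%       sim = np.exp(-d2 / d2.mean())
%%%       score = alpha * margin + (1 - alpha) * sim.mean(axis=1)
%%%       chosen = []
%%%       for _ in range(k):
%%%           i = int(np.argmax(score))
%%%           chosen.append(i)
%%%           score[i] = -np.inf
%%%           # Penalize near-duplicates of already chosen points.
%%%           score -= (1 - alpha) * sim[i]
%%%       return chosen
%%%
%%% Here proba would come from any probabilistic classifier (e.g.,
%%% scikit-learn's predict_proba); the O(n^2) similarity matrix limits
%%% the sketch to modest pools.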

@Article{Gopal:2015:HBI,
  author =       "Siddharth Gopal and Yiming Yang",
  title =        "Hierarchical {Bayesian} Inference and Recursive
                 Regularization for Large-Scale Classification",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "18:1--18:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629585",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we address open challenges in
                 large-scale classification, focusing on how to
                 effectively leverage the dependency structures
                 (hierarchical or graphical) among class labels, and how
                 to make the inference scalable in jointly optimizing
                 all model parameters. We propose two main approaches,
                 namely the hierarchical Bayesian inference framework
                 and the recursive regularization scheme. The key idea
                 in both approaches is to reinforce the similarity among
                 parameters across the nodes in a hierarchy or network
                 based on the proximity and connectivity of the nodes.
                 For scalability, we develop hierarchical variational
                 inference algorithms and fast dual coordinate descent
                 training procedures with parallelization. In our
                 experiments for classification problems with hundreds
                 of thousands of classes and millions of training
                 instances with terabytes of parameters, the proposed
                 methods show consistent and statistically significant
                 improvements over other competing approaches, and the
                 best results on multiple benchmark datasets for
                 large-scale classification.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
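
%%% The recursive regularization scheme has a particularly simple core:
%%% each node's parameter vector is pulled toward its parent's in the
%%% label hierarchy.  A sketch of that penalty and its gradient (the
%%% data-fit loss it would be added to is omitted; names are assumed):
%%%
%%%   import numpy as np
%%%
%%%   def recursive_reg(W, parent, lam=1.0):
%%%       # W: dict node -> weight vector; parent: dict node -> parent
%%%       # (the root maps to None and is shrunk toward zero).
%%%       penalty = 0.0
%%%       grad = {n: np.zeros_like(w) for n, w in W.items()}
%%%       for n, w in W.items():
%%%           p = parent.get(n)
%%%           diff = w - (W[p] if p is not None else 0.0)
%%%           penalty += 0.5 * lam * float(diff @ diff)
%%%           grad[n] += lam * diff
%%%           if p is not None:
%%%               grad[p] -= lam * diff
%%%       return penalty, grad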

@Article{Yin:2015:MLB,
  author =       "Hongzhi Yin and Bin Cui and Ling Chen and Zhiting Hu
                 and Chengqi Zhang",
  title =        "Modeling Location-Based User Rating Profiles for
                 Personalized Recommendation",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "19:1--19:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2663356",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article proposes LA-LDA, a location-aware
                 probabilistic generative model that exploits
                 location-based ratings to model user profiles and
                 produce recommendations. Most of the existing
                 recommendation models do not consider the spatial
                 information of users or items; however, LA-LDA supports
                 three classes of location-based ratings, namely spatial
                 user ratings for nonspatial items, nonspatial user
                 ratings for spatial items, and spatial user ratings for
                 spatial items. LA-LDA consists of two components,
                 ULA-LDA and ILA-LDA, which are designed to take into
                 account user and item location information,
                 respectively. The component ULA-LDA explicitly
                 incorporates and quantifies the influence from local
                 public preferences to produce recommendations by
                 considering user home locations, whereas the component
                 ILA-LDA recommends items that are closer in both taste
                 and travel distance to the querying users by capturing
                 item co-occurrence patterns, as well as item location
                 co-occurrence patterns. The two components of LA-LDA
                 can be applied either separately or collectively,
                 depending on the available types of location-based
                 ratings. To demonstrate the applicability and
                 flexibility of the LA-LDA model, we deploy it to both
                 top-$k$ recommendation and cold start recommendation
                 scenarios. Experimental evidence on large-scale
                 real-world data, including the data from Gowalla (a
                 location-based social network), DoubanEvent (an
                 event-based social network), and MovieLens (a movie
                 recommendation system), reveal that LA-LDA models user
                 profiles more accurately by outperforming existing
                 recommendation models for top-$k$ recommendation and
                 the cold start problem.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Hu:2015:PSD,
  author =       "Juhua Hu and De-Chuan Zhan and Xintao Wu and Yuan
                 Jiang and Zhi-Hua Zhou",
  title =        "Pairwised Specific Distance Learning from Physical
                 Linkages",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "20:1--20:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700405",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In real tasks, usually a good classification
                 performance can only be obtained when a good distance
                 metric is obtained; therefore, distance metric learning
                 has attracted significant attention in the past few
                 years. Typical studies of distance metric learning
                 evaluate how to construct an appropriate distance
                 metric that is able to separate training data points
                 from different classes or satisfy a set of constraints
                 (e.g., must-links and/or cannot-links). It is
                 noteworthy that this task becomes challenging when
                 there are only limited labeled training data points and
                 no constraints are given explicitly. Moreover, most
                 existing approaches aim to construct a global distance
                 metric that is applicable to all data points. However,
                 different data points may have different properties and
                 may require different distance metrics. We notice that
                 data points in real tasks are often connected by
                 physical links (e.g., people are linked with each other
                 in social networks; personal webpages are often
                 connected to other webpages, including nonpersonal
                 webpages), but the linkage information has not been
                 exploited in distance metric learning. In this article,
                 we develop a pairwised specific distance (PSD) approach
                 that exploits the structures of physical linkages and
                 in particular captures the key observations that
                 nonmetric and clique linkages imply the appearance of
                 different or unique semantics, respectively. It is
                 noteworthy that, rather than generating a global
                 distance, PSD generates different distances for
                 different pairs of data points; this property is
                 desired in applications involving complicated data
                 semantics. We mainly present PSD for multi-class
                 learning and further extend it to multi-label learning.
                 Experimental results validate the effectiveness of PSD,
                 especially in the scenarios in which there are very
                 limited labeled training data points and no explicit
                 constraints are given.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Soundarajan:2015:ULG,
  author =       "Sucheta Soundarajan and John E. Hopcroft",
  title =        "Use of Local Group Information to Identify Communities
                 in Networks",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "21:1--21:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700404",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The recent interest in networks has inspired a broad
                 range of work on algorithms and techniques to
                 characterize, identify, and extract communities from
                 networks. Such efforts are complicated by a lack of
                 consensus on what a ``community'' truly is, and these
                 disagreements have led to a wide variety of
                 mathematical formulations for describing communities.
                 Often, these mathematical formulations, such as
                 modularity and conductance, have been founded in the
                 general principle that communities, like a $ G(n, p) $
                 graph, are ``round,'' with connections throughout the
                 entire community, and so algorithms were developed to
                 optimize such mathematical measures. More recently, a
                 variety of algorithms have been developed that, rather
                 than expecting connectivity through the entire
                 community, seek out very small groups of well-connected
                 nodes and then connect these groups into larger
                 communities. In this article, we examine seven real
                 networks, each containing external annotation that
                 allows us to identify ``annotated communities.'' A
                 study of these annotated communities gives insight into
                 why the second category of community detection
                 algorithms may be more successful than the first
                 category. We then present a flexible algorithm template
                 that is based on the idea of joining together small
                 sets of nodes. In this template, we first identify very
                 small, tightly connected ``subcommunities'' of nodes,
                 each corresponding to a single node's ``perception'' of
                 the network around it. We then create a new network in
                 which each node represents such a subcommunity, and
                 then identify communities in this new network. Because
                 each node can appear in multiple subcommunities, this
                 method allows us to detect overlapping communities.
                 When evaluated on real data, we show that our template
                 outperforms many other state-of-the-art algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
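
%%% The "join small groups" template described above is easy to state
%%% in code.  In this hedged sketch each node's "perception" is just
%%% its ego network, and connected components stand in for the
%%% second-stage community detector; both choices are illustrative
%%% simplifications.
%%%
%%%   import networkx as nx
%%%   from itertools import combinations
%%%
%%%   def subcommunity_template(G, min_overlap=2):
%%%       # One subcommunity per node: the node plus its neighbors.
%%%       subs = {v: set(G[v]) | {v} for v in G}
%%%       # Meta-graph: subcommunities sharing enough members.
%%%       M = nx.Graph()
%%%       M.add_nodes_from(subs)
%%%       for u, v in combinations(subs, 2):
%%%           if len(subs[u] & subs[v]) >= min_overlap:
%%%               M.add_edge(u, v)
%%%       # Communities of the meta-graph, mapped back to nodes;
%%%       # a node may land in several communities (overlap).
%%%       return [set().union(*(subs[v] for v in comp))
%%%               for comp in nx.connected_components(M)]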

@Article{Wang:2015:UCN,
  author =       "Pinghui Wang and Junzhou Zhao and John C. S. Lui and
                 Don Towsley and Xiaohong Guan",
  title =        "Unbiased Characterization of Node Pairs over Large
                 Graphs",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "22:1--22:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700393",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Characterizing user pair relationships is important
                 for applications such as friend recommendation and
                 interest targeting in online social networks (OSNs).
                 Due to the large-scale nature of such networks, it is
                 infeasible to enumerate all user pairs and thus
                 sampling is used. In this article, we show that it is a
                 great challenge for OSN service providers to
                 characterize user pair relationships, even when they
                 possess the complete graph topology. The reason is that
                 when sampling techniques (i.e., uniform vertex sampling
                 (UVS) and random walk (RW)) are naively applied, they
                 can introduce large biases, particularly for estimating
                 similarity distribution of user pairs with constraints
                 like existence of mutual neighbors, which is important
                 for applications such as identifying network homophily.
                 Estimating statistics of user pairs is more challenging
                 in the absence of the complete topology information, as
                 an unbiased sampling technique like UVS is usually not
                 allowed and exploring the OSN graph topology is
                 expensive. To address these challenges, we present
                 unbiased sampling methods to characterize user pair
                 properties based on UVS and RW techniques. We carry out
                 an evaluation of our methods to show their accuracy and
                 efficiency. Finally, we apply our methods to three
                 OSNs (Foursquare, Douban, and Xiami) and discover that
                 significant homophily is present in these networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
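
%%% The bias the article corrects is easy to demonstrate: a simple
%%% random walk visits a node with probability proportional to its
%%% degree, so Horvitz--Thompson style re-weighting by 1/degree
%%% recovers an unbiased node average.  A minimal sketch of that
%%% principle (the article builds node-pair estimators on top of it;
%%% names here are assumptions):
%%%
%%%   import random
%%%
%%%   def rw_unbiased_mean(G, f, start, steps=100000, seed=1):
%%%       # G: dict node -> list of neighbors (connected, no dead
%%%       # ends).  Estimates (1/|V|) * sum_v f(v).
%%%       rng = random.Random(seed)
%%%       v = start
%%%       num = den = 0.0
%%%       for _ in range(steps):
%%%           w = 1.0 / len(G[v])      # undo the degree bias
%%%           num += w * f(v)
%%%           den += w
%%%           v = rng.choice(G[v])
%%%       return num / den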

@Article{Vlachos:2015:DPC,
  author =       "Michail Vlachos and Johannes Schneider and Vassilios
                 G. Vassiliadis",
  title =        "On Data Publishing with Clustering Preservation",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "23:1--23:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700403",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The emergence of cloud-based storage services is
                 opening up new avenues in data exchange and data
                 dissemination. This has amplified the interest in
                 right-protection mechanisms to establish ownership in
                 the event of data leakage. Current right-protection
                 technologies, however, rarely provide strong guarantees
                 on dataset utility after the protection process. This
                 work presents techniques that explicitly address this
                 topic and provably preserve the outcome of certain
                 mining operations. In particular, we take special care
                 to guarantee that the outcome of hierarchical
                 clustering operations remains the same before and after
                 right protection. Our approach considers all prevalent
                 hierarchical clustering variants: single-, complete-,
                 and average-linkage. We imprint the ownership in a
                 dataset using watermarking principles, and we derive
                 tight bounds on the expansion/contraction of distances
                 incurred by the process. We leverage our analysis to
                 design fast algorithms for right protection without
                 exhaustively searching the vast design space. Finally,
                 because the right-protection process introduces a
                 user-tunable distortion on the dataset, we explore the
                 possibility of using this mechanism for data
                 obfuscation. We quantify the tradeoff between
                 obfuscation and utility for spatiotemporal datasets and
                 discover very favorable characteristics of the process.
                 An additional advantage is that when one is interested
                 in both right-protecting and obfuscating the original
                 data values, the proposed mechanism can accomplish both
                 tasks simultaneously.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{VazDeMelo:2015:UDP,
  author =       "Pedro O. S. {Vaz De Melo} and Christos Faloutsos and
                 Renato Assun{\c{c}}{\~a}o and Rodrigo Alves and Antonio
                 A. F. Loureiro",
  title =        "Universal and Distinct Properties of Communication
                 Dynamics: How to Generate Realistic Inter-event Times",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "24:1--24:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700399",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "With the advancement of information systems, means of
                 communications are becoming cheaper, faster, and more
                 available. Today, millions of people carrying
                 smartphones or tablets are able to communicate
                 practically any time and anywhere they want. They can
                 access their e-mails, comment on weblogs, watch and
                 post videos and photos (as well as comment on them),
                 and make phone calls or text messages almost
                 ubiquitously. Given this scenario, in this article, we
                 tackle a fundamental aspect of this new era of
                 communication: How the time intervals between
                 communication events behave for different technologies
                 and means of communications. Are there universal
                 patterns for the Inter-Event Time Distribution (IED)?
                 How do inter-event times behave differently among
                 particular technologies? To answer these questions, we
                 analyzed eight different datasets from real and modern
                 communication data and found four well-defined patterns
                 seen in all the eight datasets. Moreover, we propose
                 the use of the Self-Feeding Process (SFP) to generate
                 inter-event times between communications. The SFP is an
                 extremely parsimonious point process that requires at
                 most two parameters and is able to generate inter-event
                 times with all the universal properties we observed in
                 the data. We also show three potential applications of
                 the SFP: as a framework to generate a synthetic dataset
                 containing realistic communication events of any one of
                 the analyzed means of communications, as a technique to
                 detect anomalies, and as a building block for more
                 specific models that aim to encompass the
                 particularities seen in each of the analyzed systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
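
%%% The Self-Feeding Process is described as a point process with at
%%% most two parameters in which each inter-event time feeds the next.
%%% The sketch below is one plausible reading -- the next time is
%%% exponential with mean tied to the previous one -- and the exact
%%% published parameterization should be taken from the article itself.
%%%
%%%   import random
%%%
%%%   def sfp_times(n, rho=1.0, beta=0.05, x0=1.0, seed=7):
%%%       # Each inter-event time is exponential with mean
%%%       # rho * (previous time) + beta; beta keeps the chain from
%%%       # collapsing to zero.  Two parameters, as in the article.
%%%       rng = random.Random(seed)
%%%       x, out = x0, []
%%%       for _ in range(n):
%%%           x = rng.expovariate(1.0 / (rho * x + beta))
%%%           out.append(x)
%%%       return out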

@Article{Zhang:2015:WIY,
  author =       "Jing Zhang and Jie Tang and Juanzi Li and Yang Liu and
                 Chunxiao Xing",
  title =        "Who Influenced You? {Predicting} Retweet via Social
                 Influence Locality",
  journal =      j-TKDD,
  volume =       "9",
  number =       "3",
  pages =        "25:1--25:??",
  month =        apr,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700398",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Apr 14 09:22:28 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Social influence occurs when one's opinions, emotions,
                 or behaviors are affected by others in a social
                 network. However, social influence takes many forms,
                 and its underlying mechanism is still unclear. For
                 example, how is one's behavior influenced by a group of
                 friends who know each other and by the friends from
                 different ego friend circles? In this article, we study
                 the social influence problem in a large microblogging
                 network. Particularly, we consider users' (re)tweet
                 behaviors and focus on investigating how friends in
                 one's ego network influence retweet behaviors. We
                 propose a novel notion of social influence locality and
                 develop two instantiation functions based on pairwise
                 influence and structural diversity. The defined
                 influence locality functions have strong predictive
                 power. Without any additional features, we can obtain
                 an F1-score of 71.65\% for predicting users' retweet
                 behaviors by training a logistic regression classifier
                 based on the defined influence locality functions. We
                 incorporate social influence locality into a factor
                 graph model, which can further leverage the
                 network-based correlation. Our experiments on the large
                 microblogging network show that the model significantly
                 improves the precision of retweet prediction. Our
                 analysis also reveals several intriguing discoveries.
                 For example, if you have six friends retweeting a
                 microblog, the average likelihood that you will also
                 retweet it strongly depends on the structure among the
                 six friends: The likelihood will significantly drop
                 (only 1/6) when the six friends do not know each other,
                 compared with the case when the six friends know each
                 other.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "25",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
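
%%% The two instantiations of social influence locality (pairwise
%%% influence and structural diversity) reduce, in the simplest
%%% reading, to two features of the retweeting friends inside a user's
%%% ego network.  A hedged sketch (feature definitions and the helper
%%% friends_who_retweeted are assumptions for illustration):
%%%
%%%   import networkx as nx
%%%
%%%   def locality_features(G, active_friends):
%%%       # active_friends: the user's friends who retweeted the
%%%       # microblog.  Structural diversity = number of connected
%%%       # components they form among themselves.
%%%       H = G.subgraph(active_friends)
%%%       n_cc = nx.number_connected_components(H) if active_friends else 0
%%%       return [len(active_friends), n_cc]
%%%
%%%   # X = [locality_features(G, friends_who_retweeted(u, m))
%%%   #      for (u, m) in candidate_pairs]       # hypothetical helper
%%%   # y = 1 if u retweeted m else 0, per pair; then, e.g.:
%%%   # from sklearn.linear_model import LogisticRegression
%%%   # clf = LogisticRegression().fit(X, y)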

@Article{Xie:2015:MMA,
  author =       "Hong Xie and John C. S. Lui",
  title =        "Mathematical Modeling and Analysis of Product Rating
                 with Partial Information",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "26:1--26:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700386",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Many Web services like Amazon, Epinions, and
                 TripAdvisor provide historical product ratings so that
                 users can evaluate the quality of products. Product
                 ratings are important because they affect how well a
                 product will be adopted by the market. The challenge is
                 that we only have partial information on these ratings:
                 each user assigns ratings to only a small subset of
                 products. Under this partial information setting, we
                 explore a number of fundamental questions. What is the
                 minimum number of ratings a product needs so that one
                 can make a reliable evaluation of its quality? How may
                 users' misbehavior, such as cheating in product rating,
                 affect the evaluation result? To answer these
                 questions, we present a probabilistic model to capture
                 various important factors (e.g., rating aggregation
                 rules, rating behavior) that may influence the product
                 quality assessment under the partial information
                 setting. We derive the minimum number of ratings needed
                 to produce a reliable indicator on the quality of a
                 product. We extend our model to accommodate users'
                 misbehavior in product rating. We derive the maximum
                 fraction of misbehaving users that a rating aggregation
                 rule can tolerate and the minimum number of ratings
                 needed to compensate. We carry out experiments using
                 both synthetic and real-world data (from Amazon and
                 TripAdvisor). We not only validate our model but also
                 show that the ``average rating rule'' produces more
                 reliable and robust product quality assessments than
                 the ``majority rating rule'' and the ``median rating
                 rule'' in aggregating product ratings. Last, we perform
                 experiments on two movie rating datasets (from Flixster
                 and Netflix) to demonstrate how to apply our framework
                 to improve the applications of recommender systems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "26",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
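
%%% The three aggregation rules compared in the article are one-liners,
%%% and a toy simulation (numbers purely illustrative) shows the kind
%%% of robustness question being asked: how badly can 20% bad-mouthing
%%% raters distort each rule?
%%%
%%%   import random
%%%   from collections import Counter
%%%   from statistics import mean, median
%%%
%%%   def aggregate(ratings, rule):
%%%       if rule == "average":
%%%           return mean(ratings)
%%%       if rule == "median":
%%%           return median(ratings)
%%%       if rule == "majority":            # most frequent rating
%%%           return Counter(ratings).most_common(1)[0][0]
%%%       raise ValueError(rule)
%%%
%%%   rng = random.Random(0)
%%%   honest = [rng.choice([4, 5]) for _ in range(80)]  # quality ~4.5
%%%   cheats = [1] * 20                     # 20% misbehaving raters
%%%   for rule in ("average", "median", "majority"):
%%%       print(rule, aggregate(honest + cheats, rule))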

@Article{Esuli:2015:OTQ,
  author =       "Andrea Esuli and Fabrizio Sebastiani",
  title =        "Optimizing Text Quantifiers for Multivariate Loss
                 Functions",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "27:1--27:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700406",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We address the problem of quantification, a supervised
                 learning task whose goal is, given a class, to estimate
                 the relative frequency (or prevalence) of the class in
                 a dataset of unlabeled items. Quantification has
                 several applications in data and text mining, such as
                 estimating the prevalence of positive reviews in a set
                 of reviews of a given product or estimating the
                 prevalence of a given support issue in a dataset of
                 transcripts of phone calls to tech support. So far,
                 quantification has been addressed by learning a
                 general-purpose classifier, counting the unlabeled
                 items that have been assigned the class, and tuning the
                 obtained counts according to some heuristics. In this
                 article, we depart from the tradition of using
                 general-purpose classifiers and use instead a
                 supervised learning model for structured prediction,
                 capable of generating classifiers directly optimized
                 for the (multivariate and nonlinear) function used for
                 evaluating quantification accuracy. The experiments
                 that we have run on 5,500 binary high-dimensional
                 datasets (averaging more than 14,000 documents each)
                 show that this method is more accurate, more stable,
                 and more efficient than existing state-of-the-art
                 quantification methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "27",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
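
%%% The tradition the article departs from -- learn a classifier, count
%%% positive predictions, then adjust the counts -- is worth seeing
%%% concretely.  A sketch of classify-and-count (CC) and its adjusted
%%% variant (ACC), the standard baselines, not the article's structured
%%% prediction method:
%%%
%%%   import numpy as np
%%%
%%%   def classify_and_count(clf, X):
%%%       # CC: prevalence = fraction predicted positive.
%%%       return float(np.mean(clf.predict(X) == 1))
%%%
%%%   def adjusted_cc(cc, tpr, fpr):
%%%       # ACC: observed rate = tpr*p + fpr*(1-p); solve for the
%%%       # true prevalence p and clip to [0, 1].  tpr/fpr come from
%%%       # held-out validation data.
%%%       if tpr == fpr:
%%%           return cc
%%%       return min(1.0, max(0.0, (cc - fpr) / (tpr - fpr)))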

@Article{Lin:2015:IMS,
  author =       "Bing-Rong Lin and Daniel Kifer",
  title =        "Information Measures in Statistical Privacy and Data
                 Processing Applications",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "28:1--28:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700407",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In statistical privacy, utility refers to two
                 concepts: information preservation, how much
                 statistical information is retained by a sanitizing
                 algorithm, and usability, how (and with how much
                 difficulty) one extracts this information to build
                 statistical models, answer queries, and so forth. Some
                 scenarios incentivize a separation between information
                 preservation and usability, so that the data owner
                 first chooses a sanitizing algorithm to maximize a
                 measure of information preservation, and, afterward,
                 the data consumers process the sanitized output
                 according to their various individual needs [Ghosh et
                 al. 2009; Williams and McSherry 2010]. We analyze the
                 information-preserving properties of utility measures
                 with a combination of two new and three existing
                 utility axioms and study how violations of an axiom can
                 be fixed. We show that the average (over possible
                 outputs of the sanitizer) error of Bayesian decision
                 makers forms the unique class of utility measures that
                 satisfy all of the axioms. The axioms are agnostic to
                 Bayesian concepts such as subjective probabilities and
                 hence strengthen support for Bayesian views in privacy
                 research. In particular, this result connects
                 information preservation to aspects of usability: if the
                 information preservation of a sanitizing algorithm
                 should be measured as the average error of a Bayesian
                 decision maker, shouldn't Bayesian decision theory be a
                 good choice when it comes to using the sanitized
                 outputs for various purposes? We put this idea to the
                 test in the unattributed histogram problem where our
                 decision-theoretic postprocessing algorithm empirically
                 outperforms previously proposed approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "28",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Huang:2015:DAC,
  author =       "Hao Huang and Shinjae Yoo and Dantong Yu and Hong
                 Qin",
  title =        "Density-Aware Clustering Based on Aggregated Heat
                 Kernel and Its Transformation",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "29:1--29:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700385",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Current spectral clustering algorithms suffer from the
                 sensitivity to existing noise and parameter scaling and
                 may not be aware of different density distributions
                 across clusters. If these problems are left untreated,
                 the consequent clustering results cannot accurately
                 represent true data patterns, in particular, for
                 complex real-world datasets with heterogeneous
                 densities. This article aims to solve these problems by
                 proposing a diffusion-based Aggregated Heat Kernel
                 (AHK) to improve the clustering stability, and a Local
                 Density Affinity Transformation (LDAT) to correct the
                 bias originating from different cluster densities. AHK
                 statistically models the heat diffusion traces along
                 the entire time scale, so it ensures robustness during
                 the clustering process, while LDAT probabilistically
                 reveals the local density of each instance and
                 suppresses the local density bias in the affinity
                 matrix. Our proposed framework integrates these two
                 techniques systematically. As a result, it not only
                 provides an advanced noise-resisting and density-aware
                 spectral mapping to the original dataset but also
                 demonstrates stability during the process of
                 tuning the scaling parameter (which usually controls
                 the range of neighborhood). Furthermore, our framework
                 works well with the majority of similarity kernels,
                 which ensures its applicability to many types of data
                 and problem domains. The systematic experiments on
                 different applications show that our proposed algorithm
                 outperforms state-of-the-art clustering algorithms for
                 the data with heterogeneous density distributions and
                 achieves robust clustering performance with respect to
                 tuning the scaling parameter and handling various
                 levels and types of noise.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "29",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
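
%%% The aggregated heat kernel can be pictured as summing graph heat
%%% kernels exp(-t L) over diffusion times.  In this sketch the
%%% integral over the entire time scale is replaced by a small grid of
%%% t values, an editorial simplification of the article's aggregation:
%%%
%%%   import numpy as np
%%%
%%%   def aggregated_heat_kernel(A, t_grid=(0.5, 1, 2, 4, 8)):
%%%       # A: symmetric adjacency matrix.  L is the symmetric
%%%       # normalized Laplacian; the result is an affinity matrix
%%%       # that spectral clustering can consume.
%%%       d = A.sum(axis=1)
%%%       Dis = np.diag(1.0 / np.sqrt(np.maximum(d, 1e-12)))
%%%       L = np.eye(len(A)) - Dis @ A @ Dis
%%%       lam, U = np.linalg.eigh(L)
%%%       H = sum(U @ np.diag(np.exp(-t * lam)) @ U.T for t in t_grid)
%%%       return H / len(t_grid)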

@Article{Yu:2015:CSF,
  author =       "Kui Yu and Wei Ding and Dan A. Simovici and Hao Wang
                 and Jian Pei and Xindong Wu",
  title =        "Classification with Streaming Features: an
                 Emerging-Pattern Mining Approach",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "30:1--30:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700409",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Many datasets from real-world applications have very
                 high-dimensional or increasing feature space. It is a
                 new research problem to learn and maintain a classifier
                 to deal with very high dimensionality or streaming
                 features. In this article, we adapt the well-known
                 emerging-pattern--based classification models and
                 propose a semi-streaming approach. For streaming
                 features, it is computationally expensive or even
                 prohibitive to mine long emerging patterns, and it is
                 nontrivial to integrate emerging-pattern mining with
                 feature selection. We present an online feature
                 selection step, which is capable of selecting and
                 maintaining a pool of effective features from a feature
                 stream. Then, in our offline step, separated from the
                 online step, we periodically compute and update
                 emerging patterns from the pool of selected features
                 from the online step. We evaluate the effectiveness and
                 efficiency of the proposed method using a series of
                 benchmark datasets and a real-world case study on Mars
                 crater detection. Our proposed method yields
                 classification performance comparable to the
                 state-of-the-art static classification methods. Most
                 important, the proposed method is significantly faster
                 and can efficiently handle datasets with streaming
                 features.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "30",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
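
%%% The online half of the approach -- keeping a bounded pool of
%%% effective features from a stream -- can be sketched independently
%%% of the offline emerging-pattern step.  Pool size, eviction rule,
%%% and the relevance score are assumptions here:
%%%
%%%   def online_feature_pool(feature_stream, relevance, pool_size=100):
%%%       # relevance(f): any score computable on the data seen so
%%%       # far (e.g., mutual information with the class).  The
%%%       # offline step would periodically mine emerging patterns
%%%       # over the returned pool.
%%%       pool = {}
%%%       for f in feature_stream:
%%%           pool[f] = relevance(f)
%%%           if len(pool) > pool_size:
%%%               pool.pop(min(pool, key=pool.get))   # evict weakest
%%%       return set(pool)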

@Article{Liu:2015:SEH,
  author =       "Guimei Liu and Haojun Zhang and Mengling Feng and
                 Limsoon Wong and See-Kiong Ng",
  title =        "Supporting Exploratory Hypothesis Testing and
                 Analysis",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "31:1--31:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2701430",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Conventional hypothesis testing is carried out in a
                 hypothesis-driven manner. A scientist must first
                 formulate a hypothesis based on what he or she sees and
                 then devise a variety of experiments to test it. Given
                 the rapid growth of data, it has become virtually
                 impossible for a person to manually inspect all data to
                 find all of the interesting hypotheses for testing. In
                 this article, we propose and develop a data-driven
                 framework for automatic hypothesis testing and
                 analysis. We define a hypothesis as a comparison
                 between two or more subpopulations. We find
                 subpopulations for comparison using frequent pattern
                 mining techniques and then pair them up for statistical
                 hypothesis testing. We also generate additional
                 information for further analysis of the hypotheses that
                 are deemed significant. The number of hypotheses
                 generated can be very large, and many of them are very
                 similar. We develop algorithms to remove redundant
                 hypotheses and present a succinct set of significant
                 hypotheses to users. We conducted a set of experiments
                 to show the efficiency and effectiveness of the
                 proposed algorithms. The results show that our system
                 can help users (1) identify significant hypotheses
                 efficiently, (2) isolate the reasons behind significant
                 hypotheses efficiently, and (3) find confounding
                 factors that form Simpson's paradoxes with discovered
                 significant hypotheses.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "31",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
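
%%% The core loop -- pair up pattern-defined subpopulations and test
%%% them -- is compact.  In this sketch a pattern is a dict of
%%% attribute=value conditions over a pandas DataFrame, Welch's t-test
%%% stands in for the statistical machinery, and the article's
%%% redundancy pruning and Simpson's-paradox analysis are omitted:
%%%
%%%   from itertools import combinations
%%%   from scipy.stats import ttest_ind
%%%
%%%   def test_subpopulations(df, patterns, target, alpha=0.01):
%%%       def rows(p):                  # rows matching the pattern
%%%           m = df
%%%           for col, val in p.items():
%%%               m = m[m[col] == val]
%%%           return m[target]
%%%       significant = []
%%%       for p1, p2 in combinations(patterns, 2):
%%%           a, b = rows(p1), rows(p2)
%%%           if len(a) > 30 and len(b) > 30:
%%%               _, pval = ttest_ind(a, b, equal_var=False)
%%%               if pval < alpha:
%%%                   significant.append((p1, p2, pval))
%%%       return significant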

@Article{Greco:2015:PDU,
  author =       "Gianluigi Greco and Antonella Guzzo and Francesco
                 Lupia and Luigi Pontieri",
  title =        "Process Discovery under Precedence Constraints",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "32:1--32:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710020",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Process discovery has emerged as a powerful approach
                 to support the analysis and the design of complex
                 processes. It consists of analyzing a set of traces
                 registering the sequence of tasks performed along
                 several enactments of a transactional system, in order
                 to build a process model that can explain all the
                 episodes recorded over them. An approach to accomplish
                 this task is presented that can benefit from the
                 background knowledge that, in many cases, is available
                 to the analysts taking care of the process (re-)design.
                 The approach is based on encoding the information
                 gathered from the log and the (possibly) given
                 background knowledge in terms of precedence
                 constraints, that is, of constraints over the topology
                 of the resulting process models. Mining algorithms are
                 eventually formulated in terms of reasoning problems
                 over precedence constraints, and the computational
                 complexity of such problems is thoroughly analyzed by
                 tracing their tractability frontier. Solution
                 algorithms are proposed and their properties analyzed.
                 These algorithms have been implemented in a prototype
                 system, and results of a thorough experimental activity
                 are discussed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "32",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
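
%%% Encoding a log as precedence constraints has a simple first
%%% approximation: task a precedes task b if, in every trace containing
%%% both, a's first occurrence comes before b's.  A sketch of just that
%%% encoding step (the article's reasoning problems over the
%%% constraints, and analyst-given background constraints, are beyond
%%% it):
%%%
%%%   from itertools import chain, permutations
%%%
%%%   def precedence_constraints(traces):
%%%       # traces: list of task sequences, e.g. [["a","b","c"], ...]
%%%       tasks = set(chain.from_iterable(traces))
%%%       out = set()
%%%       for a, b in permutations(tasks, 2):
%%%           shared = [t for t in traces if a in t and b in t]
%%%           if shared and all(t.index(a) < t.index(b) for t in shared):
%%%               out.add((a, b))
%%%       return out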

@Article{Mirbakhsh:2015:ITR,
  author =       "Nima Mirbakhsh and Charles X. Ling",
  title =        "Improving Top-{$N$} Recommendation for Cold-Start
                 Users via Cross-Domain Information",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "33:1--33:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2724720",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Making accurate recommendations for cold-start users
                 is a challenging yet important problem in
                 recommendation systems. Including more information from
                 other domains is a natural solution to improve the
                 recommendations. However, most previous work in
                 cross-domain recommendations has focused on improving
                 prediction accuracy with several severe limitations. In
                 this article, we extend our previous work on
                 clustering-based matrix factorization in single domains
                 into cross domains. In addition, we utilize recent
                 results on unobserved ratings. Our new method can more
                 effectively utilize data from auxiliary domains to
                 achieve better recommendations, especially for
                 cold-start users. For example, our method improves the
                 recall to 21\% on average for cold-start users, whereas
                 previous methods result in only 15\% recall in the
                 cross-domain Amazon dataset. We also observe almost the
                 same improvements in the Epinions dataset. Considering
                 that it is often difficult to make even a small
                 improvement in recommendations, for cold-start users in
                 particular, our result is quite significant.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "33",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Bonchi:2015:CCC,
  author =       "Francesco Bonchi and Aristides Gionis and Francesco
                 Gullo and Charalampos E. Tsourakakis and Antti
                 Ukkonen",
  title =        "Chromatic Correlation Clustering",
  journal =      j-TKDD,
  volume =       "9",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2728170",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Wed Jun 3 06:21:22 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We study a novel clustering problem in which the
                 pairwise relations between objects are categorical.
                 This problem can be viewed as clustering the vertices
                 of a graph whose edges are of different types (colors).
                 We introduce an objective function that ensures the
                 edges within each cluster have, as much as possible,
                 the same color. We show that the problem is NP-hard and
                 propose a randomized algorithm with approximation
                 guarantee proportional to the maximum degree of the
                 input graph. The algorithm iteratively picks a random
                 edge as a pivot, builds a cluster around it, and
                 removes the cluster from the graph. Although being
                 fast, easy to implement, and parameter-free, this
                 algorithm tends to produce a relatively large number of
                 clusters. To overcome this issue we introduce a variant
                 algorithm, which modifies how the pivot is chosen and
                 how the cluster is built around the pivot. Finally, to
                 address the case where a fixed number of output
                 clusters is required, we devise a third algorithm that
                 directly optimizes the objective function based on the
                 alternating-minimization paradigm. We also extend our
                 objective function to handle cases where objects'
                 relations are described by multiple labels. We modify
                 our randomized approximation algorithm to optimize such
                 an extended objective function and show that its
                 approximation guarantee remains proportional to the
                 maximum degree of the graph. We test our algorithms on
                 synthetic and real data from the domains of social
                 media, protein-interaction networks, and bibliometrics.
                 Results reveal that our algorithms outperform a
                 baseline algorithm both in the task of reconstructing a
                 ground-truth clustering and in terms of
                 objective-function value.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "34",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
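
%%% The first randomized algorithm is described precisely enough to
%%% sketch: pick a random edge as pivot, grow a cluster of nodes
%%% attached to both endpoints by the pivot's color, remove it, repeat.
%%% Details such as the edge-dict representation are editorial:
%%%
%%%   import random
%%%
%%%   def chromatic_pivot(edge_colors, seed=0):
%%%       # edge_colors: dict (u, v) -> color, with u < v.
%%%       rng = random.Random(seed)
%%%       def col(a, b):
%%%           return edge_colors.get((a, b) if a < b else (b, a))
%%%       remaining = {x for e in edge_colors for x in e}
%%%       order = list(edge_colors)
%%%       rng.shuffle(order)                # random pivot order
%%%       clusters = []
%%%       for (u, v) in order:
%%%           if u not in remaining or v not in remaining:
%%%               continue                  # pivot already clustered
%%%           c = col(u, v)
%%%           cluster = {u, v} | {w for w in remaining
%%%                               if w not in (u, v)
%%%                               and col(u, w) == c
%%%                               and col(v, w) == c}
%%%           clusters.append((cluster, c))
%%%           remaining -= cluster
%%%       clusters.extend(({w}, None) for w in remaining)
%%%       return clusters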

@Article{Wang:2015:LSC,
  author =       "Hua Wang and Feiping Nie and Heng Huang",
  title =        "Large-Scale Cross-Language {Web} Page Classification
                 via Dual Knowledge Transfer Using Fast Nonnegative
                 Matrix Trifactorization",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2710021",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "With the rapid growth of modern technologies, Internet
                 has reached almost every corner of the world. As a
                 result, it becomes more and more important to manage
                 and mine information contained in Web pages in
                 different languages. Traditional supervised learning
                 methods usually require a large amount of training data
                 to obtain accurate and robust classification models.
                 However, labeled Web pages did not increase as fast as
                 the growth of Internet. The lack of sufficient training
                 Web pages in many languages, especially for those in
                 uncommonly used languages, makes it a challenge for
                 traditional classification algorithms to achieve
                 satisfactory performance. To address this, we observe
                 that Web pages on the same topic in different
                 languages usually share some common semantic patterns,
                 though in different representation forms. In addition,
                 we observe that the associations between word clusters
                 and Web page classes are another reliable carrier for
                 transferring knowledge across languages. Based on
                 these observations, in this article we
                 propose a novel joint nonnegative matrix
                 trifactorization (NMTF) based Dual Knowledge Transfer
                 (DKT) approach for cross-language Web page
                 classification. Our approach transfers knowledge from
                 the auxiliary language, in which abundant labeled Web
                 pages are available, to the target languages, in which
                 we want to classify Web pages, through two different
                 paths: word cluster approximation and the associations
                 between word clusters and Web page classes. With the
                 reinforcement between these two different knowledge
                 transfer paths, our approach can achieve better
                 classification accuracy. To deal with large-scale
                 real-world data, we further develop the proposed DKT
                 approach by constraining the factor matrices of NMTF
                 to be cluster indicator matrices. Due to the nature of
                 cluster indicator matrices, we can decouple the
                 proposed optimization objective; the resulting
                 subproblems are much smaller and involve far fewer
                 matrix multiplications, which makes our new approach
                 much more computationally efficient. We evaluate the
                 proposed approach in extensive experiments on a
                 real-world cross-language Web page data set. Promising
                 results demonstrate the effectiveness of our approach
                 and are consistent with our theoretical analyses.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zhou:2015:SIB,
  author =       "Yang Zhou and Ling Liu",
  title =        "Social Influence Based Clustering and Optimization
                 over Heterogeneous Information Networks",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2717314",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Social influence analysis has shown great potential
                 for strategic marketing decision. It is well known that
                 people influence one another based on both their social
                 connections and the social activities that they have
                 engaged in the past. In this article, we develop an
                 innovative and high-performance social influence based
                 graph clustering framework with four unique features.
                 First, we explicitly distinguish social connection
                 based influence (self-influence) and social activity
                 based influence (co-influence). We compute the
                 self-influence similarity between two members based on
                 their social connections within a single collaboration
                 network, and compute the co-influence similarity by
                 taking into account not only the set of activities in
                 which people participate but also the semantic
                 association
                 between these activities. Second, we define the concept
                 of influence-based similarity by introducing a unified
                 influence-based similarity matrix that employs an
                 iterative weight update method to integrate
                 self-influence and co-influence similarities. Third, we
                 design a dynamic learning algorithm, called
                 SI-Cluster, for social influence based graph
                 clustering. It
                 iteratively partitions a large social collaboration
                 network into K clusters based on both the social
                 network itself and the multiple associated activity
                 information networks, each representing a category of
                 activities in which people have engaged. To make the
                 SI-Cluster algorithm converge quickly, we transform a
                 sophisticated nonlinear fractional programming problem
                 over multiple weights into a straightforward nonlinear
                 parametric programming problem of a single variable.
                 Finally, we develop an optimization technique
                 of diagonalizable-matrix approximation to speed up the
                 computation of self-influence similarity and
                 co-influence similarities. Our SI-Cluster-Opt
                 significantly improves the efficiency of SI-Cluster on
                 large graphs while maintaining high quality of
                 clustering results. Extensive experimental evaluation
                 on three real-world graphs shows that, compared to
                 existing representative graph clustering algorithms,
                 our SI-Cluster-Opt approach not only achieves a very
                 good balance between self-influence and co-influence
                 similarities but also scales extremely well for
                 clustering large graphs in terms of time complexity
                 while meeting the guarantee of high density, low
                 entropy and low Davies--Bouldin Index.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Papalexakis:2015:PSP,
  author =       "Evangelos E. Papalexakis and Christos Faloutsos and
                 Nicholas D. Sidiropoulos",
  title =        "{ParCube}: Sparse Parallelizable {CANDECOMP--PARAFAC}
                 Tensor Decomposition",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2729980",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How can we efficiently decompose a tensor into sparse
                 factors, when the data do not fit in memory? Tensor
                 decompositions have gained steadily increasing
                 popularity in data-mining applications; however, the
                 current state-of-the-art decomposition algorithms
                 operate in main memory and do not scale to truly large
                 datasets. In this work, we propose ParCube, a new and
                 highly parallelizable method for speeding up tensor
                 decompositions that is well suited to produce sparse
                 approximations. Experiments with even moderately large
                 data indicate over 90\% sparser outputs and 14 times
                 faster execution, with approximation error close to the
                 current state of the art irrespective of computation
                 and memory requirements. We provide theoretical
                 guarantees for the algorithm's correctness and we
                 experimentally validate our claims through extensive
                 experiments, including four different real world
                 datasets (Enron, Lbnl, Facebook and Nell),
                 demonstrating its effectiveness for data-mining
                 practitioners. In particular, we are the first to
                 analyze the very large Nell dataset using a sparse
                 tensor decomposition, demonstrating that ParCube
                 enables us to handle effectively and efficiently very
                 large datasets. Finally, we make our highly scalable
                 parallel implementation publicly available, enabling
                 reproducibility of our work.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
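
%%% The abstract above hinges on sampling the tensor down to something
%%% that fits in memory. A small NumPy sketch of that biased-sampling
%%% step follows: indices along each mode are kept with probability
%%% proportional to their marginal mass. Real ParCube works on sparse
%%% tensors and merges several such samples; this dense, single-sample
%%% version is an assumption-laden illustration only.
%%%
%%%   import numpy as np
%%%
%%%   def biased_sample_indices(X, mode, fraction, rng):
%%%       # Keep index i of `mode` with probability proportional to the
%%%       # total mass of the corresponding slice.
%%%       axes = tuple(a for a in range(X.ndim) if a != mode)
%%%       w = np.abs(X).sum(axis=axes)
%%%       k = max(1, int(fraction * X.shape[mode]))
%%%       return rng.choice(X.shape[mode], size=k, replace=False, p=w / w.sum())
%%%
%%%   rng = np.random.default_rng(0)
%%%   X = rng.random((30, 40, 50))                 # stand-in for a big tensor
%%%   idx = [biased_sample_indices(X, m, 0.2, rng) for m in range(X.ndim)]
%%%   Xs = X[np.ix_(*idx)]  # small subtensor: feed to any in-memory CP solver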

@Article{Ahmed:2015:AMC,
  author =       "Rezwan Ahmed and George Karypis",
  title =        "Algorithms for Mining the Coevolving Relational Motifs
                 in Dynamic Networks",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2733380",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Computational methods and tools that can efficiently
                 and effectively analyze the temporal changes in dynamic
                 complex relational networks enable us to gain
                 significant insights regarding the entity relations and
                 their evolution. This article introduces a new class of
                 dynamic graph patterns, referred to as coevolving
                 relational motifs (CRMs), which are designed to
                 identify recurring sets of entities whose relations
                 change in a consistent way over time. CRMs can provide
                 evidence of the existence of possibly unknown
                 coordination mechanisms by identifying the relational
                 motifs that evolve in a similar and highly conserved
                 fashion. We developed an algorithm to efficiently
                 analyze the frequent relational changes between the
                 entities of the dynamic networks and capture all
                 frequent coevolutions as CRMs. Our algorithm follows a
                 depth-first exploration of the frequent CRM lattice and
                 incorporates canonical labeling for redundancy
                 elimination. Experimental results based on multiple
                 real world dynamic networks show that the method is
                 able to efficiently identify CRMs. In addition, a
                 qualitative analysis of the results shows that the
                 discovered patterns can be used as features to
                 characterize the dynamic network.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Campello:2015:HDE,
  author =       "Ricardo J. G. B. Campello and Davoud Moulavi and
                 Arthur Zimek and J{\"o}rg Sander",
  title =        "Hierarchical Density Estimates for Data Clustering,
                 Visualization, and Outlier Detection",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2733381",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "An integrated framework for density-based cluster
                 analysis, outlier detection, and data visualization is
                 introduced in this article. The main module consists of
                 an algorithm to compute hierarchical estimates of the
                 level sets of a density, following Hartigan's classic
                 model of density-contour clusters and trees. Such an
                 algorithm generalizes and improves existing
                 density-based clustering techniques with respect to
                 different aspects. As a result, it provides a complete
                 clustering hierarchy composed of all possible
                 density-based clusters following the nonparametric
                 model adopted, for an infinite range of density
                 thresholds. The resulting hierarchy can be easily
                 processed so as to provide multiple ways for data
                 visualization and exploration. It can also be further
                 postprocessed so that: (i) a normalized score of
                 ``outlierness'' can be assigned to each data object,
                 which unifies both the global and local perspectives of
                 outliers into a single definition; and (ii) a ``flat''
                 (i.e., nonhierarchical) clustering solution composed of
                 clusters extracted from local cuts through the cluster
                 tree (possibly corresponding to different density
                 thresholds) can be obtained, either in an unsupervised
                 or in a semisupervised way. In the unsupervised
                 scenario, the algorithm corresponding to this
                 postprocessing module provides a global, optimal
                 solution to the formal problem of maximizing the
                 overall stability of the extracted clusters. If
                 partially labeled objects or instance-level constraints
                 are provided by the user, the algorithm can solve the
                 problem by considering both constraint
                 violations/satisfactions and cluster stability
                 criteria. An asymptotic complexity analysis, both in
                 terms of running time and memory space, is described.
                 Experiments are reported that involve a variety of
                 synthetic and real datasets, including comparisons with
                 state-of-the-art, density-based clustering and (global
                 and local) outlier detection methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Berardi:2015:UTR,
  author =       "Giacomo Berardi and Andrea Esuli and Fabrizio
                 Sebastiani",
  title =        "Utility-Theoretic Ranking for Semiautomated Text
                 Classification",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742548",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Semiautomated Text Classification (SATC) may be
                 defined as the task of ranking a set D of automatically
                 labelled textual documents in such a way that, if a
                 human annotator validates (i.e., inspects and corrects
                 where appropriate) the documents in a top-ranked
                 portion of D with the goal of increasing the overall
                 labelling accuracy of D, the expected increase is
                 maximized. An obvious SATC strategy is to rank D so
                 that the documents that the classifier has labelled
                 with the lowest confidence are top ranked. In this
                 work, we show that this strategy is suboptimal. We
                 develop new utility-theoretic ranking methods based on
                 the notion of validation gain, defined as the
                 improvement in classification effectiveness that would
                 derive by validating a given automatically labelled
                 document. We also propose a new effectiveness measure
                 for SATC-oriented ranking methods, based on the
                 expected reduction in classification error brought
                 about by partially validating a list generated by a
                 given ranking method. We report the results of
                 experiments showing that, with respect to the baseline
                 method mentioned earlier, and according to the proposed
                 measure, our utility-theoretic ranking methods can
                 achieve substantially higher expected reductions in
                 classification error.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yu:2015:DIP,
  author =       "Zhiwen Yu and Zhu Wang and Huilei He and Jilei Tian
                 and Xinjiang Lu and Bin Guo",
  title =        "Discovering Information Propagation Patterns in
                 Microblogging Services",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742801",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "During the last decade, microblog has become an
                 important social networking service with billions of
                 users all over the world, acting as a novel and
                 efficient platform for the creation and dissemination
                 of real-time information. Modeling and revealing the
                 information propagation patterns in microblogging
                 services can not only lead to a more accurate
                 understanding of user behaviors and provide insights
                 into the underlying sociology, but also enable useful
                 applications such as trending prediction,
                 recommendation and filtering, spam detection and viral
                 marketing. In this article, we aim to reveal the
                 information propagation patterns in Sina Weibo, the
                 biggest microblogging service in China. First, the
                 cascade of each message is represented as a tree based
                 on its retweeting process. Afterwards, we divide the
                 information propagation pattern into two levels, that
                 is, the macro level and the micro level. On one hand,
                 the macro propagation patterns refer to general
                 propagation modes that are extracted by grouping
                 propagation trees based on hierarchical clustering. On
                 the other hand, the micro propagation patterns are
                 frequent information flow patterns that are discovered
                 using tree-based mining techniques. Experimental
                 results show that several interesting patterns are
                 extracted, such as popular message propagation,
                 artificial propagation, and typical information flows
                 between different types of users.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zhang:2015:SMB,
  author =       "Xianchao Zhang and Xiaotong Zhang and Han Liu",
  title =        "Smart Multitask {Bregman} Clustering and Multitask
                 Kernel Clustering",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2747879",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Traditional clustering algorithms deal with a single
                 clustering task on a single dataset. However, there are
                 many related tasks in the real world, which motivates
                 multitask clustering. Recently some multitask
                 clustering algorithms have been proposed, and among
                 them multitask Bregman clustering (MBC) is a very
                 applicable method. MBC alternatively updates clusters
                 and learns relationships between clusters of different
                 tasks, and the two phases boost each other. However,
                 the boosting does not always have positive effects on
                 the clustering performance; it may also cause negative
                 effects. Another issue of MBC is that it cannot deal
                 with nonlinearly separable data. In this
                 article, we show that in MBC, the process of using
                 cluster relationship to boost the cluster updating
                 phase may cause negative effects, that is, cluster
                 centroids may be skewed under some conditions. We
                 propose a smart multitask Bregman clustering (S-MBC)
                 algorithm which can identify the negative effects of
                 the boosting and avoid the negative effects if they
                 occur. We then propose a multitask kernel clustering
                 (MKC) framework for nonlinearly separable data by
                 using a framework similar to MBC in the kernel space.
                 We also
                 propose a specific optimization method, which is quite
                 different from that of MBC, to implement the MKC
                 framework. Since MKC can also cause negative effects
                 like MBC, we further extend the framework of MKC to a
                 smart multitask kernel clustering (S-MKC) framework in
                 a similar way that S-MBC is extended from MBC. We
                 conduct experiments on 10 real world multitask
                 clustering datasets to evaluate the performance of
                 S-MBC and S-MKC. The results on clustering accuracy
                 show that: (1) compared with the original MBC
                 algorithm, S-MBC and S-MKC perform much better; (2)
                 compared
                 with the convex discriminative multitask relationship
                 clustering (DMTRC) algorithms DMTRC-L and DMTRC-R which
                 also avoid negative transfer, S-MBC and S-MKC perform
                 worse in the (ideal) case in which different tasks have
                 the same cluster number and the empirical label
                 marginal distribution in each task is evenly
                 distributed,
                 but better or comparable in other (more general) cases.
                 Moreover, S-MBC and S-MKC can work on the datasets in
                 which different tasks have different numbers of
                 clusters, violating the assumptions of DMTRC-L and
                 DMTRC-R. The results on efficiency show that S-MBC and
                 S-MKC consume more computational time than MBC and less
                 computational time than DMTRC-L and DMTRC-R. Overall,
                 S-MBC and S-MKC are competitive with state-of-the-art
                 multitask clustering algorithms in terms of accuracy,
                 efficiency, and applicability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wei:2015:MTP,
  author =       "Wei Wei and Kathleen M. Carley",
  title =        "Measuring Temporal Patterns in Dynamic Social
                 Networks",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2749465",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given social networks over time, how can we measure
                 network activities across different timesteps with a
                 limited number of metrics? We propose two classes of
                 dynamic metrics for assessing temporal evolution
                 patterns of agents in terms of persistency and
                 emergence. For each class of dynamic metrics, we
                 implement it using three different temporal aggregation
                 models ranging from the most commonly used Average
                 Aggregation Model to more complex models such as
                 the Exponential Aggregation Model. We argue that the
                 problem of measuring temporal patterns can be
                 formulated using the Recency and Primacy effects,
                 concepts used to characterize human cognitive
                 processes.
                 Experimental results show that the way metrics model
                 Recency--Primacy effect is closely related to their
                 abilities to measure temporal patterns. Furthermore,
                 our results indicate that future network agent
                 activities can be predicted based on history
                 information using dynamic metrics. By conducting
                 multiple experiments, we are also able to find an
                 optimal length of history information that is most
                 relevant to future activities. This optimal length is
                 highly consistent within a dataset and can be used as
                 an intrinsic metric to evaluate a dynamic social
                 network.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
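
%%% The aggregation models named in the abstract above are easy to
%%% contrast in code. The Python sketch below scores the same
%%% per-timestep measurements under a uniform average and under an
%%% exponentially decaying weighting that emphasizes recency; the decay
%%% form is an assumed illustration, not necessarily the paper's exact
%%% Exponential Aggregation Model.
%%%
%%%   def average_aggregation(values):
%%%       return sum(values) / len(values)
%%%
%%%   def exponential_aggregation(values, alpha=0.5):
%%%       # Weight timestep t by (1 - alpha)^(T - 1 - t): newer counts more.
%%%       T = len(values)
%%%       weights = [(1 - alpha) ** (T - 1 - t) for t in range(T)]
%%%       return sum(w * v for w, v in zip(weights, values)) / sum(weights)
%%%
%%%   activity = [3, 4, 4, 7, 9]       # e.g. an agent's degree per snapshot
%%%   print(average_aggregation(activity))      # 5.4
%%%   print(exponential_aggregation(activity))  # ~7.3, stresses recent rise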

@Article{Liu:2015:RAT,
  author =       "Siyuan Liu and Qiang Qu and Shuhui Wang",
  title =        "Rationality Analytics from Trajectories",
  journal =      j-TKDD,
  volume =       "10",
  number =       "1",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2735634",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jul 28 17:19:31 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The availability of trajectories tracking the
                 geographical locations of people as a function of time
                 offers an opportunity to study human behaviors. In this
                 article, we study rationality from the perspective of
                 user decision on visiting a point of interest (POI)
                 which is represented as a trajectory. However, the
                 analysis of rationality is challenged by a number of
                 issues: for example, how to model a trajectory in
                 terms of complex user decision processes, and how to
                 detect hidden factors that have a significant impact
                 on rational decision making? In this study, we propose
                 the Rationality Analysis Model (RAM) to analyze
                 rationality
                 from trajectories in terms of a set of impact factors.
                 In order to automatically identify hidden factors, we
                 propose a method, Collective Hidden Factor Retrieval
                 (CHFR), which can also be generalized to parse multiple
                 trajectories at the same time or parse individual
                 trajectories of different time periods. Extensive
                 experimental study is conducted on three large-scale
                 real-life datasets (i.e., taxi trajectories, user
                 shopping trajectories, and visiting trajectories in a
                 theme park). The results show that the proposed methods
                 are efficient, effective, and scalable. We also deploy
                 a system in a large theme park to conduct a field
                 study. Interesting findings and user feedback of the
                 field study are provided to support other applications
                 in user behavior mining and analysis, such as business
                 intelligence and user management for marketing
                 purposes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Jia:2015:SGR,
  author =       "Adele Lu Jia and Siqi Shen and Ruud {Van De Bovenkamp}
                 and Alexandru Iosup and Fernando Kuipers and Dick H. J.
                 Epema",
  title =        "Socializing by Gaming: Revealing Social Relationships
                 in Multiplayer Online Games",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2736698",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multiplayer Online Games (MOGs) like Defense of the
                 Ancients and StarCraft II have attracted hundreds of
                 millions of users who communicate, interact, and
                 socialize with each other through gaming. In MOGs, rich
                 social relationships emerge and can be used to improve
                 gaming services such as match recommendation and game
                 population retention, which are important for the user
                 experience and the commercial value of the companies
                 who run these MOGs. In this work, we focus on
                 understanding social relationships in MOGs. We propose
                 a graph model that is able to capture social
                 relationships of a variety of types and strengths. We
                 apply our model to real-world data collected from three
                 MOGs that contain in total over ten years of behavioral
                 history for millions of players and matches. We compare
                 social relationships in MOGs across different game
                 genres and with regular online social networks like
                 Facebook. Taking match recommendation as an example
                 application of our model, we propose SAMRA, a Socially
                 Aware Match Recommendation Algorithm that takes social
                 relationships into account. We show that our model not
                 only improves the precision of traditional link
                 prediction approaches, but also potentially helps
                 players enjoy games to a higher extent.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Papagelis:2015:RSG,
  author =       "Manos Papagelis",
  title =        "Refining Social Graph Connectivity via Shortcut Edge
                 Addition",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2757281",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Small changes on the structure of a graph can have a
                 dramatic effect on its connectivity. While traditional
                 graph theory focuses on well-defined properties of
                 graph connectivity, such as biconnectivity, in the
                 context of a social graph, connectivity is typically
                 manifested by the graph's ability to
                 carry on social processes. In this paper, we consider
                 the problem of adding a small set of nonexisting edges
                 (shortcuts) in a social graph with the main objective
                 of minimizing its characteristic path length. This
                 property determines the average distance between pairs
                 of vertices and essentially controls how broadly
                 information can propagate through a network. We
                 formally define the problem of interest, characterize
                 its hardness and propose a novel method, path
                 screening, which quickly identifies important shortcuts
                 to guide the augmentation of the graph. We devise a
                 sampling-based variant of our method that can scale up
                 the computation in larger graphs. The claims of our
                 methods are formally validated. Through experiments on
                 real and synthetic data, we demonstrate that our
                 methods are many times faster than standard
                 approaches, that their accuracy outperforms sensible
                 baselines, and that they can ease the spread of
                 information in a network under a wide range of
                 conditions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
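
%%% The objective minimized above --- characteristic path length --- is
%%% simple to state in code. Below is a brute-force Python baseline
%%% that scores one candidate shortcut by recomputing the average BFS
%%% distance after adding it; this is exactly the expensive computation
%%% that path screening is designed to avoid, sketched here under the
%%% assumption of a connected, unweighted graph.
%%%
%%%   from collections import deque
%%%
%%%   def char_path_length(adj):
%%%       # Average shortest-path distance over all reachable vertex pairs.
%%%       total = pairs = 0
%%%       for src in adj:
%%%           dist = {src: 0}
%%%           q = deque([src])
%%%           while q:
%%%               u = q.popleft()
%%%               for v in adj[u]:
%%%                   if v not in dist:
%%%                       dist[v] = dist[u] + 1
%%%                       q.append(v)
%%%           total += sum(dist.values())
%%%           pairs += len(dist) - 1
%%%       return total / pairs
%%%
%%%   def shortcut_gain(adj, u, v):
%%%       # Drop in characteristic path length if shortcut (u, v) is added.
%%%       before = char_path_length(adj)
%%%       adj[u].add(v); adj[v].add(u)
%%%       after = char_path_length(adj)
%%%       adj[u].remove(v); adj[v].remove(u)
%%%       return before - after
%%%
%%%   path = {0: {1}, 1: {0, 2}, 2: {1, 3}, 3: {2, 4}, 4: {3}}
%%%   print(shortcut_gain(path, 0, 4))  # closing the path into a cycle: 0.5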

@Article{Hong:2015:CAR,
  author =       "Liang Hong and Lei Zou and Cheng Zeng and Luming Zhang
                 and Jian Wang and Jilei Tian",
  title =        "Context-Aware Recommendation Using Role-Based Trust
                 Network",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2751562",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recommender systems have been studied comprehensively
                 in both academic and industrial fields over the past
                 decade. As user interests can be affected by context at
                 any time and any place in mobile scenarios, rich
                 context information becomes more and more important for
                 personalized context-aware recommendations. Although
                 existing context-aware recommender systems can make
                 context-aware recommendations to some extent, they
                 suffer several inherent weaknesses: (1) Users'
                 context-aware interests are not modeled realistically,
                 which reduces the recommendation quality; (2) Current
                 context-aware recommender systems ignore trust
                 relations among users. Trust relations are actually
                 context-aware and associated with certain aspects
                 (i.e., categories of items) in mobile scenarios. In
                 this article, we define the term ``role'' to model
                 common
                 context-aware interests among a group of users. We
                 propose an efficient role mining algorithm to mine
                 roles from a ``user-context-behavior'' matrix, and a
                 role-based trust model to calculate context-aware trust
                 value between two users. During online recommendation,
                 given a user u in a context c, an efficient weighted
                 set similarity query (WSSQ) algorithm is designed to
                 build u's role-based trust network in context c.
                 Finally, we make recommendations to u based on u's
                 role-based trust network by considering both
                 context-aware roles and trust relations. Extensive
                 experiments demonstrate that our recommendation
                 approach outperforms the state-of-the-art methods in
                 both effectiveness and efficiency.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zhang:2015:OBF,
  author =       "Lei Zhang and Ping Luo and Linpeng Tang and Enhong
                 Chen and Qi Liu and Min Wang and Hui Xiong",
  title =        "Occupancy-Based Frequent Pattern Mining",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2753765",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Frequent pattern mining is an important data mining
                 problem with many broad applications. Most studies in
                 this field use support (frequency) to measure the
                 popularity of a pattern, namely the fraction of
                 transactions or sequences that include the pattern in a
                 data set. In this study, we introduce a new
                 interestingness measure, namely occupancy, to measure
                 the completeness
                 of a pattern in its supporting transactions or
                 sequences. This is motivated by some real-world pattern
                 recommendation applications in which an interesting
                 pattern should not only be frequent, but also occupy
                 a large portion of its supporting transactions or
                 sequences. With the definition of occupancy we call a
                 pattern dominant if its occupancy value is above a
                 user-specified threshold. Then, our task is to identify
                 the qualified patterns which are both dominant and
                 frequent. Also, we formulate the problem of mining
                 top-k qualified patterns, that is, finding k qualified
                 patterns with maximum values on a user-defined function
                 of support and occupancy, for example, weighted sum of
                 support and occupancy. The challenge to these tasks is
                 that the value of occupancy does not change
                 monotonically when more items are appended to a given
                 pattern. Therefore, we propose a general algorithm
                 called DOFRA (DOminant and FRequent pattern mining
                 Algorithm) for mining these qualified patterns, which
                 explores the upper bound properties on occupancy to
                 drastically reduce the search process. Finally, we show
                 the effectiveness of DOFRA in two real-world
                 applications and also demonstrate the efficiency of
                 DOFRA on several real and large synthetic datasets.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
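
%%% Support and occupancy, as contrasted in the abstract above, are
%%% straightforward to compute for a candidate itemset. The Python
%%% sketch below uses one natural reading of occupancy --- the average
%%% fraction of each supporting transaction that the pattern covers ---
%%% which may differ in detail from the paper's definition.
%%%
%%%   def support_and_occupancy(pattern, transactions):
%%%       pattern = set(pattern)
%%%       supporting = [set(t) for t in transactions if pattern <= set(t)]
%%%       if not supporting:
%%%           return 0.0, 0.0
%%%       support = len(supporting) / len(transactions)
%%%       occupancy = (sum(len(pattern) / len(t) for t in supporting)
%%%                    / len(supporting))
%%%       return support, occupancy
%%%
%%%   txns = [{"a", "b"}, {"a", "b", "c"}, {"a", "b", "c", "d"}, {"c", "d"}]
%%%   # {a, b} is frequent (3/4 support) but only partially occupies its
%%%   # longer supporting transactions:
%%%   print(support_and_occupancy({"a", "b"}, txns))   # (0.75, ~0.72)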

@Article{Chen:2015:AAS,
  author =       "Hung-Hsuan Chen and C. Lee Giles",
  title =        "{ASCOS++}: an Asymmetric Similarity Measure for
                 Weighted Networks to Address the Problem of {SimRank}",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2776894",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we explore the relationships among
                 digital objects in terms of their similarity based on
                 vertex similarity measures. We argue that SimRank --- a
                 famous similarity measure --- and its variants, such
                 as P-Rank and SimRank++, fail to capture similar node
                 pairs in certain conditions, especially when two nodes
                 can only reach each other through paths of odd lengths.
                 We present new similarity measures ASCOS and ASCOS++ to
                 address the problem. ASCOS outputs a more complete
                 similarity score than SimRank and its variants.
                 ASCOS++ enriches ASCOS to include edge weight into the
                 measure, giving all edges and network weights an
                 opportunity to make their contribution. We show that
                 both ASCOS++ and ASCOS can be reformulated and applied
                 in a distributed environment for parallel computation.
                 Experimental results show that ASCOS++ reports a better
                 score than SimRank and several famous similarity
                 measures. Finally, we re-examine previous use cases of
                 SimRank, and explain appropriate and inappropriate use
                 cases. We suggest future SimRank users following the
                 rules proposed here before na{\"\i}vely applying it. We
                 also discuss the relationship between ASCOS++ and
                 PageRank.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
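
%%% The odd-length-path failure of SimRank mentioned above, and the
%%% fixed-point style of the proposed scores, can be illustrated with a
%%% NumPy sketch of the commonly cited ASCOS form s(i, j) = c * mean
%%% over i's neighbors k of s(k, j), with s(i, i) = 1. The unweighted
%%% iteration below is an illustration and omits the edge-weight
%%% extension that distinguishes ASCOS++.
%%%
%%%   import numpy as np
%%%
%%%   def ascos_like(A, c=0.9, iters=100):
%%%       deg = A.sum(axis=1, keepdims=True)
%%%       P = np.divide(A, deg, out=np.zeros_like(A), where=deg > 0)
%%%       S = np.eye(A.shape[0])
%%%       for _ in range(iters):
%%%           S = c * (P @ S)
%%%           np.fill_diagonal(S, 1.0)   # self-similarity pinned to 1
%%%       return S
%%%
%%%   # Two nodes joined by a single edge reach each other only through
%%%   # odd-length paths, so SimRank scores the pair 0; the ASCOS-style
%%%   # score converges to c instead.
%%%   A = np.array([[0., 1.], [1., 0.]])
%%%   print(ascos_like(A))   # off-diagonal entries -> 0.9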

@Article{Zafarani:2015:UIA,
  author =       "Reza Zafarani and Lei Tang and Huan Liu",
  title =        "User Identification Across Social Media",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2747880",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "People use various social media sites for different
                 purposes. The information on each site is often
                 partial. When sources of complementary information are
                 integrated, a better profile of a user can be built.
                 This profile can help improve online services such as
                 advertising across sites. To integrate these sources of
                 information, it is necessary to identify individuals
                 across social media sites. This paper aims to address
                 the cross-media user identification problem. We provide
                 evidence on the existence of a mapping among identities
                 of individuals across social media sites, study the
                 feasibility of finding this mapping, and illustrate and
                 develop means for finding this mapping. Our studies
                 show that effective approaches that exploit information
                 redundancies due to users' unique behavioral patterns
                 can be utilized to find such a mapping. This study
                 paves the way for analysis and mining across social
                 networking sites, and facilitates the creation of novel
                 online services across sites. In particular,
                 recommending friends and advertising across networks,
                 analyzing information diffusion across sites, and
                 studying specific user behavior such as user migration
                 across sites in social media are among the many areas
                 that can benefit from the results of this study.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Li:2015:RUC,
  author =       "Lei Li and Wei Peng and Saurabh Kataria and Tong Sun
                 and Tao Li",
  title =        "Recommending Users and Communities in Social Media",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2757282",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Social media has become increasingly prevalent in the
                 last few years, not only enabling people to connect
                 with each other by social links, but also providing
                 platforms for people to share information and interact
                 over diverse topics. Rich user-generated information,
                 for example, users' relationships and daily posts, is
                 often available on most social media service websites.
                 Given such information, a challenging problem is to
                 provide reasonable user and community recommendation
                 for a target user, and consequently, help the target
                 user engage in the daily discussions and activities
                 with his/her friends or like-minded people. In this
                 article, we propose a unified framework of recommending
                 users and communities that utilizes the information in
                 social media. Given a user's profile or a set of
                 keywords as input, our framework is capable of
                 recommending influential users and topic-cohesive
                 interactive communities that are most relevant to the
                 given user or keywords. With the proposed framework,
                 users can find other individuals or communities sharing
                 similar interests, and then have more interaction with
                 these users or within the communities. We present a
                 generative topic model to discover user-oriented and
                 community-oriented topics simultaneously, which enables
                 us to capture the exact topical interests of users, as
                 well as the focuses of communities. Extensive
                 experimental evaluation and case studies on a dataset
                 collected from Twitter demonstrate the effectiveness of
                 our proposed framework compared with other
                 probabilistic-topic-model-based recommendation
                 methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yu:2015:GGA,
  author =       "Rose Yu and Xinran He and Yan Liu",
  title =        "{GLAD}: Group Anomaly Detection in Social Media
                 Analysis",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2811268",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Traditional anomaly detection on social media mostly
                 focuses on individual point anomalies while anomalous
                 phenomena usually occur in groups. Therefore, it is
                 valuable to study the collective behavior of
                 individuals and detect group anomalies. Existing group
                 anomaly detection approaches rely on the assumption
                 that the groups are known, which can hardly be true in
                 real world social media applications. In this article,
                 we take a generative approach by proposing a
                 hierarchical Bayes model: Group Latent Anomaly
                 Detection (GLAD) model. GLAD takes both pairwise and
                 point-wise data as input, automatically infers the
                 groups and detects group anomalies simultaneously. To
                 account for the dynamic properties of the social media
                 data, we further generalize GLAD to its dynamic
                 extension d-GLAD. We conduct extensive experiments to
                 evaluate our models on both synthetic and real world
                 datasets. The empirical results demonstrate that our
                 approach is effective and robust in discovering latent
                 groups and detecting group anomalies.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chakrabarti:2015:BPL,
  author =       "Aniket Chakrabarti and Venu Satuluri and Atreya
                 Srivathsan and Srinivasan Parthasarathy",
  title =        "A {Bayesian} Perspective on Locality Sensitive Hashing
                 with Extensions for Kernel Methods",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2778990",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a collection of objects and an associated
                 similarity measure, the all-pairs similarity search
                 problem asks us to find all pairs of objects with
                 similarity greater than a certain user-specified
                 threshold. In order to reduce the number of candidates
                 to search, locality-sensitive hashing (LSH) based
                 indexing methods are very effective. However, most such
                 methods only use LSH for the first phase of similarity
                 search --- that is, efficient indexing for candidate
                 generation. In this article, we present BayesLSH, a
                 principled Bayesian algorithm for the subsequent phase
                 of similarity search --- performing candidate pruning
                 and similarity estimation using LSH. A simpler variant,
                 BayesLSH-Lite, which calculates similarities exactly,
                 is also presented. Our algorithms are able to quickly
                 prune away a large majority of the false positive
                 candidate pairs, leading to significant speedups over
                 baseline approaches. For BayesLSH, we also provide
                 probabilistic guarantees on the quality of the output,
                 both in terms of accuracy and recall. Finally, the
                 quality of BayesLSH's output can be easily tuned and
                 does not require any manual setting of the number of
                 hashes to use for similarity estimation, unlike
                 standard approaches. For two state-of-the-art candidate
                 generation algorithms, AllPairs and LSH, BayesLSH
                 enables significant speedups, typically in the range 2
                 $ \times $ --20 $ \times $ for a wide variety of
                 datasets. We also extend the BayesLSH algorithm for
                 kernel methods --- in which the similarity between two
                 data objects is defined by a kernel function. Since the
                 embedding of data points in the transformed kernel
                  space is unknown, algorithms such as AllPairs, which
                  rely on building an inverted index structure for fast
                  similarity search, do not work with kernel functions.
                 Exhaustive search across all possible pairs is also not
                 an option since the dataset can be huge and computing
                 the kernel values for each pair can be prohibitive. We
                  propose K-BayesLSH, an algorithm for the all-pairs
                  similarity search problem with kernel functions.
                  K-BayesLSH leverages a
                 recently proposed idea --- kernelized locality
                 sensitive hashing (KLSH) --- for hash bit computation
                 and candidate generation, and uses the aforementioned
                 BayesLSH idea for candidate pruning and similarity
                 estimation. We ran a broad spectrum of experiments on a
                 variety of datasets drawn from different domains and
                  with distinct kernels, and found a speedup of
                  $ 2 \times $--$ 7 \times $ over vanilla KLSH.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
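
%%% The early-pruning step described in the abstract above can be
%%% illustrated with a minimal sketch.  It assumes minhash signatures
%%% and a uniform Beta(1,1) prior over the unknown Jaccard similarity;
%%% the function and parameter names are illustrative, not the
%%% authors' implementation:
%%%
%%%     from scipy.stats import beta
%%%
%%%     def prune_or_estimate(sig_a, sig_b, threshold=0.7,
%%%                           prune_prob=0.05, batch=8):
%%%         # Compare hash values incrementally; with m matches out of
%%%         # n comparisons, the posterior over the similarity s is
%%%         # Beta(m + 1, n - m + 1).  Prune the candidate pair as soon
%%%         # as P(s >= threshold) falls below prune_prob.
%%%         matches = 0
%%%         for n in range(1, len(sig_a) + 1):
%%%             matches += int(sig_a[n - 1] == sig_b[n - 1])
%%%             if n % batch == 0:
%%%                 if beta.sf(threshold, matches + 1, n - matches + 1) < prune_prob:
%%%                     return None          # pruned: likely false positive
%%%         return matches / len(sig_a)      # similarity estimate for survivors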

@Article{Zhang:2015:DAV,
  author =       "Yao Zhang and B. Aditya Prakash",
  title =        "Data-Aware Vaccine Allocation Over Large Networks",
  journal =      j-TKDD,
  volume =       "10",
  number =       "2",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2803176",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Oct 26 17:19:18 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a graph, like a social/computer network or the
                 blogosphere, in which an infection (or meme or virus)
                  has been spreading for some time, how do we select the
                  k best nodes to immunize or quarantine immediately?
                  Most previous work on controlling propagation (say,
                  via immunization) has concentrated on developing
                  strategies for vaccination preemptively, before the
                  start of an epidemic. While very useful for providing
                  insights into which baseline policies can best control
                  an infection, such strategies may not be ideal for
                  making real-time decisions as the infection
                  progresses. In this
                 paper, we study how to immunize healthy nodes, in the
                 presence of already infected nodes. Efficient
                 algorithms for such a problem can help public-health
                 experts make more informed choices, tailoring their
                 decisions to the actual distribution of the epidemic on
                  the ground. First, we formulate the Data-Aware
                  Vaccination problem and prove that it is NP-hard and,
                  moreover, hard to approximate. Second, we propose
                  three effective polynomial-time heuristics, DAVA,
                  DAVA-prune, and DAVA-fast, of varying degrees of
                  efficiency and performance. Finally, we also
                 demonstrate the scalability and effectiveness of our
                 algorithms through extensive experiments on multiple
                 real networks including large epidemiology datasets
                  (containing millions of interactions). Our algorithms
                  show substantial gains over many other intuitive and
                  nontrivial competitors, saving up to ten times more
                  healthy nodes at the end.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Rowe:2016:MUD,
  author =       "Matthew Rowe",
  title =        "Mining User Development Signals for Online Community
                 Churner Detection",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "21:1--21:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2798730",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Churners are users who stop using a given service
                 after previously signing up. In the domain of
                 telecommunications and video games, churners represent
                  a loss of revenue, as a user who leaves will no
                  longer pay for the service. In the context of
                 online community platforms (e.g., community message
                  boards, social networking sites, question-answering
                 systems, etc.), the churning of a user can represent
                 different kinds of loss: of social capital, of
                 expertise, or of a vibrant individual who is a mediator
                 for interaction and communication. Detecting which
                 users are likely to churn from online communities,
                 therefore, enables community managers to offer
                  incentives to entice those users back, since retention
                  is less expensive than signing users up again. In this
                 article, we tackle the task of detecting churners on
                 four online community platforms by mining user
                 development signals. These signals explain how users
                 have evolved along different dimensions (i.e., social
                 and lexical) relative to their prior behaviour and the
                 community in which they have interacted. We present a
                 linear model, based upon elastic-net regularisation,
                 that uses extracted features from the signals to detect
                 churners. Our evaluation of this model against several
                  state-of-the-art baselines, including our own prior
                 work, empirically demonstrates the superior performance
                 that this approach achieves for several experimental
                 settings. This article presents a novel approach to
                 churn prediction that takes a different route from
                 existing approaches that are based on measuring static
                 social network properties of users (e.g., centrality,
                 in-degree, etc.).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
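
%%% The elastic-net-regularised linear model described above can be
%%% sketched with scikit-learn.  The synthetic feature matrix is a
%%% hypothetical stand-in for the paper's user-development signals
%%% (social and lexical deltas); this is an illustration under those
%%% assumptions, not the author's pipeline:
%%%
%%%     from sklearn.datasets import make_classification
%%%     from sklearn.linear_model import LogisticRegression
%%%     from sklearn.model_selection import train_test_split
%%%
%%%     # Stand-in for per-user development-signal features.
%%%     X, y = make_classification(n_samples=2000, n_features=20, random_state=0)
%%%     X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
%%%
%%%     # Elastic net mixes L1 and L2 penalties via l1_ratio.
%%%     clf = LogisticRegression(penalty="elasticnet", solver="saga",
%%%                              l1_ratio=0.5, max_iter=5000)
%%%     clf.fit(X_tr, y_tr)                        # y: 1 = churned, 0 = stayed
%%%     churn_prob = clf.predict_proba(X_te)[:, 1]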

@Article{Prat-Perez:2016:PTT,
  author =       "Arnau Prat-P{\'e}rez and David Dominguez-Sal and
                 Josep-M. Brunat and Josep-Lluis Larriba-Pey",
  title =        "Put Three and Three Together: Triangle-Driven
                 Community Detection",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "22:1--22:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2775108",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Community detection has arisen as one of the most
                 relevant topics in the field of graph data mining due
                 to its applications in many fields such as biology,
                 social networks, or network traffic analysis. Although
                 the existing metrics used to quantify the quality of a
                 community work well in general, under some
                  circumstances, they fail to correctly capture this
                  notion. The main reason is that these metrics consider
                  the internal community edges as a set but ignore how
                  these edges actually connect the vertices of the
                  community. We propose Weighted Community Clustering
                  (WCC), a new community metric that takes the triangle,
                  instead of the edge, as the minimal structural motif
                  indicating the presence of a strong relation in a
                 graph. We theoretically analyse WCC in depth and
                 formally prove, by means of a set of properties, that
                 the maximization of WCC guarantees communities with
                 cohesion and structure. In addition, we propose
                 Scalable Community Detection (SCD), a community
                 detection algorithm based on WCC, which is designed to
                  be fast and scalable on SMP machines. We show
                  experimentally, using real datasets, that WCC
                  correctly captures the concept of community in social
                  networks. Finally, using ground-truth data, we show
                  that SCD provides better quality than the best
                  state-of-the-art disjoint community detection
                  algorithms while running faster.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
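
%%% The triangle-as-motif intuition behind WCC can be made concrete
%%% with a simplified per-vertex score: the fraction of a member's
%%% triangles that close inside its community.  This is only a sketch
%%% in the spirit of the metric (the actual WCC has a more elaborate
%%% closed form); networkx and the example community are assumptions:
%%%
%%%     import networkx as nx
%%%
%%%     def triangle_cohesion(G, community):
%%%         # Average, over community members, of the fraction of each
%%%         # member's triangles that are closed within the community.
%%%         S = G.subgraph(community)
%%%         total = nx.triangles(G)      # triangle count per node in G
%%%         inside = nx.triangles(S)     # triangle count per node inside S
%%%         scores = [inside[v] / total[v] for v in community if total[v] > 0]
%%%         return sum(scores) / len(scores) if scores else 0.0
%%%
%%%     G = nx.karate_club_graph()
%%%     print(triangle_cohesion(G, [0, 1, 2, 3, 7, 13]))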

@Article{Guo:2016:MDM,
  author =       "Zhen Guo and Zhongfei (Mark) Zhang and Eric P. Xing
                 and Christos Faloutsos",
  title =        "Multimodal Data Mining in a Multimedia Database Based
                 on Structured Max Margin Learning",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "23:1--23:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742549",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Mining knowledge from a multimedia database has
                 received increasing attentions recently since huge
                 repositories are made available by the development of
                 the Internet. In this article, we exploit the relations
                 among different modalities in a multimedia database and
                  present a framework for the general multimodal data
                  mining problem, in which image annotation and image
                  retrieval are considered as special cases.
                  Specifically, the
                 multimodal data mining problem can be formulated as a
                 structured prediction problem where we learn the
                 mapping from an input to the structured and
                 interdependent output variables. In addition, in order
                 to reduce the demanding computation, we propose a new
                  max margin structure learning approach, called the
                  Enhanced Max Margin Learning (EMML) framework, which
                  is much more efficient, with a much faster convergence
                  rate, than existing max margin learning methods, as verified
                 through empirical evaluations. Furthermore, we apply
                  the EMML framework to develop an effective and efficient
                 solution to the multimodal data mining problem that is
                 highly scalable in the sense that the query response
                 time is independent of the database scale. The EMML
                 framework allows an efficient multimodal data mining
                  query in a very large-scale multimedia database, and
                  outperforms many existing multimodal data mining
                  methods in the literature that do not scale up at all.
                  A performance comparison with a state-of-the-art
                  multimodal data mining method is reported on
                  real-world image databases.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Myers:2016:DAK,
  author =       "Risa B. Myers and John C. Frenzel and Joseph R. Ruiz
                 and Christopher M. Jermaine",
  title =        "Do Anesthesiologists Know What They Are Doing?
                 {Mining} a Surgical Time-Series Database to Correlate
                 Expert Assessment with Outcomes",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "24:1--24:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2822897",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Anesthesiologists are taught to carefully manage
                 patient vital signs during surgery. Unfortunately,
                 there is little empirical evidence that vital sign
                 management, as currently practiced, is correlated with
                 patient outcomes. We seek to validate or repudiate
                 current clinical practice and determine whether or not
                  clinician evaluation of surgical vital signs correlates
                 with outcomes. Using a database of over 90,000 cases,
                 we attempt to determine whether those cases that
                 anesthesiologists would subjectively decide are ``low
                 quality'' are more likely to result in negative
                 outcomes. The problem reduces to one of
                 multi-dimensional time-series classification. Our
                 approach is to have a set of expert anesthesiologists
                 independently label a small number of training cases,
                 from which we build classifiers and label all 90,000
                 cases. We then use the labeling to search for
                 correlation with outcomes and compare the prevalence of
                 important 30-day outcomes between providers. To mimic
                 the providers' quality labels, we consider several
                 standard classification methods, such as dynamic time
                 warping in conjunction with a kNN classifier, as well
                 as complexity invariant distance, and a regression
                 based upon the feature extraction methods outlined by
                 Mao et al. 2012 (using features such as time-series
                 mean, standard deviation, skew, etc.). We also propose
                 a new feature selection mechanism that learns a hidden
                 Markov model to segment the time series; the fraction
                 of time that each series spends in each state is used
                 to label the series using a regression-based
                  classifier. In the end, we obtain strong empirical
                 evidence that current best practice is correlated with
                 reduced negative patient outcomes. We also learn that
                 all of the experts were able to significantly separate
                 cases by outcome, with higher prevalence of negative
                 30-day outcomes in the cases labeled as ``low quality''
                 for almost all of the outcomes investigated.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
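
%%% The HMM-based feature construction mentioned above (the fraction
%%% of time a series spends in each hidden state) can be sketched as
%%% follows.  The hmmlearn package and the synthetic two-channel
%%% "vitals" series are assumptions made for a runnable example:
%%%
%%%     import numpy as np
%%%     from hmmlearn.hmm import GaussianHMM
%%%
%%%     rng = np.random.default_rng(0)
%%%     series = rng.normal(size=(500, 2))   # stand-in for one case's vital signs
%%%
%%%     model = GaussianHMM(n_components=3, n_iter=50, random_state=0)
%%%     model.fit(series)
%%%     states = model.predict(series)       # hidden state per time step
%%%
%%%     # State-occupancy fractions become the case's feature vector,
%%%     # which then feeds a regression-based classifier.
%%%     features = np.bincount(states, minlength=3) / len(states)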

@Article{Namata:2016:CGI,
  author =       "Galileo Mark Namata and Ben London and Lise Getoor",
  title =        "Collective Graph Identification",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "25:1--25:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818378",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data describing networks---such as communication
                 networks, transaction networks, disease transmission
                 networks, collaboration networks, etc.---are becoming
                 increasingly available. While observational data can be
                 useful, it often only hints at the actual underlying
                 process that governs interactions and attributes. For
                 example, an email communication network provides
                 insight into its users and their relationships, but is
                 not the same as the ``real'' underlying social network.
                 In this article, we introduce the problem of graph
                 identification, i.e., discovering the latent graph
                 structure underlying an observed network. We cast the
                 problem as a probabilistic inference task, in which we
                 must infer the nodes, edges, and node labels of a
                 hidden graph, based on evidence. This entails solving
                 several canonical problems in network analysis: entity
                 resolution (determining when two observations
                 correspond to the same entity), link prediction
                 (inferring the existence of links), and node labeling
                 (inferring hidden attributes). While each of these
                 subproblems has been well studied in isolation, here we
                 consider them as a single, collective task. We present
                 a simple, yet novel, approach to address all three
                 subproblems simultaneously. Our approach, which we
                 refer to as C$^3$, consists of a collection of Coupled
                 Collective Classifiers that are applied iteratively to
                 propagate inferred information among the subproblems.
                 We consider variants of C$^3$ using different learning
                 and inference techniques and empirically demonstrate
                 that C$^3$ is superior, both in terms of predictive
                 accuracy and running time, to state-of-the-art
                 probabilistic approaches on four real problems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "25",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Subbian:2016:MIU,
  author =       "Karthik Subbian and Charu Aggarwal and Jaideep
                 Srivastava",
  title =        "Mining Influencers Using Information Flows in Social
                 Streams",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "26:1--26:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2815625",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The problem of discovering information flow trends in
                  social networks has become increasingly relevant due
                  to the growing amount of content in online social
                  networks and its relevance as a tool for research into
                  content trend analysis in such networks. An
                 important part of this analysis is to determine the key
                 patterns of flow in the underlying network. Almost all
                 the work in this area has focused on fixed models of
                 the network structure, and edge-based transmission
                 between nodes. In this article, we propose a fully
                 content-centered model of flow analysis in networks, in
                 which the analysis is based on actual content
                 transmissions in the underlying social stream, rather
                 than a static model of transmission on the edges.
                 First, we introduce the problem of influence analysis
                 in the context of information flow in networks. We then
                 propose a novel algorithm InFlowMine to discover the
                 information flow patterns in the network and
                 demonstrate the effectiveness of the discovered
                 information flows using an influence mining
                 application. This application illustrates the
                 flexibility and effectiveness of our information flow
                 model to find topic- or network-specific influencers,
                  or their combinations. We empirically show that our
                  information flow mining approach is more effective and
                  efficient than existing methods on a number of
                  different measures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "26",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Angiulli:2016:TGU,
  author =       "Fabrizio Angiulli and Fabio Fassetti",
  title =        "Toward Generalizing the Unification with Statistical
                 Outliers: The Gradient Outlier Factor Measure",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "27:1--27:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2829956",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this work, we introduce a novel definition of
                 outlier, namely the Gradient Outlier Factor (or GOF),
                  with the aim of providing a definition that unifies with
                 the statistical one on some standard distributions but
                 has a different behavior in the presence of mixture
                 distributions. Intuitively, the GOF score measures the
                  probability of staying in the neighborhood of a certain
                 object. It is directly proportional to the density and
                 inversely proportional to the variation of the density.
                 We derive formal properties under which the GOF
                 definition unifies the statistical outlier definition
                 and show that the unification holds for some standard
                 distributions, while the GOF is able to capture tails
                  in the presence of different distributions, even if
                  their densities differ appreciably. Moreover, we provide a
                 probabilistic interpretation of the GOF score, by means
                 of the notion of density of the data density.
                 Experimental results confirm that there are scenarios
                 in which the novel definition can be profitably
                  employed. To the best of our knowledge, except for the
                  distance-based outlier definition, no other data
                  mining outlier definition has such a clearly
                  established relationship with statistical outliers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "27",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Koutra:2016:DPM,
  author =       "Danai Koutra and Neil Shah and Joshua T. Vogelstein
                 and Brian Gallagher and Christos Faloutsos",
  title =        "{DeltaCon}: Principled Massive-Graph Similarity
                 Function with Attribution",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "28:1--28:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2824443",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How much has a network changed since yesterday? How
                 different is the wiring of Bob's brain (a left-handed
                 male) and Alice's brain (a right-handed female), and
                 how is it different? Graph similarity with given node
                 correspondence, i.e., the detection of changes in the
                 connectivity of graphs, arises in numerous settings. In
                 this work, we formally state the axioms and desired
                 properties of the graph similarity functions, and
                 evaluate when state-of-the-art methods fail to detect
                  crucial connectivity changes in graphs. We propose
                  DeltaCon, a principled, intuitive, and scalable
                 algorithm that assesses the similarity between two
                 graphs on the same nodes (e.g., employees of a company,
                 customers of a mobile carrier). In conjunction, we
                 propose DeltaCon-Attr, a related approach that enables
                 attribution of change or dissimilarity to responsible
                 nodes and edges. Experiments on various synthetic and
                 real graphs showcase the advantages of our method over
                 existing similarity measures. Finally, we employ
                  DeltaCon and DeltaCon-Attr in real applications: (a) we
                  classify people into groups of high and low creativity
                  based on their brain connectivity graphs; (b) we
                  perform temporal anomaly detection in the
                  who-emails-whom Enron graph and find the top culprits
                  for the changes in the temporal corporate email graph;
                  and (c) we recover pairs of test-retest large brain
                  scans ({\sim}17M edges, up to 90M edges) for 21
                  subjects.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "28",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
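
%%% A minimal dense-matrix sketch of the affinity-comparison idea
%%% behind DeltaCon, assuming the fast-belief-propagation affinity
%%% S = [I + eps^2 D - eps A]^{-1} and a root Euclidean (Matusita)
%%% distance, as described in the DeltaCon literature rather than in
%%% this entry; eps is kept small so the affinities stay positive:
%%%
%%%     import numpy as np
%%%
%%%     def affinity(A, eps=0.05):
%%%         # S = [I + eps^2 D - eps A]^{-1}, the FaBP affinity matrix.
%%%         D = np.diag(A.sum(axis=1))
%%%         return np.linalg.inv(np.eye(len(A)) + eps**2 * D - eps * A)
%%%
%%%     def deltacon0(A1, A2):
%%%         # Root Euclidean distance between the two affinity
%%%         # matrices, mapped to a similarity in (0, 1].
%%%         d = np.sqrt(((np.sqrt(affinity(A1)) - np.sqrt(affinity(A2))) ** 2).sum())
%%%         return 1.0 / (1.0 + d)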

@Article{Zhao:2016:MPA,
  author =       "Wayne Xin Zhao and Jinpeng Wang and Yulan He and
                 Ji-Rong Wen and Edward Y. Chang and Xiaoming Li",
  title =        "Mining Product Adopter Information from Online Reviews
                 for Improving Product Recommendation",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "29:1--29:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2842629",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present in this article an automated framework that
                 extracts product adopter information from online
                 reviews and incorporates the extracted information into
                 feature-based matrix factorization for more effective
                  product recommendation. Specifically, we propose a
                 bootstrapping approach for the extraction of product
                 adopters from review text and categorize them into a
                 number of different demographic categories. The
                 aggregated demographic information of many product
                 adopters can be used to characterize both products and
                 users in the form of distributions over different
                 demographic categories. We further propose a
                 graph-based method to iteratively update user- and
                 product-related distributions more reliably in a
                 heterogeneous user--product graph and incorporate them
                 as features into the matrix factorization approach for
                 product recommendation. Our experimental results on a
                  large dataset crawled from JingDong, the largest B2C
                 e-commerce website in China, show that our proposed
                 framework outperforms a number of competitive baselines
                 for product recommendation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "29",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Duarte:2016:AMR,
  author =       "Jo{\~a}o Duarte and Jo{\~a}o Gama and Albert Bifet",
  title =        "Adaptive Model Rules From High-Speed Data Streams",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "30:1--30:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2829955",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Decision rules are one of the most expressive and
                 interpretable models for machine learning. In this
                 article, we present Adaptive Model Rules (AMRules), the
                 first stream rule learning algorithm for regression
                 problems. In AMRules, the antecedent of a rule is a
                 conjunction of conditions on the attribute values, and
                 the consequent is a linear combination of the
                 attributes. In order to maintain a regression model
                 compatible with the most recent state of the process
                 generating data, each rule uses a Page-Hinkley test to
                 detect changes in this process and react to changes by
                 pruning the rule set. Online learning might be strongly
                  affected by outliers. AMRules is also equipped with
                  outlier detection mechanisms to avoid model adaptation
                  on anomalous examples. In the experimental section,
                 we report the results of AMRules on benchmark
                 regression problems, and compare the performance of our
                 system with other streaming regression algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "30",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
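
%%% The Page-Hinkley test that each rule carries can be written as a
%%% tiny online detector.  The sketch below monitors a stream of
%%% absolute prediction errors; delta (tolerance) and threshold are
%%% illustrative values, not the paper's settings:
%%%
%%%     class PageHinkley:
%%%         def __init__(self, delta=0.005, threshold=50.0):
%%%             self.delta, self.threshold = delta, threshold
%%%             self.mean, self.n = 0.0, 0
%%%             self.cum, self.cum_min = 0.0, 0.0
%%%
%%%         def update(self, error):
%%%             # Cumulate deviations from the running mean; a change
%%%             # is signalled when the cumulative sum rises too far
%%%             # above its historical minimum.
%%%             self.n += 1
%%%             self.mean += (error - self.mean) / self.n
%%%             self.cum += error - self.mean - self.delta
%%%             self.cum_min = min(self.cum_min, self.cum)
%%%             return self.cum - self.cum_min > self.threshold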

@Article{Lu:2016:SCB,
  author =       "Faming Lu and Qingtian Zeng and Hua Duan",
  title =        "Synchronization-Core-Based Discovery of Processes with
                 Decomposable Cyclic Dependencies",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "31:1--31:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2845086",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Traditional process discovery techniques mine process
                 models based upon event traces giving little
                 consideration to workflow relevant data recorded in
                 event logs. The neglect of such information usually
                 leads to incorrect discovered models, especially when
                 activities have decomposable cyclic dependencies. To
                 address this problem, the recorded workflow relevant
                 data and decision tree learning technique are utilized
                 to classify cases into case clusters. Each case cluster
                 contains causality and concurrency activity
                 dependencies only. Then, a set of activity ordering
                 relations are derived based on case clusters. And a
                 synchronization-core-based process model is discovered
                 from the ordering relations and composite cases.
                 Finally, the discovered model is transformed to a BPMN
                 model. The proposed approach is validated with a
                 medical treatment process and an open event log.
                 Meanwhile, a prototype system is presented.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "31",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Liu:2016:EAW,
  author =       "Yashu Liu and Jie Wang and Jieping Ye",
  title =        "An Efficient Algorithm For Weak Hierarchical Lasso",
  journal =      j-TKDD,
  volume =       "10",
  number =       "3",
  pages =        "32:1--32:??",
  month =        feb,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2791295",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Thu Feb 25 05:56:34 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Linear regression is a widely used tool in data mining
                 and machine learning. In many applications, fitting a
                 regression model with only linear effects may not be
                 sufficient for predictive or explanatory purposes. One
                 strategy that has recently received increasing
                 attention in statistics is to include feature
                 interactions to capture the nonlinearity in the
                  regression model. Such models have been applied
                  successfully in many biomedical applications. One
                  major challenge in the use of such a model is that the
                  data dimensionality becomes significantly higher than
                  that of the original data, resulting in the
                  small-sample-size, high-dimension problem. Recently,
                  the weak hierarchical Lasso, a sparse interaction
                  regression model, was proposed; it produces a sparse,
                  hierarchically structured estimator
                 by exploiting the Lasso penalty and a set of
                 hierarchical constraints. However, the hierarchical
                 constraints make it a non-convex problem and the
                 existing method finds the solution to its convex
                 relaxation, which needs additional conditions to
                 guarantee the hierarchical structure. In this article,
                 we propose to directly solve the non-convex weak
                 hierarchical Lasso by making use of the General
                 Iterative Shrinkage and Thresholding (GIST)
                 optimization framework, which has been shown to be
                 efficient for solving non-convex sparse formulations.
                 The key step in GIST is to compute a sequence of
                 proximal operators. One of our key technical
                 contributions is to show that the proximal operator
                 associated with the non-convex weak hierarchical Lasso
                 admits a closed-form solution. However, a naive
                 approach for solving each subproblem of the proximal
                  operator leads to a quadratic time complexity, which
                  is not desirable for large-size problems. To this end,
                  we further develop an efficient algorithm for
                  computing the subproblems with a linearithmic time
                  complexity. We have conducted extensive experiments on
                  both synthetic and real datasets; the results show
                  that our proposed algorithm is much more efficient and
                  effective than its convex relaxation. In addition, we extend
                 the technique to perform the optimization-based
                 hierarchical testing of pairwise interactions for
                 binary classification problems, which is essentially
                 the proximal operator associated with weak hierarchical
                 Lasso. Simulation studies show that the non-convex
                 hierarchical testing framework outperforms the convex
                 relaxation when a hierarchical structure exists between
                 main effects and interactions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "32",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
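
%%% The GIST-style iteration referred to above alternates a gradient
%%% step on the smooth loss with a proximal step on the penalty (GIST
%%% additionally uses a nonmonotone line search for nonconvex
%%% penalties).  The sketch below substitutes the plain Lasso
%%% soft-thresholding operator for readability; the paper's
%%% contribution is a closed-form operator for the weak-hierarchy
%%% penalty, which would replace soft_threshold here:
%%%
%%%     import numpy as np
%%%
%%%     def soft_threshold(z, tau):
%%%         return np.sign(z) * np.maximum(np.abs(z) - tau, 0.0)
%%%
%%%     def proximal_gradient_lasso(X, y, lam=0.1, iters=500):
%%%         # Minimizes 0.5 * ||X w - y||^2 + lam * ||w||_1.
%%%         step = 1.0 / np.linalg.norm(X, 2) ** 2   # 1/L, L = ||X||_2^2
%%%         w = np.zeros(X.shape[1])
%%%         for _ in range(iters):
%%%             grad = X.T @ (X @ w - y)
%%%             w = soft_threshold(w - step * grad, step * lam)
%%%         return w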

@Article{Wang:2016:ISI,
  author =       "Wei Wang and Jure Leskovec",
  title =        "Introduction to the Special Issue of Best Papers in
                 {ACM SIGKDD 2014}",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "33:1--33:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2936718",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "33",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Xu:2016:PSP,
  author =       "Silei Xu and John C. S. Lui",
  title =        "Product Selection Problem: Improve Market Share by
                 Learning Consumer Behavior",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "34:1--34:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2753764",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "It is often crucial for manufacturers to decide what
                 products to produce so that they can increase their
                 market share in an increasingly fierce market. To
                 decide which products to produce, manufacturers need to
                 analyze the consumers' requirements and how consumers
                 make their purchase decisions so that the new products
                 will be competitive in the market. In this paper, we
                 first present a general distance-based product adoption
                 model to capture consumers' purchase behavior. Using
                 this model, various distance metrics can be used to
                  describe different real-life purchase behaviors. We then
                 provide a learning algorithm to decide which set of
                 distance metrics one should use when we are given some
                 accessible historical purchase data. Based on the
                  product adoption model, we formalize the $k$ most
                 marketable products (or $k$-MMP) selection problem and
                 formally prove that the problem is NP-hard. To tackle
                 this problem, we propose an efficient greedy-based
                 approximation algorithm with a provable solution
                 guarantee. Using submodularity analysis, we prove that
                 our approximation algorithm can achieve at least 63\%
                 of the optimal solution. We apply our algorithm on both
                 synthetic datasets and real-world datasets
                  (TripAdvisor.com), and show that our algorithm can
                  easily achieve five or more orders of magnitude of
                  speedup over exhaustive search while achieving about
                  96\% of the optimal solution on average. Our
                  experiments also demonstrate
                 the robustness of our distance metric learning method,
                 and illustrate how one can adopt it to improve the
                 accuracy of product selection.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "34",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
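
%%% The 63% figure above is the classic 1 - 1/e (~0.632) guarantee for
%%% greedy maximization of a monotone submodular objective.  A generic
%%% sketch, with a toy coverage function standing in for the paper's
%%% expected-market-share objective (all names are illustrative):
%%%
%%%     def greedy_select(candidates, k, gain):
%%%         # Repeatedly add the item with the largest marginal gain.
%%%         chosen = set()
%%%         for _ in range(k):
%%%             best = max((c for c in candidates if c not in chosen),
%%%                        key=lambda c: gain(chosen | {c}) - gain(chosen))
%%%             chosen.add(best)
%%%         return chosen
%%%
%%%     # Toy objective: consumers covered by at least one chosen product.
%%%     consumers = {0: {"a", "b"}, 1: {"b"}, 2: {"c"}, 3: {"a", "c"}}
%%%     def covered(products):
%%%         return sum(1 for prefs in consumers.values() if prefs & products)
%%%
%%%     print(greedy_select({"a", "b", "c"}, 2, covered))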

@Article{Jiang:2016:CSB,
  author =       "Meng Jiang and Peng Cui and Alex Beutel and Christos
                 Faloutsos and Shiqiang Yang",
  title =        "Catching Synchronized Behaviors in Large Networks: a
                 Graph Mining Approach",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "35:1--35:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2746403",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a directed graph of millions of nodes, how can
                 we automatically spot anomalous, suspicious nodes
                 judging only from their connectivity patterns?
                 Suspicious graph patterns show up in many applications,
                 from Twitter users who buy fake followers, manipulating
                 the social network, to botnet members performing
                 distributed denial of service attacks, disturbing the
                 network traffic graph. We propose a fast and effective
                  method, CatchSync, which exploits two of the tell-tale
                 signs left in graphs by fraudsters: (a) synchronized
                 behavior: suspicious nodes have extremely similar
                 behavior patterns because they are often required to
                 perform some task together (such as follow the same
                 user); and (b) rare behavior: their connectivity
                 patterns are very different from the majority. We
                 introduce novel measures to quantify both concepts
                 (``synchronicity'' and ``normality'') and we propose a
                 parameter-free algorithm that works on the resulting
                 synchronicity-normality plots. Thanks to careful
                 design, CatchSync has the following desirable
                 properties: (a) it is scalable to large datasets, being
                  linear in the graph size; (b) it is parameter-free; and
                 (c) it is side-information-oblivious: it can operate
                  using only the topology, without needing labeled data,
                  timing information, and the like, while still being
                  capable of using side information if available. We
                  applied CatchSync to three large, real datasets (a
                  1-billion-edge Twitter social graph and 3-billion-edge
                  and 12-billion-edge Tencent Weibo social graphs) and
                  several synthetic ones; CatchSync consistently
                  outperforms existing competitors, both in detection
                  accuracy (by 36\% on Twitter and 20\% on Tencent
                  Weibo) and in speed.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "35",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wei:2016:HTH,
  author =       "Ying Wei and Yangqiu Song and Yi Zhen and Bo Liu and
                 Qiang Yang",
  title =        "Heterogeneous Translated Hashing: a Scalable Solution
                 Towards Multi-Modal Similarity Search",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "36:1--36:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2744204",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-modal similarity search has attracted
                 considerable attention to meet the need of information
                 retrieval across different types of media. To enable
                  efficient multi-modal similarity search in large-scale
                  databases, researchers have recently started to study
                  multi-modal hashing. Most of the existing methods are
                  applied to search across multiple views among which
                  explicit correspondence is provided. Given a
                 multi-modal similarity search task, we observe that
                  abundant multi-view data can be found on the Web that
                  can serve as an auxiliary bridge. In this paper, we
                  propose a Heterogeneous Translated Hashing (HTH)
                  method that incorporates such an auxiliary bridge not
                  only to improve current multi-view search but also to
                  enable similarity search across heterogeneous media
                  that have no direct correspondence. HTH provides more
                  flexible and discriminative ability by embedding
                  heterogeneous media into different Hamming spaces, in
                  contrast to almost all existing methods, which map
                  heterogeneous data into a common Hamming space. We
                  formulate a joint optimization
                 model to learn hash functions embedding heterogeneous
                 media into different Hamming spaces, and a translator
                  aligning different Hamming spaces. Extensive
                  experiments on two real-world datasets, a publicly
                  available Flickr dataset and a MIRFLICKR-Yahoo Answers
                  dataset, highlight the effectiveness and efficiency of
                  our algorithm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "36",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Tong:2016:GES,
  author =       "Hanghang Tong and Fei Wang and Munmun De Choudhury and
                 Zoran Obradovic",
  title =        "Guest Editorial: Special Issue on Connected Health at
                 Big Data Era {(BigChat)}: a {TKDD} Special Issue",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "37:1--37:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2912122",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "37",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Xiong:2016:KIT,
  author =       "Feiyu Xiong and Moshe Kam and Leonid Hrebien and
                 Beilun Wang and Yanjun Qi",
  title =        "Kernelized Information-Theoretic Metric Learning for
                 Cancer Diagnosis Using High-Dimensional Molecular
                 Profiling Data",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "38:1--38:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2789212",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "With the advancement of genome-wide monitoring
                 technologies, molecular expression data have become
                 widely used for diagnosing cancer through tumor or
                 blood samples. When mining molecular signature data,
                 the process of comparing samples through an adaptive
                 distance function is fundamental but difficult, as such
                 datasets are normally heterogeneous and high
                 dimensional. In this article, we present kernelized
                 information-theoretic metric learning (KITML)
                 algorithms that optimize a distance function to tackle
                 the cancer diagnosis problem and scale to high
                 dimensionality. By learning a nonlinear transformation
                 in the input space implicitly through kernelization,
                 KITML permits efficient optimization, low storage, and
                  improved learning of the distance metric. We propose two
                 novel applications of KITML for diagnosing cancer using
                 high-dimensional molecular profiling data: (1) for
                 sample-level cancer diagnosis, the learned metric is
                  used to improve the performance of k-nearest neighbor
                 classification; and (2) for estimating the severity
                 level or stage of a group of samples, we propose a
                 novel set-based ranking approach to extend KITML. For
                 the sample-level cancer classification task, we have
                 evaluated on 14 cancer gene microarray datasets and
                 compared with eight other state-of-the-art approaches.
                 The results show that our approach achieves the best
                 overall performance for the task of
                 molecular-expression-driven cancer sample diagnosis.
                  For group-level cancer stage estimation, we tested
                  the proposed set-KITML approach on three multi-stage
                  cancer microarray datasets, and it correctly estimated
                  the stages of the sample groups in all three
                  studies.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "38",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yang:2016:JML,
  author =       "Pei Yang and Hongxia Yang and Haoda Fu and Dawei Zhou
                 and Jieping Ye and Theodoros Lappas and Jingrui He",
  title =        "Jointly Modeling Label and Feature Heterogeneity in
                 Medical Informatics",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "39:1--39:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2768831",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multiple types of heterogeneity including label
                 heterogeneity and feature heterogeneity often co-exist
                 in many real-world data mining applications, such as
                 diabetes treatment classification, gene functionality
                 prediction, and brain image analysis. To effectively
                 leverage such heterogeneity, in this article, we
                 propose a novel graph-based model for Learning with
                 both Label and Feature heterogeneity, namely L$^2$F. It
                 models the label correlation by requiring that any two
                 label-specific classifiers behave similarly on the same
                 views if the associated labels are similar, and imposes
                 the view consistency by requiring that view-based
                 classifiers generate similar predictions on the same
                 examples. The objective function for L$^2$F is jointly
                 convex. To solve the optimization problem, we propose
                 an iterative algorithm, which is guaranteed to converge
                 to the global optimum. One appealing feature of L$^2$F
                 is that it is capable of handling data with missing
                 views and labels. Furthermore, we analyze its
                 generalization performance based on Rademacher
                 complexity, which sheds light on the benefits of
                 jointly modeling the label and feature heterogeneity.
                 Experimental results on various biomedical datasets
                 show the effectiveness of the proposed approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "39",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wu:2016:MDN,
  author =       "Yubao Wu and Xiaofeng Zhu and Li Li and Wei Fan and
                 Ruoming Jin and Xiang Zhang",
  title =        "Mining Dual Networks: Models, Algorithms, and
                 Applications",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "40:1--40:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2785970",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Finding the densest subgraph in a single graph is a
                 fundamental problem that has been extensively studied.
                 In many emerging applications, there exist dual
                 networks. For example, in genetics, it is important to
                 use protein interactions to interpret genetic
                 interactions. In this application, one network
                 represents physical interactions among nodes, for
                 example, protein--protein interactions, and another
                 network represents conceptual interactions, for
                 example, genetic interactions. Edges in the conceptual
                 network are usually derived based on certain
                 correlation measure or statistical test measuring the
                 strength of the interaction. Two nodes with strong
                 conceptual interaction may not have direct physical
                 interaction. In this article, we propose the novel
                 dual-network model and investigate the problem of
                 finding the densest connected subgraph (DCS), which has
                 the largest density in the conceptual network and is
                 also connected in the physical network. Density in the
                 conceptual network represents the average strength of
                 the measured interacting signals among the set of
                 nodes. Connectivity in the physical network shows how
                 they interact physically. Such a pattern cannot be
                 identified using the existing algorithms for a single
                 network. We show that even though finding the densest
                 subgraph in a single network is polynomial-time
                 solvable, the DCS problem is NP-hard. We develop a
                 two-step approach to solve the DCS problem. In the
                 first step, we effectively prune the dual networks,
                 while guaranteeing that the optimal solution is contained
                 in the remaining networks. For the second step, we
                 develop two efficient greedy methods based on different
                 search strategies to find the DCS. Different variations
                 of the DCS problem are also studied. We perform
                 extensive experiments on a variety of real and
                 synthetic dual networks to evaluate the effectiveness
                 and efficiency of the developed methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "40",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
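
%%% The DCS abstract above describes a prune-then-greedy strategy. As a
%%% rough illustration only (not the authors' algorithm), the sketch
%%% below implements one plausible greedy step in Python with networkx:
%%% peel the node of smallest conceptual weighted degree whose removal
%%% keeps the physical subgraph connected, and remember the densest
%%% snapshot seen. Gc (conceptual, weight-annotated edges) and Gp
%%% (physical) are assumed inputs, with Gp connected on the shared nodes.
%%%
%%%   import networkx as nx
%%%
%%%   def density(Gc, nodes):
%%%       sub = Gc.subgraph(nodes)
%%%       return sub.size(weight="weight") / max(len(nodes), 1)
%%%
%%%   def greedy_dcs(Gc, Gp):
%%%       nodes = set(Gc) & set(Gp)
%%%       best, best_density = set(nodes), density(Gc, nodes)
%%%       while len(nodes) > 2:
%%%           # only peel nodes that keep the physical part connected
%%%           removable = [v for v in nodes
%%%                        if nx.is_connected(Gp.subgraph(nodes - {v}))]
%%%           if not removable:
%%%               break
%%%           v = min(removable,
%%%                   key=lambda u: Gc.degree(u, weight="weight"))
%%%           nodes.remove(v)
%%%           if density(Gc, nodes) > best_density:
%%%               best, best_density = set(nodes), density(Gc, nodes)
%%%       return best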

@Article{Cui:2016:BOQ,
  author =       "Licong Cui and Shiqiang Tao and Guo-Qiang Zhang",
  title =        "Biomedical Ontology Quality Assurance Using a Big Data
                 Approach",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "41:1--41:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2768830",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article presents recent progress made in using a
                 scalable cloud computing environment, Hadoop and
                 MapReduce, to perform ontology quality assurance
                 (OQA), and points to areas of future opportunity. The
                 standard sequential approach used for implementing OQA
                 methods can take weeks, if not months, for exhaustive
                 analyses of large biomedical ontological systems. With
                 OQA
                 methods newly implemented using massively parallel
                 algorithms in the MapReduce framework, several orders
                 of magnitude in speed-up can be achieved (e.g., from
                 three months to three hours). Such dramatically reduced
                 time makes it feasible not only to perform exhaustive
                 structural analysis of large ontological hierarchies,
                 but also to systematically track structural changes
                 between versions for evolutional analysis. As an
                 exemplar, progress is reported in using MapReduce to
                 perform evolutional analysis and visualization on the
                 Systematized Nomenclature of Medicine--Clinical Terms
                 (SNOMED CT), a prominent clinical terminology system.
                 Future opportunities in three areas are described: one
                 is to extend the scope of the MapReduce-based approach to
                 existing OQA methods, especially for automated
                 exhaustive structural analysis. The second is to apply
                 our proposed MapReduce Pipeline for Lattice-based
                 Evaluation (MaPLE) approach, demonstrated as an
                 exemplar method for SNOMED CT, to other biomedical
                 ontologies. The third area is to develop interfaces for
                 reviewing results obtained by OQA methods and for
                 visualizing ontological alignment and evolution, which
                 can also take advantage of cloud computing technology
                 to systematically pre-compute computationally intensive
                 jobs in order to increase performance during user
                 interactions with the visualization interface. Advances
                 in these directions are expected to better support the
                 ontological engineering lifecycle.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "41",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
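
%%% The speed-up reported above comes from recasting structural analysis
%%% as MapReduce jobs. The toy sketch below (hypothetical, not MaPLE
%%% itself) mirrors the map / shuffle / reduce phases in plain Python,
%%% counting the descendants of every ancestor in a small is-a
%%% hierarchy; on a cluster, each phase would run distributed.
%%%
%%%   from collections import defaultdict
%%%
%%%   # toy is-a hierarchy: child -> parents (made-up concept IDs)
%%%   ISA = {"c1": ["c0"], "c2": ["c0"], "c3": ["c1", "c2"]}
%%%
%%%   def mapper(concept):
%%%       """Emit (ancestor, concept) pairs by walking up the edges."""
%%%       stack, seen = list(ISA.get(concept, [])), set()
%%%       while stack:
%%%           a = stack.pop()
%%%           if a not in seen:
%%%               seen.add(a)
%%%               stack.extend(ISA.get(a, []))
%%%               yield (a, concept)
%%%
%%%   def reducer(ancestor, concepts):
%%%       """Aggregate the shuffled pairs per key."""
%%%       return ancestor, len(set(concepts))
%%%
%%%   groups = defaultdict(list)        # shuffle: group values by key
%%%   for c in ISA:
%%%       for key, value in mapper(c):
%%%           groups[key].append(value)
%%%   print(dict(reducer(k, vs) for k, vs in groups.items()))
%%%   # {'c0': 3, 'c1': 1, 'c2': 1}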

@Article{Rayana:2016:LMB,
  author =       "Shebuti Rayana and Leman Akoglu",
  title =        "Less is More: Building Selective Anomaly Ensembles",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "42:1--42:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2890508",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Ensemble learning for anomaly detection has been
                 barely studied, due to the difficulty of acquiring
                 ground truth and the lack of inherent objective
                 functions. In contrast, ensemble approaches for
                 classification and clustering have been studied and
                 effectively used for a long time. Our work taps into
                 this gap and builds a new
                 ensemble approach for anomaly detection, with
                 application to event detection in temporal graphs as
                 well as outlier detection in no-graph settings. It
                 handles and combines multiple heterogeneous detectors
                 to yield improved and robust performance. Importantly,
                 trusting results from all the constituent detectors may
                 deteriorate the overall performance of the ensemble, as
                 some detectors could provide inaccurate results
                 depending on the type of data in hand and the
                 underlying assumptions of a detector. This suggests
                 that combining the detectors selectively is key to
                 building effective anomaly ensembles; hence, ``less is
                 more''. In this paper, we propose a novel ensemble
                 approach called SELECT for anomaly detection, which
                 automatically and systematically selects the results
                 from constituent detectors to combine in a fully
                 unsupervised fashion. We apply our method to event
                 detection in temporal graphs and outlier detection in
                 multi-dimensional point data (no-graph), where SELECT
                 successfully utilizes five base detectors and seven
                 consensus methods under a unified ensemble framework.
                 We provide extensive quantitative evaluation of our
                 approach for event detection on five real-world
                 datasets (four with ground truth events), including
                 Enron email communications, RealityMining SMS and phone
                 call records, New York Times news corpus, and World Cup
                 2014 Twitter news feed. We also provide results for
                 outlier detection on seven real-world multi-dimensional
                 point datasets from UCI Machine Learning Repository.
                 Thanks to its selection mechanism, SELECT yields
                 superior performance compared to the individual
                 detectors alone, the full ensemble (naively combining
                 all results), an existing diversity-based ensemble, and
                 an existing weighted ensemble approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "42",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
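
%%% The selection idea above can be made concrete with a few lines of
%%% numpy. This is an illustrative selective combination in the spirit
%%% of the abstract, not the published SELECT algorithm: rank-normalize
%%% each detector, form a pseudo ground truth by averaging, keep only
%%% the detectors that agree with it, and average the survivors.
%%%
%%%   import numpy as np
%%%
%%%   def selective_ensemble(scores, min_corr=0.5):
%%%       """scores: (n_detectors, n_points), larger = more anomalous."""
%%%       n = scores.shape[1]
%%%       ranks = scores.argsort(axis=1).argsort(axis=1) / (n - 1)
%%%       pseudo = ranks.mean(axis=0)          # pseudo ground truth
%%%       keep = [i for i, r in enumerate(ranks)
%%%               if np.corrcoef(r, pseudo)[0, 1] >= min_corr]
%%%       return ranks[keep].mean(axis=0)      # "less is more"
%%%
%%%   rng = np.random.default_rng(0)
%%%   scores = rng.random(100) + 0.3 * rng.random((5, 100))
%%%   scores[4] *= -1                          # one misleading detector
%%%   combined = selective_ensemble(scores)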

@Article{Zhu:2016:CCS,
  author =       "Yada Zhu and Jingrui He",
  title =        "Co-Clustering Structural Temporal Data with
                 Applications to Semiconductor Manufacturing",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "43:1--43:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2875427",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recent years have witnessed data explosion in
                 semiconductor manufacturing due to advances in
                 instrumentation and storage techniques. The large
                 amount of data associated with process variables
                 monitored over time forms a rich reservoir of
                 information, which can be used for a variety of
                 purposes, such as anomaly detection, quality control,
                 and fault diagnostics. In particular, following the
                 same recipe for a certain Integrated Circuit device,
                 multiple tools and chambers can be deployed for the
                 production of this device, during which multiple time
                 series can be collected, such as temperature,
                 impedance, gas flow, electric bias, etc. These time
                 series naturally fit into a two-dimensional array
                 (matrix), i.e., each element in this array corresponds
                 to a time series for one process variable from one
                 chamber. To leverage the rich structural information in
                 such temporal data, in this article, we propose a novel
                 framework named C-Struts to simultaneously cluster on
                 the two dimensions of this array. In this framework, we
                 interpret the structural information as a set of
                 constraints on the cluster membership, introduce an
                 auxiliary probability distribution accordingly, and
                 design an iterative algorithm to assign each time
                 series to a certain cluster on each dimension.
                 Furthermore, we establish the equivalence between
                 C-Struts and a generic optimization problem, which is
                 able to accommodate various distance functions.
                 Extensive experiments on synthetic, benchmark, as well
                 as manufacturing datasets demonstrate the effectiveness
                 of the proposed method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "43",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Tahani:2016:IDD,
  author =       "Maryam Tahani and Ali M. A. Hemmatyar and Hamid R.
                 Rabiee and Maryam Ramezani",
  title =        "Inferring Dynamic Diffusion Networks in Online Media",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "44:1--44:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2882968",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Online media play an important role in information
                 societies by providing a convenient infrastructure for
                 different processes. Information diffusion, a
                 fundamental process taking place on social and
                 information networks, has been investigated in many
                 studies. Research on information diffusion in these
                 networks faces two main challenges: (1) in most
                 cases, diffusion takes place on an underlying network
                 that is latent and whose structure is unknown; (2)
                 this latent network is not fixed and changes over
                 time. In this
                 article, we investigate the diffusion network
                 extraction (DNE) problem when the underlying network is
                 dynamic and latent. We model the diffusion behavior
                 (existence probability) of each edge as a stochastic
                 process and utilize the Hidden Markov Model (HMM) to
                 discover the most probable diffusion links according to
                 the current observation of the diffusion process, which
                 is the infection time of nodes and the past diffusion
                 behavior of links. We evaluate the performance of our
                 Dynamic Diffusion Network Extraction (DDNE) method, on
                 both synthetic and real datasets. Experimental results
                 show that the performance of the proposed method is
                 independent of the cascade transmission model and
                 outperforms the state-of-the-art method in terms of
                 F-measure.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "44",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
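
%%% The per-edge estimator sketched in the abstract boils down to HMM
%%% filtering. Below is a minimal two-state (absent/present) forward
%%% filter for a single edge's existence probability; the transition
%%% and emission values are invented for illustration, and the real
%%% DDNE observation model is richer.
%%%
%%%   import numpy as np
%%%
%%%   T = np.array([[0.9, 0.1],    # P(next state | current state)
%%%                 [0.2, 0.8]])
%%%   E = np.array([[0.7, 0.3],    # P(observation | state); obs = 1 when
%%%                 [0.1, 0.9]])   # the edge could explain an infection
%%%
%%%   def forward_filter(obs, prior=(0.5, 0.5)):
%%%       """P(edge present | observations up to t), for each t."""
%%%       belief = np.asarray(prior, dtype=float)
%%%       out = []
%%%       for o in obs:
%%%           belief = belief @ T          # one-step state prediction
%%%           belief = belief * E[:, o]    # condition on observation
%%%           belief /= belief.sum()
%%%           out.append(belief[1])
%%%       return out
%%%
%%%   print(forward_filter([1, 1, 0, 1]))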

@Article{Koh:2016:URP,
  author =       "Yun Sing Koh and Sri Devi Ravana",
  title =        "Unsupervised Rare Pattern Mining: a Survey",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "45:1--45:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898359",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Association rule mining was first introduced to
                 examine patterns among frequent items. The original
                 motivation for seeking these rules arose from the
                 need to examine customer purchasing behaviour in
                 supermarket transaction data. It seeks to identify
                 combinations of items, or itemsets, whose presence in
                 a transaction affects the likelihood of the presence
                 of another specific item or itemset. In recent years,
                 there has
                 been an increasing demand for rare association rule
                 mining. Detecting rare patterns in data is a vital
                 task, with numerous high-impact applications including
                 medicine, finance, and security. This survey aims to
                 provide a general, comprehensive, and structured
                 overview of the state-of-the-art methods for rare
                 pattern mining. We investigate the problems in finding
                 rare rules using traditional association rule mining.
                 As rare association rule mining has not been well
                 explored, there is still specific groundwork that needs
                 to be established. We will discuss some of the major
                 issues in rare association rule mining and also look at
                 current algorithms. As a contribution, we give a
                 general framework for categorizing algorithms:
                 Apriori-based and tree-based. We highlight the
                 differences between
                 these methods. Finally, we present several real-world
                 applications of rare pattern mining in diverse
                 domains. We conclude our survey with a discussion on
                 open and practical challenges in the field.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "45",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
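
%%% To fix the survey's central definition with an example: a ``rare''
%%% itemset is one observed at least once yet below a maximum-support
%%% threshold. The brute-force enumeration below illustrates the notion
%%% only (thresholds and data are arbitrary); the surveyed Apriori- and
%%% tree-based algorithms exist precisely to avoid this exhaustive scan.
%%%
%%%   from itertools import combinations
%%%
%%%   transactions = [{"a", "b"}, {"a", "b"}, {"a", "c"}, {"b", "d"}]
%%%
%%%   def support(itemset):
%%%       return sum(itemset <= t for t in transactions) / len(transactions)
%%%
%%%   def rare_itemsets(max_sup=0.3):
%%%       items = sorted(set().union(*transactions))
%%%       found = []
%%%       for k in range(1, len(items) + 1):
%%%           for combo in combinations(items, k):
%%%               s = support(set(combo))
%%%               if 0 < s <= max_sup:
%%%                   found.append((combo, s))
%%%       return found
%%%
%%%   print(rare_itemsets())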

@Article{Cheng:2016:CFR,
  author =       "Wei Cheng and Zhishan Guo and Xiang Zhang and Wei
                 Wang",
  title =        "{CGC}: a Flexible and Robust Approach to Integrating
                 Co-Regularized Multi-Domain Graph for Clustering",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "46:1--46:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903147",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Multi-view graph clustering aims to enhance clustering
                 performance by integrating heterogeneous information
                 collected in different domains. Each domain provides a
                 different view of the data instances. Leveraging
                 cross-domain information has been demonstrated to be an
                 effective way to achieve better clustering results.
                 Despite the previous success, existing multi-view graph
                 clustering methods usually assume that different views
                 are available for the same set of instances. Thus,
                 instances in different domains can be treated as having
                 strict one-to-one relationship. In many real-life
                 applications, however, data instances in one domain may
                 correspond to multiple instances in another domain.
                 Moreover, relationships between instances in different
                 domains may be associated with weights based on prior
                 (partial) knowledge. In this article, we propose a
                 flexible and robust framework, Co-regularized Graph
                 Clustering (CGC), based on non-negative matrix
                 factorization (NMF), to tackle these challenges. CGC
                 has several advantages over the existing methods.
                 First, it supports many-to-many cross-domain instance
                 relationship. Second, it incorporates weights on
                 cross-domain relationships. Third, it allows partial
                 cross-domain mapping so that graphs in different
                 domains may have different sizes. Finally, it provides
                 users with the extent to which the cross-domain
                 instance relationship violates the in-domain clustering
                 structure, and thus enables users to re-evaluate the
                 consistency of the relationship. We develop an
                 efficient optimization method that is guaranteed to
                 find the globally optimal solution with a given
                 confidence
                 requirement. The proposed method can automatically
                 identify noisy domains and assign smaller weights to
                 them. This helps to obtain optimal graph partition for
                 the focused domain. Extensive experimental results on
                 UCI benchmark datasets, newsgroup datasets, and
                 biological interaction networks demonstrate the
                 effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "46",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Shen:2016:SPO,
  author =       "Chih-Ya Shen and De-Nian Yang and Wang-Chien Lee and
                 Ming-Syan Chen",
  title =        "Spatial-Proximity Optimization for Rapid Task Group
                 Deployment",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "47:1--47:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818714",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Spatial proximity is one of the most important factors
                 for the quick deployment of task groups in various
                 time-sensitive missions. This article proposes a new
                 spatial query, Spatio-Social Team Query (SSTQ), that
                 forms a strong task group by considering (1) the
                 group's spatial distance (i.e., transportation time),
                 (2) skills of the candidate group members, and (3)
                 social rapport among the candidates. Efficient
                 processing of SSTQ is very challenging, because the
                 aforementioned spatial, skill, and social factors need
                 to be carefully examined. In this article, therefore,
                 we first formulate two subproblems of SSTQ, namely
                 Hop-Constrained Team Problem (HCTP) and
                 Connection-Oriented Team Query (COTQ). HCTP is a
                 decision problem that considers only social and skill
                 dimensions. We prove that HCTP is NP-Complete.
                 Moreover, based on the hardness of HCTP, we prove that
                 SSTQ is NP-Hard and inapproximable within any factor.
                 On the other hand, COTQ is a special case of SSTQ that
                 relaxes the social constraint. We prove that COTQ is
                 NP-Hard and propose an approximation algorithm for
                 COTQ, namely COTprox. Furthermore, based on the
                 observations on COTprox, we devise an approximation
                 algorithm, SSTprox, with a guaranteed error bound for
                 SSTQ. Finally, to efficiently obtain the optimal
                 solution to SSTQ for small instances, we design two
                 efficient algorithms, SpatialFirst and SkillFirst, with
                 different scenarios in mind. These two algorithms
                 incorporate various effective ordering and pruning
                 techniques to reduce the search space for answering
                 SSTQ. Experimental results on real datasets indicate
                 that the proposed algorithms can efficiently answer
                 SSTQ under various parameter settings.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "47",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yu:2016:FDV,
  author =       "Zhiwen Yu and Zhitao Wang and Liming Chen and Bin Guo
                 and Wenjie Li",
  title =        "Featuring, Detecting, and Visualizing Human Sentiment
                 in {Chinese} Micro-Blog",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "48:1--48:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2821513",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Micro-blog has been increasingly used for the public
                 to express their opinions, and for organizations to
                 detect public sentiment about social events or public
                 policies. In this article, we examine and identify the
                 key problems of this field, focusing particularly on
                 the characteristics of innovative words, multi-media
                 elements, and hierarchical structure of Chinese
                 ``Weibo.'' Based on the analysis, we propose a novel
                 approach and develop associated theoretical and
                 technological methods to address these problems. These
                 include a new sentiment word mining method based on
                 three wording metrics and point-wise mutual information, a
                 rule set model for analyzing sentiment features of
                 different linguistic components, and the corresponding
                 methodology for calculating sentiment on
                 multi-granularity considering emoticon elements as
                 auxiliary affective factors. We evaluate our new word
                 discovery and sentiment detection methods on a
                 real-life Chinese micro-blog dataset. Initial results
                 show that our new dictionary can improve sentiment
                 detection, and they demonstrate that our multi-level
                 rule set method is more effective, with the average
                 accuracy being 10.2\% and 1.5\% higher than two
                 existing methods for Chinese micro-blog sentiment
                 analysis. In addition, we exploit visualization
                 techniques to study the relationships between online
                 sentiment and real life. The visualization of detected
                 sentiment can help depict temporal patterns and spatial
                 discrepancy.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "48",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
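
%%% The word-mining step above leans on point-wise mutual information.
%%% A generic SO-PMI-style score (Turney's classic recipe, shown as a
%%% stand-in rather than the paper's exact metric) rates a candidate
%%% word by how much more it co-occurs with positive seeds than with
%%% negative ones; the corpus and seed sets below are made up.
%%%
%%%   from math import log
%%%
%%%   posts = [["great", "movie", "love"], ["bad", "service", "hate"],
%%%            ["love", "great", "food"], ["hate", "bad", "queue"]]
%%%   POS, NEG = {"love"}, {"hate"}
%%%
%%%   def pmi(word, seeds, eps=0.5):
%%%       """Smoothed PMI between a word and a seed set."""
%%%       n = len(posts)
%%%       c_w = sum(word in p for p in posts)
%%%       c_s = sum(bool(seeds & set(p)) for p in posts)
%%%       c_ws = sum((word in p) and bool(seeds & set(p)) for p in posts)
%%%       return log((c_ws + eps) * n / ((c_w + eps) * (c_s + eps)))
%%%
%%%   def so_pmi(word):
%%%       """Positive score suggests positive sentiment, and vice versa."""
%%%       return pmi(word, POS) - pmi(word, NEG)
%%%
%%%   print(so_pmi("great"), so_pmi("bad"))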

@Article{Chen:2016:EOL,
  author =       "Chen Chen and Hanghang Tong and B. Aditya Prakash and
                 Tina Eliassi-Rad and Michalis Faloutsos and Christos
                 Faloutsos",
  title =        "Eigen-Optimization on Large Graphs by Edge
                 Manipulation",
  journal =      j-TKDD,
  volume =       "10",
  number =       "4",
  pages =        "49:1--49:??",
  month =        jul,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903148",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:29 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Large graphs are prevalent in many applications and
                 enable a variety of information dissemination
                 processes, e.g., meme, virus, and influence
                 propagation. How can we optimize the underlying graph
                 structure to affect the outcome of such dissemination
                 processes in a desired way (e.g., stop a virus
                 propagation, facilitate the propagation of a good
                 idea, etc.)? Existing research suggests that the
                 leading eigenvalue of the underlying graph is the key
                 metric in determining the so-called epidemic threshold
                 for a variety of dissemination models. In this paper,
                 we study the problem of how to optimally place a set of
                 edges (e.g., edge deletion and edge addition) to
                 optimize the leading eigenvalue of the underlying
                 graph, so that we can guide the dissemination process
                 in a desired way. We propose effective, scalable
                 algorithms for edge deletion and edge addition,
                 respectively. In addition, we reveal the intrinsic
                 relationship between edge deletion and node deletion
                 problems. Experimental results validate the
                 effectiveness and efficiency of the proposed
                 algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "49",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
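
%%% The abstract's edge-manipulation idea rests on eigenvalue
%%% perturbation: for a symmetric adjacency matrix with leading
%%% eigenvector u, removing edge (i, j) lowers the leading eigenvalue
%%% by roughly 2 * u[i] * u[j]. The heuristic below (a standard
%%% first-order sketch, not necessarily the paper's exact procedure)
%%% scores and deletes the top-k edges on that basis.
%%%
%%%   import numpy as np
%%%
%%%   def leading_eigvec(A, iters=200):
%%%       """Power iteration for the leading eigenvector."""
%%%       u = np.ones(len(A)) / np.sqrt(len(A))
%%%       for _ in range(iters):
%%%           u = A @ u
%%%           u /= np.linalg.norm(u)
%%%       return u
%%%
%%%   def edges_to_delete(A, k):
%%%       u = leading_eigvec(A)
%%%       edges = [(i, j) for i in range(len(A))
%%%                for j in range(i + 1, len(A)) if A[i, j] > 0]
%%%       return sorted(edges, key=lambda e: u[e[0]] * u[e[1]],
%%%                     reverse=True)[:k]
%%%
%%%   A = np.array([[0, 1, 1, 0], [1, 0, 1, 1],
%%%                 [1, 1, 0, 1], [0, 1, 1, 0]], dtype=float)
%%%   for i, j in edges_to_delete(A, k=1):
%%%       A[i, j] = A[j, i] = 0.0    # delete the most critical edge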

@Article{Yu:2016:STR,
  author =       "Zhiwen Yu and Miao Tian and Zhu Wang and Bin Guo and
                 Tao Mei",
  title =        "Shop-Type Recommendation Leveraging the Data from
                 Social Media and Location-Based Services",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2930671",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "It is an important yet challenging task for investors
                 to determine the most suitable type of shop (e.g.,
                 restaurant, fashion) for a newly opened store.
                 Traditional ways are predominantly field surveys and
                 empirical estimation, which are not effective as they
                 lack shop-related data. As social media and
                 location-based services (LBS) are becoming more and
                 more pervasive, user-generated data from these
                 platforms are providing rich information not only about
                 individual consumption experiences, but also about shop
                 attributes. In this paper, we investigate the
                 recommendation of shop types for a given location, by
                 leveraging heterogeneous data that are mainly
                 historical user preferences and location context from
                 social media and LBS. Our goal is to select the most
                 suitable shop type, seeking to maximize the number of
                 customers served from a candidate set of types. We
                 propose a novel bias learning matrix factorization
                 method with feature fusion for shop popularity
                 prediction. Features are defined and extracted from two
                 perspectives: location, where features are closely
                 related to location characteristics, and commercial,
                 where features are about the relationships between
                 shops in the neighborhood. Experimental results show
                 that the proposed method outperforms state-of-the-art
                 solutions.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
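
%%% The popularity predictor described above is, at its core, a
%%% bias-aware matrix factorization. The sketch below fits the standard
%%% mu + b_row + b_col + P Q^T model by SGD on the observed cells of an
%%% assumed (location x shop-type) popularity matrix; the paper's
%%% location and commercial feature-fusion terms are omitted here.
%%%
%%%   import numpy as np
%%%
%%%   def biased_mf(R, mask, k=4, lr=0.01, reg=0.05, epochs=200):
%%%       n, m = R.shape
%%%       rng = np.random.default_rng(0)
%%%       P, Q = rng.normal(0, 0.1, (n, k)), rng.normal(0, 0.1, (m, k))
%%%       b_r, b_c = np.zeros(n), np.zeros(m)
%%%       mu = R[mask > 0].mean()
%%%       rows, cols = np.nonzero(mask)
%%%       for _ in range(epochs):
%%%           for i, j in zip(rows, cols):
%%%               e = R[i, j] - (mu + b_r[i] + b_c[j] + P[i] @ Q[j])
%%%               b_r[i] += lr * (e - reg * b_r[i])
%%%               b_c[j] += lr * (e - reg * b_c[j])
%%%               P[i], Q[j] = (P[i] + lr * (e * Q[j] - reg * P[i]),
%%%                             Q[j] + lr * (e * P[i] - reg * Q[j]))
%%%       return mu + b_r[:, None] + b_c[None, :] + P @ Q.T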

@Article{McDowell:2016:LNA,
  author =       "Luke K. McDowell and David W. Aha",
  title =        "Leveraging Neighbor Attributes for Classification in
                 Sparsely Labeled Networks",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2898358",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Many analysis tasks involve linked nodes, such as
                 people connected by friendship links. Research on
                 link-based classification (LBC) has studied how to
                 leverage these connections to improve classification
                 accuracy. Most such prior research has assumed the
                 provision of a densely labeled training network.
                 Instead, this article studies the common and
                 challenging case when LBC must use a single sparsely
                 labeled network for both learning and inference, a case
                 where existing methods often yield poor accuracy. To
                 address this challenge, we introduce a novel method
                 that enables prediction via ``neighbor attributes,''
                 which were briefly considered by early LBC work but
                 then abandoned due to perceived problems. We then
                 explain, using both extensive experiments and loss
                 decomposition analysis, how using neighbor attributes
                 often significantly improves accuracy. We further show
                 that using appropriate semi-supervised learning (SSL)
                 is essential to obtaining the best accuracy in this
                 domain and that the gains of neighbor attributes remain
                 across a range of SSL choices and data conditions.
                 Finally, given the challenges of label sparsity for LBC
                 and the impact of neighbor attributes, we show that
                 multiple previous studies must be re-considered,
                 including studies regarding the best model features,
                 the impact of noisy attributes, and strategies for
                 active learning.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
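
%%% The ``neighbor attributes'' idea above is easy to state in code:
%%% augment each node's own attributes with aggregated neighbor
%%% attributes and train an ordinary classifier on the few labeled
%%% nodes. This bare-bones sketch (scikit-learn for brevity) omits the
%%% semi-supervised learning component the article shows is essential.
%%%
%%%   import numpy as np
%%%   from sklearn.linear_model import LogisticRegression
%%%
%%%   def neighbor_features(X, adj):
%%%       """Own attributes concatenated with mean neighbor attributes."""
%%%       deg = adj.sum(axis=1, keepdims=True).clip(min=1)
%%%       return np.hstack([X, (adj @ X) / deg])
%%%
%%%   rng = np.random.default_rng(0)
%%%   X = rng.random((6, 3))                   # node attributes
%%%   adj = np.array([[0, 1, 1, 0, 0, 0], [1, 0, 1, 0, 0, 0],
%%%                   [1, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 1],
%%%                   [0, 0, 0, 1, 0, 1], [0, 0, 0, 1, 1, 0]], float)
%%%   y = np.array([0, 0, 0, 1, 1, 1])
%%%   labeled = [0, 3]                         # sparsely labeled network
%%%
%%%   F = neighbor_features(X, adj)
%%%   clf = LogisticRegression().fit(F[labeled], y[labeled])
%%%   print(clf.predict(F))                    # inference over all nodes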

@Article{Chang:2016:CSP,
  author =       "Xiaojun Chang and Feiping Nie and Yi Yang and Chengqi
                 Zhang and Heng Huang",
  title =        "Convex Sparse {PCA} for Unsupervised Feature
                 Learning",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2910585",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Principal component analysis (PCA) has been widely
                 applied to dimensionality reduction and data
                 pre-processing for different applications in
                 engineering, biology, social science, and the like.
                 Classical PCA and its variants seek linear
                 projections of the original variables to obtain the
                 low-dimensional feature representations with maximal
                 variance. One limitation is that it is difficult to
                 interpret the results of PCA. Moreover, classical PCA
                 is vulnerable to noisy data. In this paper,
                 we propose a Convex Sparse Principal Component Analysis
                 (CSPCA) algorithm and apply it to feature learning.
                 First, we show that PCA can be formulated as a low-rank
                 regression optimization problem. Based on the
                 discussion, the $ l_{2, 1}$-norm minimization is
                 incorporated into the objective function to make the
                 regression coefficients sparse, thereby robust to the
                 outliers. Also, based on the sparse model used in
                 CSPCA, an optimal weight is assigned to each of the
                 original features, which in turn provides the output
                 with good interpretability. With the output of our
                 CSPCA, we can effectively analyze the importance of
                 each feature under the PCA criteria. Our new objective
                 function is convex, and we propose an iterative
                 algorithm to optimize it. We apply the CSPCA algorithm
                 to feature selection and conduct extensive experiments
                 on seven benchmark datasets. Experimental results
                 demonstrate that the proposed algorithm outperforms
                 state-of-the-art unsupervised feature selection
                 algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
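
%%% The optimization the abstract describes (PCA as low-rank regression
%%% with an l_{2,1} penalty) is routinely solved by iteratively
%%% reweighted least squares. The sketch below applies that generic
%%% recipe to a self-reconstruction objective ||X - XW||_F^2 +
%%% lam*||W||_{2,1} (an assumed stand-in, not the paper's exact
%%% formulation) and reads feature importance off the row norms of W.
%%%
%%%   import numpy as np
%%%
%%%   def l21_self_regression(X, lam=0.1, iters=50):
%%%       d = X.shape[1]
%%%       W = np.eye(d)
%%%       for _ in range(iters):
%%%           row_norms = np.linalg.norm(W, axis=1).clip(min=1e-8)
%%%           D = np.diag(1.0 / (2.0 * row_norms))   # IRLS reweighting
%%%           # stationarity condition: (X^T X + lam D) W = X^T X
%%%           W = np.linalg.solve(X.T @ X + lam * D, X.T @ X)
%%%       return W
%%%
%%%   rng = np.random.default_rng(0)
%%%   X = rng.random((100, 5))
%%%   X[:, 4] = 0.01 * rng.random(100)     # a near-constant noise column
%%%   W = l21_self_regression(X)
%%%   print(np.linalg.norm(W, axis=1).round(3))   # per-feature weights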

@Article{Wu:2016:LLR,
  author =       "Ou Wu and Qiang You and Fen Xia and Lei Ma and Weiming
                 Hu",
  title =        "Listwise Learning to Rank from Crowds",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2910586",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Learning to rank has received great attention in
                 recent years as it plays a crucial role in many
                 applications such as information retrieval and data
                 mining. The existing concept of learning to rank
                 assumes that each training instance is associated with
                 a reliable label. However, in practice, this assumption
                 does not necessarily hold true as it may be infeasible
                 or remarkably expensive to obtain reliable labels for
                 many learning to rank applications. Therefore, a
                 feasible approach is to collect labels from crowds and
                 then learn a ranking function from crowdsourcing
                 labels. This study explores the listwise learning to
                 rank with crowdsourcing labels obtained from multiple
                 annotators, who may be unreliable. A new probabilistic
                 ranking model is first proposed by combining two
                 existing models. Subsequently, a ranking function is
                 trained by proposing a maximum likelihood learning
                 approach, which estimates ground-truth labels and
                 annotator expertise, and trains the ranking function
                 iteratively. In practical crowdsourcing machine
                 learning, valuable side information (e.g., professional
                 grades) about involved annotators is normally
                 attainable. Therefore, this study also investigates
                 learning to rank from crowd labels when side
                 information on the expertise of involved annotators is
                 available. In particular, three basic types of side
                 information are investigated, and corresponding
                 learning algorithms are consequently introduced.
                 Further, top-k learning to rank from crowdsourcing
                 labels is explored to deal with long training ranking
                 lists. The proposed algorithms are tested on both
                 synthetic and real-world data. Results reveal that the
                 maximum likelihood estimation approach significantly
                 outperforms the average approach and existing
                 crowdsourcing regression methods. The performances of
                 the proposed algorithms are comparable to those of the
                 learning model trained with reliable labels. The
                 results of the investigation further indicate that side
                 information is helpful in inferring both ranking
                 functions and expertise degrees of annotators.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Shao:2016:SCI,
  author =       "Junming Shao and Qinli Yang and Hoang-Vu Dang and
                 Bertil Schmidt and Stefan Kramer",
  title =        "Scalable Clustering by Iterative Partitioning and
                 Point Attractor Representation",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934688",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering very large datasets while preserving
                 cluster quality remains a challenging data-mining task
                 to date. In this paper, we propose an effective
                 scalable clustering algorithm for large datasets that
                 builds upon the concept of synchronization. Inherited
                 from the powerful concept of synchronization, the
                 proposed algorithm, CIPA (Clustering by Iterative
                 Partitioning and Point Attractor Representations), is
                 capable of handling very large datasets by iteratively
                 partitioning them into thousands of subsets and
                 clustering each subset separately. Using dynamic
                 clustering by synchronization, each subset is then
                 represented by a set of point attractors and outliers.
                 Finally, CIPA identifies the cluster structure of the
                 original dataset by clustering the newly generated
                 dataset consisting of point attractors and outliers
                 from all subsets. We demonstrate that our new scalable
                 clustering approach has several attractive benefits:
                 (a) CIPA faithfully captures the cluster structure of
                 the original data by performing clustering on each
                 separate data subset iteratively instead of using any
                 sampling
                 or statistical summarization technique. (b) It allows
                 clustering very large datasets efficiently with high
                 cluster quality. (c) CIPA is parallelizable and also
                 suitable for distributed data. Extensive experiments
                 demonstrate the effectiveness and efficiency of our
                 approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
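
%%% The partition-then-represent strategy above is easy to prototype,
%%% with k-means standing in for the synchronization-based step CIPA
%%% actually uses: cluster each chunk, keep the local centers as
%%% ``attractors'', cluster the attractors, and assign every original
%%% point to its nearest final center.
%%%
%%%   import numpy as np
%%%   from sklearn.cluster import KMeans
%%%
%%%   def partitioned_clustering(X, n_chunks=10, k_local=5, k_final=3):
%%%       attractors = []
%%%       for chunk in np.array_split(X, n_chunks):
%%%           km = KMeans(n_clusters=k_local, n_init=3).fit(chunk)
%%%           attractors.append(km.cluster_centers_)
%%%       attractors = np.vstack(attractors)
%%%       final = KMeans(n_clusters=k_final, n_init=10).fit(attractors)
%%%       d = ((X[:, None, :] - final.cluster_centers_[None]) ** 2).sum(-1)
%%%       return d.argmin(axis=1)
%%%
%%%   rng = np.random.default_rng(0)
%%%   X = np.vstack([rng.normal(c, 0.3, (200, 2)) for c in (0, 3, 6)])
%%%   rng.shuffle(X)
%%%   labels = partitioned_clustering(X)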

@Article{Grabocka:2016:LTS,
  author =       "Josif Grabocka and Nicolas Schilling and Lars
                 Schmidt-Thieme",
  title =        "Latent Time-Series Motifs",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2940329",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Motifs are the most repetitive/frequent patterns of a
                 time-series. The discovery of motifs is crucial for
                 practitioners in order to understand and interpret the
                 phenomena occurring in sequential data. Currently,
                 motifs are searched among series sub-sequences, aiming
                 at selecting the most frequently occurring ones.
                 Search-based methods, which try out series
                 sub-sequences as motif candidates, are currently
                 believed to be the
                 best methods in finding the most frequent patterns.
                 However, this paper proposes an entirely new
                 perspective in finding motifs. We demonstrate that
                 searching is non-optimal since the domain of motifs is
                 restricted, and instead we propose a principled
                 optimization approach able to find optimal motifs. We
                 treat the occurrence frequency as a function and
                 time-series motifs as its parameters, therefore we
                 learn the optimal motifs that maximize the frequency
                 function. In contrast to searching, our method is able
                 to discover the most repetitive patterns (hence
                 optimal), even in cases where they do not explicitly
                 occur as sub-sequences. Experiments on several
                 real-life time-series datasets show that the motifs
                 found by our method are considerably more frequent
                 than the ones found through searching, for exactly the
                 same
                 distance threshold.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Zhang:2016:SNE,
  author =       "Xianchao Zhang and Linlin Zong and Quanzeng You and
                 Xing Yong",
  title =        "Sampling for {Nystr{\"o}m} Extension-Based Spectral
                 Clustering: Incremental Perspective and Novel
                 Analysis",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934693",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Sampling is the key aspect for Nystr{\"o}m extension
                 based spectral clustering. Traditional sampling schemes
                 select the set of landmark points as a whole and focus
                 on how to lower the matrix approximation error.
                 However, the matrix approximation error does not have
                 direct impact on the clustering performance. In this
                 article, we propose a sampling framework from an
                 incremental perspective, i.e., the landmark points are
                 selected one by one, and each next point to be sampled
                 is determined by previously selected landmark points.
                 Incremental sampling builds explicit relationships
                 among landmark points; thus, they work together well
                 and provide a theoretical guarantee on the clustering
                 performance. We provide two novel analysis methods and
                 propose two schemes for selecting the next point
                 within the framework. The first scheme is based on
                 clusterability
                 analysis, which provides a better guarantee on
                 clustering performance than schemes based on matrix
                 approximation error analysis. The second scheme is
                 based on loss analysis, which provides maximized
                 predictive ability of the landmark points on the
                 (implicit) labels of the unsampled points. Experimental
                 results on a wide range of benchmark datasets
                 demonstrate the superiorities of our proposed
                 incremental sampling schemes over existing sampling
                 schemes.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Qiao:2016:FST,
  author =       "Maoying Qiao and Richard Yi Da Xu and Wei Bian and
                 Dacheng Tao",
  title =        "Fast Sampling for Time-Varying Determinantal Point
                 Processes",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2943785",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Determinantal Point Processes (DPPs) are stochastic
                 models which assign each subset of a base dataset with
                 a probability proportional to the subset's degree of
                 diversity. It has been shown that DPPs are particularly
                 appropriate in data subset selection and summarization
                 (e.g., news display, video summarization). DPPs
                 prefer diverse subsets, which other conventional
                 models cannot offer. However, DPP inference algorithms
                 have a polynomial time complexity that makes it
                 difficult to
                 handle large and time-varying datasets, especially when
                 real-time processing is required. To address this
                 limitation, we developed a fast sampling algorithm for
                 DPPs which takes advantage of the nature of some
                 time-varying data (e.g., news corpora updating,
                 communication network evolving), where the data changes
                 between time stamps are relatively small. The proposed
                 algorithm is built upon the simplification of marginal
                 density functions over successive time stamps and the
                 sequential Monte Carlo (SMC) sampling technique.
                 Evaluations on both a real-world news dataset and the
                 Enron Corpus confirm the efficiency of the proposed
                 algorithm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Crescenzi:2016:GIO,
  author =       "Pierluigi Crescenzi and Gianlorenzo D'Angelo and
                 Lorenzo Severini and Yllka Velaj",
  title =        "Greedily Improving Our Own Closeness Centrality in a
                 Network",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2953882",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The closeness centrality is a well-known measure of
                 importance of a vertex within a given complex network.
                 Having high closeness centrality can have positive
                 impact on the vertex itself: hence, in this paper we
                 consider the optimization problem of determining how
                 much a vertex can increase its centrality by creating a
                 limited amount of new edges incident to it. We will
                 consider both the undirected and the directed graph
                 cases. In both cases, we first prove that the
                 optimization problem does not admit a polynomial-time
                 approximation scheme (unless P = NP), and then propose
                 a greedy approximation algorithm (with an almost tight
                 approximation ratio), whose performance is then tested
                 on synthetic graphs and real-world networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
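
%%% The greedy algorithm described above can be sketched directly;
%%% here each step recomputes closeness exactly via networkx, whereas
%%% the paper proves an approximation guarantee and works at scale.
%%%
%%%   import networkx as nx
%%%
%%%   def greedily_add_edges(G, v, k):
%%%       """Add k edges at v, each maximizing v's closeness."""
%%%       for _ in range(k):
%%%           candidates = set(G) - set(G[v]) - {v}
%%%           if not candidates:
%%%               break
%%%           def gain(u):
%%%               G.add_edge(v, u)
%%%               c = nx.closeness_centrality(G, u=v)
%%%               G.remove_edge(v, u)
%%%               return c
%%%           G.add_edge(v, max(candidates, key=gain))
%%%       return G
%%%
%%%   G = nx.path_graph(8)              # 0-1-2-...-7
%%%   greedily_add_edges(G, v=0, k=2)
%%%   print(sorted(G[0]))               # endpoints of the new shortcuts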

@Article{Li:2016:CBN,
  author =       "Xiang Li and Charles X. Ling and Huaimin Wang",
  title =        "The Convergence Behavior of Naive {Bayes} on Large
                 Sparse Datasets",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "10:1--10:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948068",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Large and sparse datasets with a lot of missing values
                 are common in the big data era, such as user behaviors
                 over a large number of items. Classification in such
                 datasets is an important topic for machine learning and
                 data mining. Practically, naive Bayes is still a
                 popular classification algorithm for large sparse
                 datasets, as its time and space complexity scales
                 linearly with the size of non-missing values. However,
                 several important questions about the behavior of naive
                 Bayes are yet to be answered. For example, how do
                 different missing-data mechanisms, data sparsity, and
                 the number of attributes systematically affect the
                 learning curves and convergence? In this paper, we
                 address several common missing-data mechanisms and
                 propose novel data generation methods based on these
                 mechanisms. We generate large and sparse data
                 systematically, and study the entire AUC (Area Under
                 ROC Curve) learning curve and convergence behavior of
                 naive Bayes. We not only have several important
                 experiment observations, but also provide detailed
                 theoretic studies. Finally, we summarize our empirical
                 and theoretic results as an intuitive decision
                 flowchart and a useful guideline for classifying large
                 sparse datasets in practice.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
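%%% Editorial sketch: the abstract above hinges on naive Bayes scaling
%%% linearly with the number of non-missing values. A minimal
%%% multinomial-style sketch over a SciPy sparse matrix; function
%%% names are illustrative, not from the paper.
%%%
%%%   import numpy as np
%%%
%%%   def fit_nb(X, y, alpha=1.0):
%%%       # X: scipy.sparse.csr_matrix of counts, y: 0/1 label array.
%%%       # Training touches only the stored (non-missing) values.
%%%       priors, loglik = {}, {}
%%%       for c in (0, 1):
%%%           Xc = X[y == c]
%%%           counts = np.asarray(Xc.sum(axis=0)).ravel() + alpha
%%%           loglik[c] = np.log(counts / counts.sum())
%%%           priors[c] = np.log(Xc.shape[0] / X.shape[0])
%%%       return priors, loglik
%%%
%%%   def log_odds(x, priors, loglik):
%%%       # x: one sparse row; cost is proportional to its non-zeros.
%%%       s = {c: priors[c] + x.dot(loglik[c]).item() for c in (0, 1)}
%%%       return s[1] - s[0]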

@Article{Fu:2016:MGD,
  author =       "Yanjie Fu and Hui Xiong and Yong Ge and Yu Zheng and
                 Zijun Yao and Zhi-Hua Zhou",
  title =        "Modeling of Geographic Dependencies for Real Estate
                 Ranking",
  journal =      j-TKDD,
  volume =       "11",
  number =       "1",
  pages =        "11:1--11:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934692",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Aug 29 07:28:30 MDT 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "It is traditionally a challenge for home buyers to
                 understand, compare, and contrast the investment value
                 of real estate. Although a number of appraisal methods
                 have been developed to value real properties, the
                 performances of these methods have been limited by
                 traditional data sources for real estate appraisal.
                 With the development of new ways of collecting
                 estate-related mobile data, there is a potential to
                 leverage geographic dependencies of real estate for
                 enhancing real estate appraisal. Indeed, the geographic
                 dependencies of the investment value of an estate can
                 be from the characteristics of its own neighborhood
                 (individual), the values of its nearby estates (peer),
                 and the prosperity of the affiliated latent business
                 area (zone). To this end, in this paper, we propose a
                 geographic method, named ClusRanking, for real estate
                 appraisal by leveraging the mutual enforcement of
                 ranking and clustering power. ClusRanking is able to
                 exploit geographic individual, peer, and zone
                 dependencies in a probabilistic ranking model.
                 Specifically, we first extract the geographic utility
                 of estates from geography data, estimate the
                 neighborhood popularity of estates by mining taxicab
                 trajectory data, and model the influence of latent
                 business areas. Also, we fuse these three influential
                 factors and predict real estate investment value.
                 Moreover, we simultaneously consider individual, peer
                 and zone dependencies, and derive an estate-specific
                 ranking likelihood as the objective function.
                 Furthermore, we propose an improved method named
                 CR-ClusRanking by incorporating check-in information
                 as a regularization term, which reduces the performance
                 volatility of the real estate ranking system. Finally, we
                 conduct a comprehensive evaluation with the real
                 estate-related data of Beijing, and the experimental
                 results demonstrate the effectiveness of our proposed
                 methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Gao:2016:DAC,
  author =       "Zekai J. Gao and Chris Jermaine",
  title =        "Distributed Algorithms for Computing Very Large
                 Thresholded Covariance Matrices",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2935750",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Computation of covariance matrices from observed data
                 is an important problem, as such matrices are used in
                 applications such as principal component analysis
                 (PCA), linear discriminant analysis (LDA), and
                 increasingly in the learning and application of
                 probabilistic graphical models. However, computing an
                 empirical covariance matrix is not always an easy
                 problem. There are two key difficulties associated with
                 computing such a matrix from a very high-dimensional
                 dataset. The first problem is over-fitting. For a
                 $p$-dimensional covariance matrix, there are $ p(p - 1)
                 / 2$ unique, off-diagonal entries in the empirical
                 covariance matrix $S$. For large $p$ (say, $ p > 10^5$),
                 the size $n$ of the dataset is often much smaller than
                 the number of covariances to compute. Over-fitting is a
                 concern in any situation in which the number of
                 parameters learned can greatly exceed the size of the
                 dataset. Thus, there are strong theoretical reasons to
                 expect that for high-dimensional data-even Gaussian
                 data-the empirical covariance matrix is not a good
                 estimate for the true covariance matrix underlying the
                 generative process. The second problem is
                 computational. Computing a covariance matrix takes $
                 O(n p^2)$ time. For large $p$ (greater than 10,000) and
                 $n$ much greater than $p$, this is debilitating. In
                 this article, we consider how both of these
                 difficulties can be handled simultaneously.
                 Specifically, a key regularization technique for
                 high-dimensional covariance estimation is thresholding,
                 in which the smallest or least significant entries in
                 the covariance matrix are simply dropped and replaced
                 with the value $0$. This suggests an obvious way to
                 address the computational difficulty as well: First,
                 compute the identities of the $K$ entries in the
                 covariance matrix that are actually important in the
                 sense that they will not be removed during
                 thresholding, and then in a second step, compute the
                 values of those entries. This can be done in $ O(K n)$
                 time. If $ K \ll p^2$ and the identities of the
                 important entries can be computed in reasonable time,
                 then this is a big win. The key technical contribution
                 of this article is the design and implementation of two
                 different distributed algorithms for approximating the
                 identities of the important entries quickly, using
                 sampling. We have implemented these methods and tested
                 them using an 800-core compute cluster. Experiments
                 have been run using real datasets having millions of
                 data points and up to 40,000 dimensions. These
                 experiments show that the proposed methods are both
                 accurate and efficient.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
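%%% Editorial sketch: the two-step strategy in the abstract above
%%% (guess which entries survive thresholding, then compute only those
%%% $K$ entries exactly in O(Kn) time) can be illustrated serially. A
%%% toy single-machine sketch with NumPy; the article's contribution
%%% is the distributed, sampling-based version of step one.
%%%
%%%   import numpy as np
%%%
%%%   def thresholded_cov(X, threshold, sample_rows=1000):
%%%       n, p = X.shape
%%%       idx = np.random.choice(n, min(sample_rows, n), replace=False)
%%%       S_hat = np.cov(X[idx], rowvar=False)   # cheap, noisy estimate
%%%       keep = np.argwhere(np.abs(np.triu(S_hat, 1)) >= threshold)
%%%       Xc = X - X.mean(axis=0)                # center once
%%%       exact = {}
%%%       for i, j in keep:                      # exact pass, O(K n)
%%%           exact[(i, j)] = Xc[:, i] @ Xc[:, j] / (n - 1)
%%%       return exact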

@Article{Wang:2016:WKI,
  author =       "Chenguang Wang and Yangqiu Song and Dan Roth and Ming
                 Zhang and Jiawei Han",
  title =        "World Knowledge as Indirect Supervision for Document
                 Clustering",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2953881",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "One of the key obstacles in making learning protocols
                 realistic in applications is the need to supervise
                 them, a costly process that often requires hiring
                 domain experts. We consider a framework that uses
                 world knowledge as indirect supervision. World
                 knowledge is general-purpose knowledge, which is not
                 designed for any specific domain. Then, the key
                 challenges are how to adapt the world knowledge to
                 domains and how to represent it for learning. In this
                 article, we provide an example of using world knowledge
                 for domain-dependent document clustering. We provide
                 three ways to specify the world knowledge to domains by
                 resolving the ambiguity of the entities and their
                 types, and represent the data with world knowledge as a
                 heterogeneous information network. Then, we propose a
                 clustering algorithm that can cluster multiple types
                 and incorporate the sub-type information as
                 constraints. In the experiments, we use two existing
                 knowledge bases as our sources of world knowledge. One
                 is Freebase, which is collaboratively collected
                 knowledge about entities and their organizations. The
                 other is YAGO2, a knowledge base automatically
                 extracted from Wikipedia that maps knowledge to the
                 linguistic knowledge base WordNet. Experimental
                 results on two text benchmark datasets (20newsgroups
                 and RCV1) show that incorporating world knowledge as
                 indirect supervision can significantly outperform the
                 state-of-the-art clustering algorithms as well as
                 clustering algorithms enhanced with world knowledge
                 features. A preliminary version of this work appeared
                 in the proceedings of KDD 2015 [Wang et al. 2015a].
                 This journal version has made several major
                 improvements. First, we have proposed a new and general
                 learning framework for machine learning with world
                 knowledge as indirect supervision, where document
                 clustering is a special case in the original paper.
                 Second, in order to make our unsupervised semantic
                 parsing method more understandable, we add several real
                 cases from the original sentences to the resulting
                 logic forms with all the necessary information. Third,
                 we add details of the three semantic filtering methods
                 and conduct deep analysis of the three semantic
                 filters, by using case studies to show why the
                 conceptualization-based semantic filter can produce
                 more accurate indirect supervision. Finally, in
                 addition to the experiment on 20 newsgroup data and
                 Freebase, we have extended the experiments on
                 clustering results by using all the combinations of
                 text (20 newsgroup, MCAT, CCAT, ECAT) and world
                 knowledge sources (Freebase, YAGO2).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chakraborty:2016:PCS,
  author =       "Tanmoy Chakraborty and Sriram Srinivasan and Niloy
                 Ganguly and Animesh Mukherjee and Sanjukta Bhowmick",
  title =        "Permanence and Community Structure in Complex
                 Networks",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2953883",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The goal of community detection algorithms is to
                 identify densely connected units within large networks.
                 An implicit assumption is that all the constituent
                 nodes belong equally to their associated community.
                 However, some nodes are more important in the community
                 than others. To date, efforts have been primarily made
                 to identify communities as a whole, rather than
                 understanding to what extent an individual node belongs
                 to its community. Therefore, most metrics for
                 evaluating communities, for example modularity, are
                 global. These metrics produce a score for each
                 community, not for each individual node. In this
                 article, we argue that the belongingness of nodes in a
                 community is not uniform. We quantify the degree of
                 belongingness of a vertex within a community by a new
                 vertex-based metric called permanence. The central idea
                 of permanence is based on the observation that the
                 strength of membership of a vertex to a community
                 depends upon two factors: (i) the extent of connections
                 of the vertex within its community versus outside its
                 community, and (ii) how tightly the vertex is connected
                 internally. We present the formulation of permanence
                 based on these two quantities. We demonstrate that
                 compared to other existing metrics (such as modularity,
                 conductance, and cut-ratio), the change in permanence
                 is more commensurate to the level of perturbation in
                 ground-truth communities. We discuss how permanence can
                 help us understand and utilize the structure and
                 evolution of communities by demonstrating that it can
                 be used to: (i) measure the persistence of a vertex
                 in a community, (ii) design strategies to strengthen
                 the community structure, (iii) explore the
                 core-periphery structure within a community, and (iv)
                 select suitable initiators for message spreading. We
                 further show that permanence is an excellent metric for
                 identifying communities. We demonstrate that the
                 process of maximizing permanence (abbreviated as
                 MaxPerm) produces meaningful communities that concur
                 with the ground-truth community structure of the
                 networks more accurately than eight other popular
                 community detection algorithms. Finally, we provide
                 mathematical proofs to demonstrate the correctness of
                 finding communities by maximizing permanence. In
                 particular, we show that the communities obtained by
                 this method are (i) less affected by the changes in
                 vertex ordering, and (ii) more resilient to resolution
                 limit, degeneracy of solutions, and asymptotic growth
                 of values.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
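%%% Editorial sketch: one common statement of the permanence score
%%% combines the vertex's internal pull I(v), its maximum external
%%% pull to any single other community E_max(v), its degree D(v), and
%%% the clustering coefficient c_in(v) among its internal neighbors:
%%% Perm(v) = (I(v)/E_max(v)) * (1/D(v)) - (1 - c_in(v)). A hedged
%%% reconstruction; consult the article for the exact definition.
%%%
%%%   import networkx as nx
%%%   from collections import Counter
%%%
%%%   def permanence(G, v, community):
%%%       # community: dict mapping node -> community label.
%%%       # Assumes v has at least one neighbor.
%%%       nbrs = list(G.neighbors(v))
%%%       internal = [u for u in nbrs if community[u] == community[v]]
%%%       ext = Counter(community[u] for u in nbrs
%%%                     if community[u] != community[v])
%%%       e_max = max(ext.values()) if ext else 1
%%%       c_in = nx.clustering(G.subgraph(internal + [v]), v)
%%%       return (len(internal) / e_max) / G.degree(v) - (1.0 - c_in)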

@Article{Smith:2016:PNN,
  author =       "Laura M. Smith and Linhong Zhu and Kristina Lerman and
                 Allon G. Percus",
  title =        "Partitioning Networks with Node Attributes by
                 Compressing Information Flow",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2968451",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Real-world networks are often organized as modules or
                 communities of similar nodes that serve as functional
                 units. These networks are also rich in content, with
                 nodes having distinguished features or attributes. In
                 order to discover a network's modular structure, it is
                 necessary to take into account not only its links but
                 also node attributes. We describe an
                 information-theoretic method that identifies modules by
                 compressing descriptions of information flow on a
                 network. Our formulation introduces node content into
                 the description of information flow, which we then
                 minimize to discover groups of nodes with similar
                 attributes that also tend to trap the flow of
                 information. The method is conceptually simple and does
                 not require ad-hoc parameters to specify the number of
                 modules or to control the relative contribution of
                 links and node attributes to network structure. We
                 apply the proposed method to partition real-world
                 networks with known community structure. We demonstrate
                 that adding node attributes helps recover the
                 underlying community structure in content-rich networks
                 more effectively than using links alone. In addition,
                 we show that our method is faster and more accurate
                 than alternative state-of-the-art algorithms.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yu:2016:SAO,
  author =       "Kui Yu and Xindong Wu and Wei Ding and Jian Pei",
  title =        "Scalable and Accurate Online Feature Selection for Big
                 Data",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2976744",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Feature selection is important in many big data
                 applications. Two critical challenges are closely
                 associated with big data. First, in many big data
                 applications, the dimensionality is extremely high, in
                 the millions, and keeps growing. Second, big data
                 applications call for highly scalable feature selection
                 algorithms that operate in an online manner, such that
                 each feature can be processed in a sequential scan. In
                 this paper, we present SAOLA, a {Scalable and Accurate
                 OnLine Approach} for feature selection. With a
                 theoretical analysis on bounds of
                 the pairwise correlations between features, SAOLA
                 employs novel pairwise comparison techniques and
                 maintains a parsimonious model over time in an online
                 manner. Furthermore, to deal with upcoming features
                 that arrive by groups, we extend the SAOLA algorithm,
                 and then propose a new group-SAOLA algorithm for online
                 group feature selection. The group-SAOLA algorithm can
                 maintain, in an online manner, a set of feature groups
                 that is sparse
                 at the levels of both groups and individual features
                 simultaneously. An empirical study using a series of
                 benchmark real datasets shows that our two algorithms,
                 SAOLA and group-SAOLA, are scalable on datasets of
                 extremely high dimensionality and have superior
                 performance over the state-of-the-art feature selection
                 methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
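%%% Editorial sketch: the flavor of online pairwise-comparison
%%% selection described above, reduced to a toy criterion. A new
%%% feature is kept only if sufficiently correlated with the class,
%%% and kept features are dropped when a newcomer renders them
%%% redundant. Plain Pearson correlation stands in for the paper's
%%% correlation bounds; all names are illustrative.
%%%
%%%   import numpy as np
%%%
%%%   def online_select(stream, y, delta=0.1):
%%%       selected = {}                       # name -> feature column
%%%       for name, x in stream:              # features arrive one by one
%%%           rel = abs(np.corrcoef(x, y)[0, 1])
%%%           if rel < delta:
%%%               continue                    # not relevant enough
%%%           redundant = False
%%%           for s, xs in list(selected.items()):
%%%               r_xs = abs(np.corrcoef(x, xs)[0, 1])
%%%               rel_s = abs(np.corrcoef(xs, y)[0, 1])
%%%               if r_xs >= rel and rel_s >= rel:
%%%                   redundant = True        # existing feature covers x
%%%                   break
%%%               if r_xs >= rel_s and rel >= rel_s:
%%%                   del selected[s]         # x covers existing feature
%%%           if not redundant:
%%%               selected[name] = x
%%%       return selected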

@Article{Liu:2016:SAU,
  author =       "Bin Liu and Yao Wu and Neil Zhenqiang Gong and Junjie
                 Wu and Hui Xiong and Martin Ester",
  title =        "Structural Analysis of User Choices for Mobile App
                 Recommendation",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2983533",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Advances in smartphone technology have promoted the
                 rapid development of mobile apps. However, the
                 availability of a huge number of mobile apps in
                 application stores has imposed the challenge of finding
                 the right apps to meet the user needs. Indeed, there is
                 a critical demand for personalized app recommendations.
                 Along this line, there are opportunities and challenges
                 posed by two unique characteristics of mobile apps.
                 First, app markets have organized apps in a
                 hierarchical taxonomy. Second, apps with similar
                 functionalities are competing with each other. Although
                 there are a variety of approaches for mobile app
                 recommendations, these approaches do not have a focus
                 on dealing with these opportunities and challenges. To
                 this end, in this article, we provide a systematic
                 study for addressing these challenges. Specifically, we
                 develop a structural user choice model (SUCM) to learn
                 fine-grained user preferences by exploiting the
                 hierarchical taxonomy of apps as well as the
                 competitive relationships among apps. Moreover, we
                 design an efficient learning algorithm to estimate the
                 parameters for the SUCM model. Finally, we perform
                 extensive experiments on a large app adoption dataset
                 collected from Google Play. The results show that SUCM
                 consistently outperforms state-of-the-art Top-N
                 recommendation methods by a significant margin.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Afrati:2016:APD,
  author =       "Foto Afrati and Shlomi Dolev and Ephraim Korach and
                 Shantanu Sharma and Jeffrey D. Ullman",
  title =        "Assignment Problems of Different-Sized Inputs in
                 {MapReduce}",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2987376",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "A MapReduce algorithm can be described by a mapping
                 schema, which assigns inputs to a set of reducers, such
                 that for each required output there exists a reducer
                 that receives all the inputs participating in the
                 computation of this output. Reducers have a capacity
                 that limits the sets of inputs they can be assigned.
                 However, individual inputs may vary in terms of size.
                 We consider, for the first time, mapping schemas where
                 input sizes are part of the considerations and
                 restrictions. One of the significant parameters to
                 optimize in any MapReduce job is communication cost
                 between the map and reduce phases. The communication
                 cost can be optimized by minimizing the number of
                 copies of inputs sent to the reducers. The
                 communication cost is closely related to the number of
                 reducers of constrained capacity that are used to
                 appropriately accommodate the inputs, so that the
                 requirement of which inputs must meet in a reducer is
                 satisfied. In this work, we consider a family of
                 problems where it is required that each input meets
                 with each other input in at least one reducer. We also
                 consider a slightly different family of problems in
                 which each input of a list, X, is required to meet each
                 input of another list, Y, in at least one reducer. We
                 prove that finding an optimal mapping schema for these
                 families of problems is NP-hard, and present a
                 bin-packing-based approximation algorithm for finding a
                 near-optimal mapping schema.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
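%%% Editorial sketch: a bin-packing-based construction in the spirit
%%% of the approximation described above, for the case where every
%%% pair of inputs must meet. Inputs are first-fit packed into bins of
%%% capacity q/2, and one reducer of capacity q is opened per bin and
%%% per pair of bins, so any two inputs share at least one reducer.
%%% Assumes every input size is at most q/2; not the paper's exact
%%% construction.
%%%
%%%   from itertools import combinations
%%%
%%%   def all_pairs_schema(sizes, q):
%%%       bins = []                            # each: {"load", "items"}
%%%       for i in sorted(range(len(sizes)), key=lambda i: -sizes[i]):
%%%           for b in bins:                   # first-fit decreasing
%%%               if b["load"] + sizes[i] <= q / 2:
%%%                   b["load"] += sizes[i]
%%%                   b["items"].append(i)
%%%                   break
%%%           else:
%%%               bins.append({"load": sizes[i], "items": [i]})
%%%       reducers = [b["items"] for b in bins]       # within-bin pairs
%%%       reducers += [b1["items"] + b2["items"]      # cross-bin pairs
%%%                    for b1, b2 in combinations(bins, 2)]
%%%       return reducers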

@Article{Wang:2016:UHM,
  author =       "Zhongyuan Wang and Fang Wang and Haixun Wang and
                 Zhirui Hu and Jun Yan and Fangtao Li and Ji-Rong Wen
                 and Zhoujun Li",
  title =        "Unsupervised Head-Modifier Detection in Search
                 Queries",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2988235",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Interpreting the user intent in search queries is a
                 key task in query understanding. Query intent
                 classification has been widely studied. In this
                 article, we go one step further to understand the query
                 from the view of head-modifier analysis. For example,
                 given the query ``popular iphone 5 smart cover,''
                 instead of using coarse-grained semantic classes (e.g.,
                 find electronic product), we interpret that ``smart
                 cover'' is the head or the intent of the query and
                 ``iphone 5'' is its modifier. Query head-modifier
                 detection can help search engines to obtain
                 particularly relevant content, which is also important
                 for applications such as ads matching and query
                 recommendation. We introduce an unsupervised semantic
                 approach for query head-modifier detection. First, we
                 mine a large number of instance-level head-modifier
                 pairs from search logs. Then, we develop a
                 conceptualization mechanism to generalize the
                 instance-level pairs to the concept level. Finally, we derive
                 weighted concept patterns that are concise, accurate,
                 and have strong generalization power in head-modifier
                 detection. The developed mechanism has been used in
                 production for search relevance and ads matching. We
                 use extensive experiment results to demonstrate the
                 effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chang:2016:LMB,
  author =       "Yi Chang and Makoto Yamada and Antonio Ortega and Yan
                 Liu",
  title =        "Lifecycle Modeling for Buzz Temporal Pattern
                 Discovery",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2994605",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In social media analysis, one critical task is
                 detecting a burst of topics or buzz, which is reflected
                 by extremely frequent mentions of certain keywords in a
                 short-time interval. Detecting buzz not only provides
                 useful insights into the information propagation
                 mechanism, but also plays an essential role in
                 preventing malicious rumors. However, buzz modeling is
                 a challenging task because a buzz time-series often
                 exhibits sudden spikes and heavy tails, wherein most
                 existing time-series models fail. In this article, we
                 propose novel buzz modeling approaches that capture the
                 rise and fade temporal patterns via Product Lifecycle
                 (PLC) model, a classical concept in economics. More
                 specifically, we propose to model multiple peaks in
                 buzz time-series with PLC mixture or PLC group mixture
                 and develop a probabilistic graphical model, K-Mixture
                 of Product Lifecycle (K-MPLC), to automatically
                 discover inherent lifecycle patterns within a
                 collection of buzzes. Furthermore, we effectively
                 utilize the model parameters of PLC mixture or PLC
                 group mixture for burst prediction. Our experimental
                 results show that our proposed methods significantly
                 outperform existing leading approaches on buzz
                 clustering and buzz-type prediction.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wei:2016:NBG,
  author =       "Qiang Wei and Dandan Qiao and Jin Zhang and Guoqing
                 Chen and Xunhua Guo",
  title =        "A Novel Bipartite Graph Based Competitiveness Degree
                 Analysis from Query Logs",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "21:1--21:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996196",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Competitiveness degree analysis is a focal point of
                 business strategy and competitive intelligence, aimed
                 at helping managers closely monitor to what extent their
                 rivals are competing with them. This article proposes a
                 novel method, namely BCQ, to measure the
                 competitiveness degree between peers from query logs as
                 an important form of user generated contents, which
                 reflects the ``wisdom of crowds'' from the search
                 engine users' perspective. In doing so, a bipartite
                 graph model is developed to capture the competitive
                 relationships through conjoint attributes hidden in
                 query logs, where the notion of competitiveness degree
                 for entity pairs is introduced, and then used to
                 identify the competitive paths mapped in the bipartite
                 graph. Subsequently, extensive experiments are
                 conducted to demonstrate the effectiveness of BCQ to
                 quantify the competitiveness degrees. Experimental
                 results reveal that BCQ can well support competitors
                 ranking, which is helpful for devising competitive
                 strategies and pursuing market performance. In
                 addition, efficiency experiments on synthetic data show
                 good scalability of BCQ on large-scale query logs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Pei:2016:CCP,
  author =       "Yuanli Pei and Xiaoli Z. Fern and Teresa Vania Tjahja
                 and R{\'o}mer Rosales",
  title =        "Comparing Clustering with Pairwise and Relative
                 Constraints: A Unified Framework",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "22:1--22:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996467",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering can be improved with the help of side
                 information about the similarity relationships among
                 instances. Such information has been commonly
                 represented by two types of constraints: pairwise
                 constraints and relative constraints, regarding
                 similarities about instance pairs and triplets,
                 respectively. Prior work has mostly considered these
                 two types of constraints separately and developed
                 individual algorithms to learn from each type. In
                 practice, however, it is critical to understand/compare
                 the usefulness of the two types of constraints as well
                 as the cost of acquiring them, which has not been
                 studied before. This paper provides an extensive
                 comparison of clustering with these two types of
                 constraints. Specifically, we compare their impacts
                 both on human users that provide such constraints and
                 on the learning system that incorporates such
                 constraints into clustering. In addition, to ensure
                 that the comparison of clustering is performed on equal
                 ground (without the potential bias introduced by
                 different learning algorithms), we propose a
                 probabilistic semi-supervised clustering framework that
                 can learn from either type of constraints. Our
                 experiments demonstrate that the proposed
                 semi-supervised clustering framework is highly
                 effective at utilizing both types of constraints to aid
                 clustering. Our user study provides valuable insights
                 regarding the impact of the constraints on human users,
                 and our experiments on clustering with the
                 human-labeled constraints reveal that relative
                 constraints are often more efficient at improving
                 clustering.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
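%%% Editorial sketch: the two constraint types compared above, written
%%% as satisfaction checks against a candidate clustering (labels maps
%%% each instance to a cluster id; helper names are illustrative).
%%%
%%%   def satisfies_pairwise(labels, must_link, cannot_link):
%%%       return (all(labels[i] == labels[j] for i, j in must_link) and
%%%               all(labels[i] != labels[j] for i, j in cannot_link))
%%%
%%%   def satisfies_relative(labels, triplets):
%%%       # (a, b, c): a is more similar to b than to c, so a must not
%%%       # be grouped with c unless it is also grouped with b.
%%%       return all(labels[a] == labels[b] or labels[a] != labels[c]
%%%                  for a, b, c in triplets)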

@Article{Lorenzetti:2016:MTS,
  author =       "Carlos Lorenzetti and Ana Maguitman and David Leake
                 and Filippo Menczer and Thomas Reichherzer",
  title =        "Mining for Topics to Suggest Knowledge Model
                 Extensions",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "23:1--23:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997657",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Electronic concept maps, interlinked with other
                 concept maps and multimedia resources, can provide rich
                 knowledge models to capture and share human knowledge.
                 This article presents and evaluates methods to support
                 experts as they extend existing knowledge models, by
                 suggesting new context-relevant topics mined from Web
                 search engines. The task of generating topics to
                 support knowledge model extension raises two research
                 questions: first, how to extract topic descriptors and
                 discriminators from concept maps; and second, how to
                 use these topic descriptors and discriminators to
                 identify candidate topics on the Web with the right
                 balance of novelty and relevance. To address these
                 questions, this article first develops the theoretical
                 framework required for a ``topic suggester'' to aid
                 information search in the context of a knowledge model
                 under construction. It then presents and evaluates
                 algorithms based on this framework and applied in
                 Extender, an implemented tool for topic suggestion.
                 Extender has been developed and tested within
                 CmapTools, a widely used system for supporting
                 knowledge modeling using concept maps. However, the
                 generality of the algorithms makes them applicable to a
                 broad class of knowledge modeling systems, and to Web
                 search in general.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
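%%% Editorial sketch: a toy rendering of the descriptor versus
%%% discriminator distinction raised above; descriptors are terms
%%% frequent within the topic, while discriminators are additionally
%%% rare in other topics. Hypothetical scoring, not the article's
%%% formulation.
%%%
%%%   from collections import Counter
%%%
%%%   def descriptors_and_discriminators(topic_terms, other_terms, k=5):
%%%       tf, of = Counter(topic_terms), Counter(other_terms)
%%%       descriptors = [t for t, _ in tf.most_common(k)]
%%%       discriminators = sorted(tf, key=lambda t: tf[t] / (1 + of[t]),
%%%                               reverse=True)[:k]
%%%       return descriptors, discriminators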

@Article{Kumar:2016:ACT,
  author =       "Dheeraj Kumar and James C. Bezdek and Sutharshan
                 Rajasegarar and Marimuthu Palaniswami and Christopher
                 Leckie and Jeffrey Chan and Jayavardhana Gubbi",
  title =        "Adaptive Cluster Tendency Visualization and Anomaly
                 Detection for Streaming Data",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "24:1--24:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2997656",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The growth in pervasive network infrastructure called
                 the Internet of Things (IoT) enables a wide range of
                 physical objects and environments to be monitored in
                 fine spatial and temporal detail. The detailed, dynamic
                 data that are collected in large quantities from sensor
                 devices provide the basis for a variety of
                 applications. Automatic interpretation of these
                 evolving large data is required for timely detection of
                 interesting events. This article develops and
                 exemplifies two new relatives of the visual assessment
                 of tendency (VAT) and improved visual assessment of
                 tendency (iVAT) models, which use cluster heat maps to
                 visualize structure in static datasets. One new model
                 is initialized with a static VAT/iVAT image, and then
                 incrementally (hence inc-VAT/inc-iVAT) updates the
                 current minimal spanning tree (MST) used by VAT with an
                 efficient edge insertion scheme. Similarly,
                 dec-VAT/dec-iVAT efficiently removes a node from the
                 current VAT MST. A sequence of inc-iVAT/dec-iVAT images
                 can be used for (visual) anomaly detection in evolving
                 data streams and for sliding window based cluster
                 assessment for time series data. The method is
                 illustrated with four real datasets (three of them
                 being smart city IoT data). The evaluation demonstrates
                 the algorithms' ability to successfully isolate
                 anomalies and visualize changing cluster structure in
                 the streaming data.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
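%%% Editorial sketch: the static VAT reordering that inc-VAT/dec-VAT
%%% maintain incrementally. Objects are ordered by a Prim-like
%%% traversal of the dissimilarity matrix D, and the reordered matrix
%%% is displayed as a cluster heat map. Minimal (cubic-time) sketch
%%% assuming NumPy; the article's incremental updates avoid this cost.
%%%
%%%   import numpy as np
%%%
%%%   def vat_order(D):
%%%       n = D.shape[0]
%%%       # Start from one endpoint of the largest dissimilarity.
%%%       i = np.unravel_index(np.argmax(D), D.shape)[0]
%%%       order, rest = [i], set(range(n)) - {i}
%%%       while rest:                 # nearest unvisited object next
%%%           j = min(rest, key=lambda r: min(D[r, o] for o in order))
%%%           order.append(j)
%%%           rest.remove(j)
%%%       return order                # view D[np.ix_(order, order)]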

@Article{Zhu:2016:EVM,
  author =       "Wen-Yuan Zhu and Wen-Chih Peng and Ling-Jyh Chen and
                 Kai Zheng and Xiaofang Zhou",
  title =        "Exploiting Viral Marketing for Location Promotion in
                 Location-Based Social Networks",
  journal =      j-TKDD,
  volume =       "11",
  number =       "2",
  pages =        "25:1--25:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001938",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Dec 26 17:17:00 MST 2016",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "With the explosion of smartphones and social network
                 services, location-based social networks (LBSNs) are
                 increasingly seen as tools for businesses (e.g.,
                 restaurants and hotels) to promote their products and
                 services. In this article, we investigate the key
                 techniques that can help businesses promote their
                 locations by advertising wisely through the underlying
                 LBSNs. In order to maximize the benefit of location
                 promotion, we formalize it as an influence maximization
                 problem in an LBSN, i.e., given a target location and
                 an LBSN, a set of k users (called seeds) should be
                 advertised initially such that they can successfully
                 propagate and attract many other users to visit the
                 target location. Existing studies have proposed
                 different ways to calculate the information propagation
                 probability, that is, how likely it is that a user may
                 influence another, in the setting of a static social
                 network. However, it is more challenging to derive the
                 propagation probability in an LBSN since it is heavily
                 affected by the target location and the user mobility,
                 both of which are dynamic and query dependent. This
                 article proposes two user mobility models, namely the
                 Gaussian-based and distance-based mobility models, to
                 capture the check-in behavior of individual LBSN users,
                 based on which location-aware propagation probabilities
                 can be derived. Extensive experiments based on two real
                 LBSN datasets have demonstrated the superior
                 effectiveness of our proposals compared with existing
                 static models of propagation probabilities in truly
                 reflecting the information propagation in LBSNs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "25",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
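%%% Editorial sketch: the Gaussian-based mobility idea above, reduced
%%% to scoring a target location by its density under a 2-D Gaussian
%%% fitted to a user's check-ins. Illustrative only; the article
%%% derives full location-aware propagation probabilities from such
%%% models.
%%%
%%%   import numpy as np
%%%   from scipy.stats import multivariate_normal
%%%
%%%   def visit_score(checkins, target):
%%%       # checkins: (n, 2) array of coordinates; target: (2,) point.
%%%       mu = checkins.mean(axis=0)
%%%       cov = np.cov(checkins, rowvar=False) + 1e-6 * np.eye(2)
%%%       return multivariate_normal(mu, cov).pdf(target)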

@Article{Sariyuce:2017:GMF,
  author =       "Ahmet Erdem Sariy{\"u}ce and Kamer Kaya and Erik Saule
                 and {\"U}mit V. {\c{C}}ataly{\"u}rek",
  title =        "Graph Manipulations for Fast Centrality Computation",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "26:1--26:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3022668",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The betweenness and closeness metrics are widely used
                 metrics in many network analysis applications. Yet,
                 they are expensive to compute. For that reason, making
                 the betweenness and closeness centrality computations
                 faster is an important and well-studied problem. In
                 this work, we propose the framework BADIOS that
                 manipulates the graph by compressing it and splitting
                 into pieces so that the centrality computation can be
                 handled independently for each piece. Experimental
                 results show that the proposed techniques can be a
                 great arsenal to reduce the centrality computation time
                 for various types and sizes of networks. In particular,
                 it reduces the betweenness centrality computation time
                 of a 4.6 million edges graph from more than 5 days to
                 less than 16 hours. For the same graph, the closeness
                 computation time is decreased from more than 3 days to
                 6 hours (12.7x speedup).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "26",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
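%%% Editorial sketch: one manipulation in the spirit of BADIOS,
%%% iteratively stripping degree-1 vertices before the expensive
%%% centrality kernel runs (their contribution can be folded back in
%%% afterwards). Minimal sketch assuming networkx.
%%%
%%%   import networkx as nx
%%%
%%%   def strip_degree_one(G):
%%%       G = G.copy()
%%%       removed = []
%%%       stack = [v for v in G if G.degree(v) == 1]
%%%       while stack:
%%%           v = stack.pop()
%%%           if v not in G or G.degree(v) != 1:
%%%               continue                    # already handled
%%%           u = next(iter(G.neighbors(v)))
%%%           removed.append((v, u))          # remember for fold-back
%%%           G.remove_node(v)
%%%           if u in G and G.degree(u) == 1:
%%%               stack.append(u)
%%%       return G, removed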

@Article{Rozenshtein:2017:FDD,
  author =       "Polina Rozenshtein and Nikolaj Tatti and Aristides
                 Gionis",
  title =        "Finding Dynamic Dense Subgraphs",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "27:1--27:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046791",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Online social networks are often defined by
                 considering interactions of entities at an aggregate
                 level. For example, a call graph is formed among
                 individuals who have called each other at least once,
                 or at least k times. Similarly, in social-media
                 platforms, we consider implicit social networks among
                 users who have interacted in some way, e.g., have made
                 a conversation, have commented to the content of each
                 other, and so on. Such definitions have been used
                 widely in the literature and they have offered
                 significant insights regarding the structure of social
                 networks. However, it is obvious that they suffer from
                 a severe limitation: They neglect the precise time that
                 interactions among the network entities occur. In this
                 article, we consider interaction networks, where the
                 data description contains not only information about
                 the underlying topology of the social network, but also
                 the exact time instances that network entities
                 interact. In an interaction network, an edge is
                 associated with a timestamp, and multiple edges may
                 occur for the same pair of entities. Consequently,
                 interaction networks offer a more fine-grained
                 representation, which can be leveraged to reveal
                 otherwise hidden dynamic phenomena. In the setting of
                 interaction networks, we study the problem of
                 discovering dynamic dense subgraphs whose edges occur
                 in short time intervals. We view such subgraphs as
                 fingerprints of dynamic activity occurring within
                 network communities. Such communities represent groups
                 of individuals who interact with each other in specific
                 time instances, for example, a group of employees who
                 work on a project and whose interaction intensifies
                 before certain project milestones. We prove that the
                 problem we define is NP-hard, and we provide efficient
                 algorithms by adapting techniques for finding dense
                 subgraphs. We also show how to speed up the proposed
                 methods by exploiting concavity properties of our
                 objective function and by means of fractional
                 programming. We perform an extensive evaluation of the
                 proposed methods on synthetic and real datasets, which
                 demonstrates the validity of our approach and shows
                 that our algorithms can be used to obtain high-quality
                 results.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "27",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
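%%% Editorial sketch: the classic greedy peeling primitive for dense
%%% subgraphs that the article adapts to the temporal setting; it
%%% 2-approximates the subgraph maximizing the edge-to-node ratio.
%%% Minimal sketch assuming networkx and a non-empty input graph.
%%%
%%%   import networkx as nx
%%%
%%%   def densest_subgraph(G):
%%%       H = G.copy()
%%%       best = list(H.nodes())
%%%       best_d = H.number_of_edges() / H.number_of_nodes()
%%%       while H.number_of_nodes() > 1:
%%%           v = min(H.nodes(), key=H.degree)   # peel min-degree node
%%%           H.remove_node(v)
%%%           d = H.number_of_edges() / H.number_of_nodes()
%%%           if d > best_d:
%%%               best_d, best = d, list(H.nodes())
%%%       return best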

@Article{Liu:2017:MBM,
  author =       "Guannan Liu and Yanjie Fu and Guoqing Chen and Hui
                 Xiong and Can Chen",
  title =        "Modeling Buying Motives for Personalized Product
                 Bundle Recommendation",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "28:1--28:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3022185",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Product bundling is a marketing strategy that offers
                 several products/items for sale as one bundle. While
                 the bundling strategy has been widely used, less
                 efforts have been made to understand how items should
                 be bundled with respect to consumers' preferences and
                 buying motives for product bundles. This article
                 investigates the relationships between the items that
                 are bought together within a product bundle. To that
                 end, each purchased product bundle is formulated as a
                 bundle graph with items as nodes and the associations
                 between pairs of items in the bundle as edges. The
                 relationships between items can be analyzed by the
                 formation of edges in bundle graphs, which can be
                 attributed to the associations of feature aspects.
                 Then, a probabilistic model BPM (Bundle Purchases with
                 Motives) is proposed to capture the composition of each
                 bundle graph, with two latent factors node-type and
                 edge-type introduced to describe the feature aspects
                 and relationships respectively. Furthermore, based on
                 the preferences inferred from the model, an approach
                 for recommending items to form product bundles is
                 developed by estimating the probability that a consumer
                 would buy an associative item together with the item
                 already bought in the shopping cart. Finally,
                 experimental results on real-world transaction data
                 collected from well-known shopping sites show the
                 effectiveness advantages of the proposed approach over
                 other baseline methods. Moreover, the experiments also
                 show that the proposed model can explain consumers'
                 buying motives for product bundles in terms of
                 different node-types and edge-types.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "28",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Guo:2017:CSN,
  author =       "Ting Guo and Jia Wu and Xingquan Zhu and Chengqi
                 Zhang",
  title =        "Combining Structured Node Content and Topology
                 Information for Networked Graph Clustering",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "29:1--29:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996197",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Graphs are popularly used to represent objects with
                 shared dependency relationships. To date, all existing
                 graph clustering algorithms consider each node as a
                 single attribute or a set of independent attributes,
                 without realizing that content inside each node may
                 also have complex structures. In this article, we
                 formulate a new networked graph clustering task where a
                 network contains a set of inter-connected (or
                 networked) super-nodes, each of which is a
                 single-attribute graph. The new super-node
                 representation is applicable to many real-world
                 applications, such as a citation network where each
                 node denotes a paper whose content can be described as
                 a graph, and citation relationships between papers form
                 a networked graph (i.e., a super-graph). Networked
                 graph clustering aims to find similar node groups, each
                 of which contains nodes with similar content and
                 structure information. The main challenge is to
                 properly calculate the similarity between super-nodes
                 for clustering. To solve the problem, we propose to
                 characterize node similarity by integrating structure
                 and content information of each super-node. To measure
                 node content similarity, we use cosine distance by
                 considering overlapped attributes between two
                 super-nodes. To measure structure similarity, we
                 propose an Attributed Random Walk Kernel (ARWK) to
                 calculate the similarity between super-nodes. Detailed
                 node content analysis is also included to build
                 relationships between super-nodes with shared internal
                 structure information, so the structure similarity can
                 be calculated in a precise way. By integrating the
                 structure similarity and content similarity into one
                 matrix, spectral clustering is used to achieve
                 networked graph clustering. Our method enjoys sound
                 theoretical properties, including bounded similarities
                 and better structure similarity assessment than
                 traditional graph clustering methods. Experiments on
                 real-world applications demonstrate that our method
                 significantly outperforms baseline approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "29",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Liu:2017:IPV,
  author =       "Qi Liu and Biao Xiang and Nicholas Jing Yuan and
                 Enhong Chen and Hui Xiong and Yi Zheng and Yu Yang",
  title =        "An Influence Propagation View of {PageRank}",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "30:1--30:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046941",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "For a long time, PageRank has been widely used for
                 authority computation and has been adopted as a solid
                 baseline for evaluating social influence related
                 applications. However, when measuring the authority of
                 network nodes, the traditional PageRank method does not
                 take the nodes' prior knowledge into consideration.
                 Also, the connection between PageRank and social
                 influence modeling methods is not clearly established.
                 To that end, this article provides a focused study on
                 understanding PageRank as well as the relationship
                 between PageRank and social influence analysis. Along
                 this line, we first propose a linear social influence
                 model and reveal that this model generalizes the
                 PageRank-based authority computation by introducing
                 some constraints. Then, we show that the authority
                 computation by PageRank can be enhanced by exploiting
                 more reasonable constraints (e.g., from prior
                 knowledge). Next, to deal with the computational
                 challenge of the linear model with general constraints,
                 we
                 provide an upper bound for identifying nodes with top
                 authorities. Moreover, we extend the proposed linear
                 model for better measuring the authority of the given
                 node sets, and we also demonstrate the way to quickly
                 identify the top authoritative node sets. Finally,
                 extensive experimental evaluations on four real-world
                 networks validate the effectiveness of the proposed
                 linear model with respect to different constraint
                 settings. The results show that the methods with more
                 reasonable constraints can lead to better ranking and
                 recommendation performance. Meanwhile, the upper bounds
                 formed by PageRank values could be used to quickly
                 locate the nodes and node sets with the highest
                 authorities.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "30",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
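
%%% A minimal, hedged sketch of the PageRank power iteration that the
%%% abstract above generalizes.  The personalization vector v below is only
%%% an illustrative stand-in for "prior knowledge" constraints; it is not
%%% the authors' linear social influence model.

import numpy as np

def pagerank(A, damping=0.85, v=None, tol=1e-10, max_iter=200):
    """A[i, j] = 1 if there is an edge j -> i; returns authority scores."""
    n = A.shape[0]
    out_deg = A.sum(axis=0)
    out_deg[out_deg == 0] = 1.0              # guard against dangling nodes
    M = A / out_deg                          # column-stochastic transitions
    v = np.full(n, 1.0 / n) if v is None else v / v.sum()
    r = np.full(n, 1.0 / n)
    for _ in range(max_iter):
        r_next = damping * (M @ r) + (1.0 - damping) * v
        if np.abs(r_next - r).sum() < tol:
            break
        r = r_next
    return r

# Toy 4-node graph: edges 0->1, 1->2, 2->0, 3->2.
A = np.zeros((4, 4))
for src, dst in [(0, 1), (1, 2), (2, 0), (3, 2)]:
    A[dst, src] = 1.0
print(pagerank(A).round(4))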

@Article{Wang:2017:LMD,
  author =       "Sen Wang and Xue Li and Xiaojun Chang and Lina Yao
                 and Quan Z. Sheng and Guodong Long",
  title =        "Learning Multiple Diagnosis Codes for {ICU} Patients
                 with Local Disease Correlation Mining",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "31:1--31:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3003729",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In the era of big data, a mechanism that can
                 automatically annotate disease codes to patients'
                 records in the medical information system is in demand.
                 The purpose of this work is to propose a framework that
                 automatically annotates the disease labels of
                 multi-source patient data in Intensive Care Units
                 (ICUs). We extract features from two main sources,
                 medical charts and notes. The Bag-of-Words model is
                 used to encode the features. Unlike most of the
                 existing multi-label learning algorithms that globally
                 consider correlations between diseases, our model
                 learns disease correlation locally in the patient data.
                 To achieve this, we derive a local disease correlation
                 representation to enrich the discriminant power of each
                 patient data. This representation is embedded into a
                 unified multi-label learning framework. We develop an
                 alternating algorithm to iteratively optimize the
                 objective function. Extensive experiments have been
                 conducted on a real-world ICU database. We have
                 compared our algorithm with representative multi-label
                 learning algorithms. Evaluation results have shown that
                 our proposed method has state-of-the-art performance in
                 the annotation of multiple diagnostic codes for ICU
                 patients. This study suggests that problems in the
                 automated diagnosis code annotation can be reliably
                 addressed by using a multi-label learning model that
                 exploits disease correlation. The findings of this
                 study will greatly benefit health care and management
                 in ICUs, considering that automated diagnosis code
                 annotation can significantly improve the quality and
                 management of health care for both patients and
                 caregivers.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "31",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Bae:2017:SEF,
  author =       "Seung-Hee Bae and Daniel Halperin and Jevin D. West
                 and Martin Rosvall and Bill Howe",
  title =        "Scalable and Efficient Flow-Based Community Detection
                 for Large-Scale Graph Analysis",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "32:1--32:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2992785",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Community detection is an increasingly popular
                 approach to uncover important structures in large
                 networks. Flow-based community detection methods rely
                 on communication patterns of the network rather than
                 structural properties to determine communities. The
                 Infomap algorithm in particular optimizes a novel
                 objective function called the map equation and has been
                 shown to outperform other approaches in third-party
                 benchmarks. However, Infomap and its variants are
                 inherently sequential, limiting their use for
                 large-scale graphs. In this article, we propose a novel
                 algorithm to optimize the map equation called RelaxMap.
                 RelaxMap provides two important improvements over
                 Infomap: parallelization, so that the map equation can
                 be optimized over much larger graphs, and
                 prioritization, so that the most important work occurs
                 first, iterations take less time, and the algorithm
                 converges faster. We implement these techniques using
                 OpenMP on shared-memory multicore systems, and evaluate
                 our approach on a variety of graphs from standard graph
                 clustering benchmarks as well as real graph datasets.
                 Our evaluation shows that both techniques are
                 effective: RelaxMap achieves 70\% parallel efficiency
                 on eight cores, and prioritization improves algorithm
                 performance by an additional 20--50\% on average,
                 depending on the graph properties. Additionally,
                 RelaxMap converges in a similar number of iterations
                 and provides solutions of quality equivalent to the
                 serial Infomap implementation.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "32",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
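
%%% For reference, the two-level map equation that Infomap and RelaxMap
%%% minimize, in its standard form from the literature (reproduced here as
%%% a hedged aid to the abstract above, not taken from the article itself):
%%%
%%%   \[
%%%     L(\mathsf{M}) = q_{\curvearrowright}\, H(\mathcal{Q})
%%%                   + \sum_{i=1}^{m} p^{i}_{\circlearrowright}\, H(\mathcal{P}^{i})
%%%   \]
%%%
%%% where q_{\curvearrowright} is the probability that the random walker
%%% switches modules, H(\mathcal{Q}) is the entropy of the module exit
%%% probabilities, p^{i}_{\circlearrowright} is the fraction of time spent
%%% in module i, H(\mathcal{P}^{i}) is the entropy of within-module visit
%%% rates, and \mathsf{M} ranges over partitions of the nodes into m
%%% modules.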

@Article{Peng:2017:RGR,
  author =       "Chong Peng and Zhao Kang and Yunhong Hu and Jie Cheng
                 and Qiang Cheng",
  title =        "Robust Graph Regularized Nonnegative Matrix
                 Factorization for Clustering",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "33:1--33:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3003730",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Matrix factorization is often used for data
                 representation in many data mining and machine-learning
                 problems. In particular, for a dataset without any
                 negative entries, nonnegative matrix factorization
                 (NMF) is often used to find a low-rank approximation by
                 the product of two nonnegative matrices. With reduced
                 dimensions, these matrices can be effectively used for
                 many applications such as clustering. The existing
                 methods of NMF are often afflicted by their
                 sensitivity to outliers and noise in the data. To
                 mitigate this drawback, in this paper, we consider
                 integrating NMF into a robust principal component
                 model, and design a robust formulation that effectively
                 captures noise and outliers in the approximation while
                 incorporating essential nonlinear structures. A set of
                 comprehensive empirical evaluations in clustering
                 applications demonstrates that the proposed method has
                 strong robustness to gross errors and superior
                 performance to current state-of-the-art methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "33",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
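
%%% A minimal sketch of the classic multiplicative-update NMF baseline
%%% (Lee--Seung updates) that the robust formulation above builds on; the
%%% robust, graph-regularized variant itself is not reproduced here.

import numpy as np

def nmf(X, rank, n_iter=200, eps=1e-9, seed=0):
    """Factor a nonnegative X (m x n) as W (m x rank) @ H (rank x n)."""
    rng = np.random.default_rng(seed)
    m, n = X.shape
    W = rng.random((m, rank)) + eps
    H = rng.random((rank, n)) + eps
    for _ in range(n_iter):
        H *= (W.T @ X) / (W.T @ W @ H + eps)    # update H; stays nonnegative
        W *= (X @ H.T) / (W @ H @ H.T + eps)    # update W likewise
    return W, H

X = np.abs(np.random.default_rng(1).random((20, 12)))
W, H = nmf(X, rank=3)
print("reconstruction error:", round(float(np.linalg.norm(X - W @ H)), 4))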

@Article{Tang:2017:PSS,
  author =       "Xun Tang and Maha Alabduljalil and Xin Jin and Tao
                 Yang",
  title =        "Partitioned Similarity Search with Cache-Conscious
                 Data Traversal",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "34:1--34:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3014060",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "All pairs similarity search (APSS) is used in many web
                 search and data mining applications. Previous work has
                 used techniques such as comparison filtering, inverted
                 indexing, and parallel accumulation of partial results.
                 However, shuffling intermediate results can incur
                 significant communication overhead as data scales up.
                 This paper studies a scalable two-phase approach called
                 Partition-based Similarity Search (PSS). The first
                 phase is to partition the data and group vectors that
                 are potentially similar. The second phase is to run a
                 set of tasks where each task compares a partition of
                 vectors with other candidate partitions. Due to data
                 sparsity and the presence of memory hierarchy,
                 accessing feature vectors during the partition
                 comparison phase incurs significant overhead. This
                 paper introduces a cache-conscious design for data
                 layout and traversal to reduce access time through
                 size-controlled data splitting and vector coalescing,
                 and it provides an analysis to guide the choice of
                 optimization parameters. The evaluation results show
                 that, for the tested datasets, the proposed approach
                 eliminates unnecessary I/O and data communication early
                 while sustaining parallel efficiency, yielding an order
                 of magnitude of performance improvement; it can also be
                 integrated with LSH for approximate APSS.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "34",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
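
%%% A hedged sketch of the two-phase idea only (partition, then compare),
%%% not the article's cache-conscious layout: vectors are grouped by a crude
%%% signature of my own choosing (heaviest feature), and cosine similarity
%%% is then computed within candidate partitions.

import math
from collections import defaultdict

def cosine(u, v):
    """Cosine similarity of two sparse vectors given as {feature: weight}."""
    dot = sum(w * v[f] for f, w in u.items() if f in v)
    nu = math.sqrt(sum(w * w for w in u.values()))
    nv = math.sqrt(sum(w * w for w in v.values()))
    return dot / (nu * nv) if nu and nv else 0.0

def all_pairs(vectors, threshold=0.5):
    # Phase 1: partition vectors by their heaviest feature (toy grouping).
    parts = defaultdict(list)
    for vid, vec in vectors.items():
        parts[max(vec, key=vec.get)].append(vid)
    # Phase 2: compare within each partition (real PSS also compares
    # candidate partition pairs; omitted to keep the sketch short).
    for members in parts.values():
        for i, a in enumerate(members):
            for b in members[i + 1:]:
                s = cosine(vectors[a], vectors[b])
                if s >= threshold:
                    yield a, b, s

vecs = {"d1": {"x": 2.0, "y": 1.0}, "d2": {"x": 1.5, "y": 0.9},
        "d3": {"z": 1.0}}
print(list(all_pairs(vecs)))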

@Article{Feng:2017:RBC,
  author =       "Shanshan Feng and Jian Cao and Jie Wang and Shiyou
                 Qian",
  title =        "Recommendations Based on Comprehensively Exploiting
                 the Latent Factors Hidden in Items' Ratings and
                 Content",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "35:1--35:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3003728",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "To improve the performance of recommender systems in a
                 practical manner, several hybrid approaches have been
                 developed by considering item ratings and content
                 information simultaneously. However, most of these
                 hybrid approaches make recommendations based on
                 aggregating different recommendation techniques using
                 various strategies, rather than considering joint
                 modeling of the item's ratings and content, and thus
                 fail to detect many latent factors that could
                 potentially improve the performance of the recommender
                 systems. For this reason, these approaches continue to
                 suffer from data sparsity and do not work well for
                 recommending items to individual users. A few studies
                 try to describe a user's preference by detecting items'
                 latent features from content-description texts as
                 compensation for the sparse ratings. Unfortunately,
                 most of these methods are still generally unable to
                 accomplish recommendation tasks well for two reasons:
                 (1) they learn latent factors from text descriptions or
                 user--item ratings independently, rather than combining
                 them together; and (2) influences of latent factors
                 hidden in texts and ratings are not fully explored. In
                 this study, we propose a probabilistic approach that we
                 denote as latent random walk (LRW) based on the
                 combination of an integrated latent topic model and
                 random walk (RW) with the restart method, which can be
                 used to rank items according to expected user
                 preferences by detecting both their explicit and
                 implicit correlative information, in order to recommend
                 top-ranked items to potentially interested users. As
                 presented in this article, the goal of this work is to
                 comprehensively discover latent factors hidden in
                 items' ratings and content in order to alleviate the
                 data sparsity problem and to improve the performance of
                 recommender systems. The proposed topic model provides
                 a generative probabilistic framework that discovers
                 users' implicit preferences and items' latent features
                 simultaneously by exploiting both ratings and item
                 content information. On the basis of this probabilistic
                 framework, RWR can predict a user's preference for
                 unrated items by discovering global latent relations.
                 In order to show the efficiency of the proposed
                 approach, we test LRW and other state-of-the-art
                 methods on three real-world datasets, namely,
                 CAMRa2011, Yahoo!, and APP. The experiments indicate
                 that our approach outperforms all comparative methods
                 and, in addition, that it is less sensitive to the data
                 sparsity problem, thus demonstrating the robustness of
                 LRW for recommendation tasks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "35",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
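
%%% A hedged sketch of random walk with restart (RWR), the ranking
%%% component the LRW approach above builds on; the integrated topic model
%%% is not reproduced here, and the toy graph is purely illustrative.

import numpy as np

def rwr(W, seed, restart=0.15, tol=1e-10, max_iter=500):
    """W: row-stochastic transition matrix; seed: index of the query node."""
    n = W.shape[0]
    e = np.zeros(n)
    e[seed] = 1.0                          # restart distribution
    p = e.copy()
    for _ in range(max_iter):
        p_next = (1 - restart) * (W.T @ p) + restart * e
        if np.abs(p_next - p).sum() < tol:
            break
        p = p_next
    return p                               # stationary relevance scores

# Toy 5-node graph standing in for a collapsed user--item graph.
A = np.array([[0, 1, 1, 0, 0],
              [1, 0, 1, 0, 0],
              [1, 1, 0, 1, 0],
              [0, 0, 1, 0, 1],
              [0, 0, 0, 1, 0]], dtype=float)
W = A / A.sum(axis=1, keepdims=True)      # row-normalize
print(rwr(W, seed=0).round(3))            # rank all nodes w.r.t. node 0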

@Article{Liu:2017:SPM,
  author =       "Xutong Liu and Feng Chen and Yen-Cheng Lu and
                 Chang-Tien Lu",
  title =        "Spatial Prediction for Multivariate Non-{Gaussian}
                 Data",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "36:1--36:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3022669",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "With the ever increasing volume of geo-referenced
                 datasets, there is a real need for better statistical
                 estimation and prediction techniques for spatial
                 analysis. Most existing approaches focus on predicting
                 multivariate Gaussian spatial processes, but as the
                 data may consist of non-Gaussian (or mixed type)
                 variables, this creates two challenges: (1) how to
                 accurately capture the dependencies among different
                 data types, both Gaussian and non-Gaussian; and (2) how
                 to efficiently predict multivariate non-Gaussian
                 spatial processes. In this article, we propose a
                 generic approach for predicting multiple response
                 variables of mixed types. The proposed approach
                 accurately captures cross-spatial dependencies among
                 response variables and reduces the computational burden
                 by projecting the spatial process to a lower
                 dimensional space with knot-based techniques. Efficient
                 approximations are provided to estimate posterior
                 marginals of latent variables for the predictive
                 process, and extensive experimental evaluations based
                 on both simulation and real-life datasets are provided
                 to demonstrate the effectiveness and efficiency of this
                 new approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "36",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2017:MDP,
  author =       "Liang Wang and Zhiwen Yu and Bin Guo and Tao Ku and
                 Fei Yi",
  title =        "Moving Destination Prediction Using Sparse Dataset: a
                 Mobility Gradient Descent Approach",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "37:1--37:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3051128",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Moving destination prediction offers an important
                 category of location-based applications and provides
                 essential intelligence to business and governments. In
                 existing studies, a common approach to destination
                 prediction is to match the given query trajectory with
                 massive recorded trajectories by similarity
                 calculation. Unfortunately, due to privacy concerns,
                 budget constraints, and many other factors, in most
                 circumstances, we can only obtain a sparse trajectory
                 dataset. In a sparse dataset, the available moving
                 trajectories are far from enough to cover all possible
                 query trajectories; thus, the predictability of the
                 matching-based approach decreases markedly. Toward
                 destination prediction with a sparse dataset, instead
                 of searching for similar trajectories over the sparse
                 records, we alternatively examine the changes in
                 distance from sampling locations to the final
                 destination along the query trajectory. The underlying
                 idea is intuitive and directly motivated by travel
                 purpose: people always get closer to their final
                 destination as they move. By borrowing the concept of
                 gradient descent from optimization theory, we propose a
                 novel moving destination prediction approach, namely
                 MGDPre. Building upon mobility gradient descent, MGDPre
                 investigates only the behavior characteristics of the
                 query trajectory itself, without matching historical
                 trajectories, and thus is applicable to sparse
                 datasets. We evaluate our approach through extensive
                 experiments using GPS trajectories generated by a
                 sample of taxis over a 10-day period in Shenzhen,
                 China. The results demonstrate that our approach
                 outperforms state-of-the-art baseline methods in
                 effectiveness, efficiency, and scalability.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "37",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
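
%%% A hedged sketch of the mobility-gradient intuition only (not the
%%% authors' MGDPre algorithm): score each candidate destination by how
%%% consistently the distance from successive trajectory samples to it
%%% decreases.  The scoring rule and toy data are illustrative assumptions.

import math

def dist(p, q):
    return math.hypot(p[0] - q[0], p[1] - q[1])

def gradient_score(trajectory, candidate):
    """Fraction of consecutive steps that move closer to the candidate."""
    ds = [dist(p, candidate) for p in trajectory]
    closer = sum(1 for a, b in zip(ds, ds[1:]) if b < a)
    return closer / (len(ds) - 1)

def predict_destination(trajectory, candidates):
    return max(candidates, key=lambda c: gradient_score(trajectory, c))

traj = [(0.0, 0.0), (1.0, 0.5), (2.0, 1.0), (3.0, 1.2)]   # partial query trip
cands = [(6.0, 2.0), (-2.0, 4.0), (3.0, -5.0)]            # candidate endpoints
print(predict_destination(traj, cands))                   # -> (6.0, 2.0)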

@Article{Fountoulakis:2017:RRA,
  author =       "Kimon Fountoulakis and Abhisek Kundu and Eugenia-Maria
                 Kontopoulou and Petros Drineas",
  title =        "A Randomized Rounding Algorithm for Sparse {PCA}",
  journal =      j-TKDD,
  volume =       "11",
  number =       "3",
  pages =        "38:1--38:??",
  month =        apr,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046948",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jul 24 17:32:52 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/tkdd/;
                 https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We present and analyze a simple, two-step algorithm to
                 approximate the optimal solution of the sparse PCA
                 problem. In the proposed approach, we first solve an
                 $l_1$-penalized version of the NP-hard sparse PCA
                 optimization problem and then we use a randomized
                 rounding strategy to sparsify the resulting dense
                 solution. Our main theoretical result guarantees an
                 additive error approximation and provides a tradeoff
                 between sparsity and accuracy. Extensive experimental
                 evaluation indicates that the proposed approach is
                 competitive in practice, even compared to
                 state-of-the-art toolboxes such as Spasm.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "38",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
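
%%% A hedged sketch of the rounding step only: given a dense unit vector x
%%% from an l1-penalized relaxation (assumed as input), keep coordinate i
%%% with probability proportional to |x_i| and zero it otherwise, then
%%% renormalize.  This probability scheme is one plausible choice; the
%%% article's exact sampling distribution may differ.

import numpy as np

def randomized_round(x, target_nnz, seed=0):
    rng = np.random.default_rng(seed)
    probs = np.minimum(1.0, target_nnz * np.abs(x) / np.abs(x).sum())
    keep = rng.random(x.shape) < probs     # independent coin flip per entry
    y = np.where(keep, x, 0.0)             # sparsify the dense solution
    norm = np.linalg.norm(y)
    return y / norm if norm > 0 else y

x = np.array([0.70, 0.55, 0.30, 0.25, 0.15, 0.10, 0.05, 0.02])
x = x / np.linalg.norm(x)                  # dense relaxation solution (toy)
print(randomized_round(x, target_nnz=3))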

@Article{Aggarwal:2017:ISI,
  author =       "Charu C. Aggarwal",
  title =        "Introduction to Special Issue on the Best Papers from
                 {KDD 2016}",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "39:1--39:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092689",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This issue contains the best papers from the ACM KDD
                 Conference 2016. As is customary at KDD, special issue
                 papers are invited only from the research track. The
                 top-ranked papers from the KDD 2016 conference are
                 included in this issue, which contains a total of
                 six articles from different areas of data
                 mining. A brief description of these articles is also
                 provided in this article.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "39",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Cheng:2017:RCA,
  author =       "Wei Cheng and Jingchao Ni and Kai Zhang and Haifeng
                 Chen and Guofei Jiang and Yu Shi and Xiang Zhang and
                 Wei Wang",
  title =        "Ranking Causal Anomalies for System Fault Diagnosis
                 via Temporal and Dynamical Analysis on Vanishing
                 Correlations",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "40:1--40:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046946",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Detecting system anomalies is an important problem in
                 many fields such as security, fault management, and
                 industrial optimization. Recently, invariant networks
                 have been shown to be powerful in characterizing complex
                 system behaviours. In the invariant network, a node
                 represents a system component and an edge indicates a
                 stable, significant interaction between two components.
                 Structures and evolutions of the invariant network, in
                 particular the vanishing correlations, can shed
                 important light on locating causal anomalies and
                 performing diagnosis. However, existing approaches to
                 detect causal anomalies with the invariant network
                 often use the percentage of vanishing correlations to
                 rank possible causal components, which has several
                 limitations: (1) fault propagation in the network is
                 ignored, (2) the root causal anomalies may not always
                 be the nodes with a high percentage of vanishing
                 correlations, (3) temporal patterns of vanishing
                 correlations are not exploited for robust detection,
                 and (4) prior knowledge on anomalous nodes are not
                 exploited for (semi-)supervised detection. To address
                 these limitations, in this article we propose a network
                 diffusion based framework to identify significant
                 causal anomalies and rank them. Our approach can
                 effectively model fault propagation over the entire
                 invariant network and can perform joint inference on
                 both the structural and the time-evolving broken
                 invariance patterns. As a result, it can locate
                 high-confidence anomalies that are truly responsible
                 for the vanishing correlations and can compensate for
                 unstructured measurement noise in the system. Moreover,
                 when the prior knowledge on the anomalous status of
                 some nodes is available at certain time points, our
                 approach is able to leverage them to further enhance
                 the anomaly inference accuracy. When the prior
                 knowledge is noisy, our approach also automatically
                 learns reliable information and reduces the impact of
                 noise. By performing extensive experiments on
                 synthetic datasets, bank information system datasets,
                 and coal plant cyber-physical system datasets, we
                 demonstrate the effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "40",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
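
%%% A hedged sketch of network-diffusion ranking in the spirit of the
%%% abstract above (not the authors' exact model): seed scores from the
%%% fraction of vanishing correlations per node, then smooth them over the
%%% invariant network.  The toy graph and seed values are assumptions.

import numpy as np

def diffuse(A, seed_scores, c=0.5, n_iter=100):
    """A: symmetric adjacency of the invariant network; c: propagation."""
    deg = A.sum(axis=1)
    deg[deg == 0] = 1.0
    P = A / deg[:, None]                     # row-stochastic propagation
    r = seed_scores.copy()
    for _ in range(n_iter):                  # r = (1-c) e + c P^T r
        r = (1 - c) * seed_scores + c * (P.T @ r)
    return r

A = np.array([[0, 1, 1, 0],
              [1, 0, 1, 0],
              [1, 1, 0, 1],
              [0, 0, 1, 0]], dtype=float)
broken_fraction = np.array([0.1, 0.0, 0.8, 0.3])  # vanishing-correlation ratios
scores = diffuse(A, broken_fraction)
print(np.argsort(-scores))                        # nodes ranked by causal score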

@Article{Zhang:2017:CDM,
  author =       "Tianyang Zhang and Peng Cui and Christos Faloutsos and
                 Yunfei Lu and Hao Ye and Wenwu Zhu and Shiqiang Yang",
  title =        "{comeNgo}: a Dynamic Model for Social Group
                 Evolution",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "41:1--41:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3059214",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How do social groups, such as Facebook groups and
                 Wechat groups, dynamically evolve over time? How do
                 people join the social groups, uniformly or with burst?
                 What is the pattern of people quitting from groups? Is
                 there a simple universal model to depict the
                 come-and-go patterns of various groups? In this
                 article, we examine temporal evolution patterns of more
                 than 100 thousands social groups with more than 10
                 million users. We surprisingly find that the evolution
                 patterns of real social groups goes far beyond the
                 classic dynamic models like SI and SIR. For example, we
                 observe both diffusion and non-diffusion mechanism in
                 the group joining process, and power-law decay in group
                 quitting process, rather than exponential decay as
                 expected in SIR model. Therefore, we propose a new
                 model comeNgo, a concise yet flexible dynamic model for
                 group evolution. Our model has the following
                 advantages: (a) Unification power: it generalizes
                 earlier theoretical models and different joining and
                 quitting mechanisms we find from observation. (b)
                 Succinctness and interpretability: it contains only six
                 parameters with clear physical meanings. (c) Accuracy:
                 it can capture various kinds of group evolution
                 patterns preciously, and the goodness of fit increases
                 by 58\% over baseline. (d) Usefulness: it can be used
                 in multiple application scenarios, such as forecasting
                 and pattern discovery. Furthermore, our model can
                 provide insights about different evolution patterns of
                 social groups, and we also find that group structure
                 and its evolution has notable relations with temporal
                 patterns of group evolution.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "41",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chen:2017:CDI,
  author =       "Chen Chen and Hanghang Tong and Lei Xie and Lei Ying
                 and Qing He",
  title =        "Cross-Dependency Inference in Multi-Layered Networks:
                 a Collaborative Filtering Perspective",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "42:1--42:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3056562",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The increasingly connected world has catalyzed the
                 fusion of networks from different domains, which
                 facilitates the emergence of a new network
                 model-multi-layered networks. Examples of such kind of
                 network systems include critical infrastructure
                 networks, biological systems, organization-level
                 collaborations, cross-platform e-commerce, and so
                 forth. One crucial structure that distances
                 multi-layered network from other network models is its
                 cross-layer dependency, which describes the
                 associations between the nodes from different layers.
                 Needless to say, the cross-layer dependency in the
                 network plays an essential role in many data mining
                 applications like system robustness analysis and
                 complex network control. However, it remains a daunting
                 task to know the exact dependency relationships due to
                 noise, limited accessibility, and so forth. In this
                 article, we tackle the cross-layer dependency inference
                 problem by modeling it as a collective collaborative
                 filtering problem. Based on this idea, we propose an
                 effective algorithm Fascinate that can reveal
                 unobserved dependencies with linear complexity.
                 Moreover, we derive Fascinate-ZERO, an online variant
                 of Fascinate that can respond to a newly added node
                 timely by checking its neighborhood dependencies. We
                 perform extensive evaluations on real datasets to
                 substantiate the superiority of our proposed
                 approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "42",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{DeStefani:2017:TCL,
  author =       "Lorenzo {De Stefani} and Alessandro Epasto and Matteo
                 Riondato and Eli Upfal",
  title =        "{TRI{\`E}ST}: Counting Local and Global Triangles in
                 Fully Dynamic Streams with Fixed Memory Size",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "43:1--43:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3059194",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "``Ogni lassada xe persa.''$^1$ --- Proverb from
                 Trieste, Italy. We present tri{\`e}st, a suite of
                 one-pass streaming algorithms to compute unbiased,
                 low-variance, high-quality approximations of the global
                 and local (i.e., incident to each vertex) number of
                 triangles in a fully dynamic graph represented as an
                 adversarial stream of edge insertions and deletions.
                 Our algorithms use reservoir sampling and its variants
                 to exploit the user-specified memory space at all
                 times. This is in contrast with previous approaches,
                 which require hard-to-choose parameters (e.g., a fixed
                 sampling probability) and offer no guarantees on the
                 amount of memory they use. We analyze the variance of
                 the estimations and show novel concentration bounds for
                 these quantities. Our experimental results on very
                 large graphs demonstrate that tri{\`e}st outperforms
                 state-of-the-art approaches in accuracy and exhibits a
                 small update time.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "43",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
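
%%% A hedged, insertion-only sketch of the reservoir idea behind
%%% tri{\`e}st: keep a fixed-size uniform sample of edges and count the
%%% triangles closed within the sample.  The real algorithms also handle
%%% deletions and rescale counts into unbiased estimates; both are omitted.

import random
from collections import defaultdict

def stream_triangles(edge_stream, memory=1000, seed=0):
    rng = random.Random(seed)
    reservoir = []                         # fixed-capacity edge sample
    adj = defaultdict(set)                 # adjacency of sampled edges
    sampled_triangles = 0
    for t, (u, v) in enumerate(edge_stream, start=1):
        if len(reservoir) < memory:
            keep, evict = True, None
        elif rng.random() < memory / t:    # classic reservoir replacement
            keep, evict = True, rng.randrange(memory)
        else:
            keep, evict = False, None
        if keep:
            if evict is not None:
                a, b = reservoir[evict]
                adj[a].discard(b); adj[b].discard(a)
                reservoir[evict] = (u, v)
            else:
                reservoir.append((u, v))
            sampled_triangles += len(adj[u] & adj[v])  # newly closed wedges
            adj[u].add(v); adj[v].add(u)
    return sampled_triangles               # unscaled count on the sample

edges = [(0, 1), (1, 2), (0, 2), (2, 3), (3, 0)]
print(stream_triangles(edges, memory=10))  # stream fits entirely -> 2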

@Article{Hooi:2017:GBF,
  author =       "Bryan Hooi and Kijung Shin and Hyun Ah Song and Alex
                 Beutel and Neil Shah and Christos Faloutsos",
  title =        "Graph-Based Fraud Detection in the Face of
                 Camouflage",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "44:1--44:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3056563",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a bipartite graph of users and the products that
                 they review, or followers and followees, how can we
                 detect fake reviews or follows? Existing fraud
                 detection methods (spectral, etc.) try to identify
                 dense subgraphs of nodes that are sparsely connected to
                 the remaining graph. Fraudsters can evade these methods
                 using camouflage, by adding reviews or follows with
                 honest targets so that they look ``normal.'' Even
                 worse, some fraudsters use hijacked accounts from
                 honest users, and then the camouflage is indeed
                 organic. Our focus is to spot fraudsters in the
                 presence of camouflage or hijacked accounts. We propose
                 FRAUDAR, an algorithm that (a) is camouflage resistant,
                 (b) provides upper bounds on the effectiveness of
                 fraudsters, and (c) is effective in real-world data.
                 Experimental results under various attacks show that
                 FRAUDAR outperforms the top competitor in accuracy of
                 detecting both camouflaged and non-camouflaged fraud.
                 Additionally, in real-world experiments with a Twitter
                 follower--followee graph of 1.47 billion edges, FRAUDAR
                 successfully detected a subgraph of more than 4,000
                 accounts, of which a majority had tweets
                 showing that they used follower-buying services.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "44",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
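
%%% A hedged sketch of greedy peeling for the densest subgraph
%%% (average-degree objective), the classic routine that FRAUDAR
%%% generalizes with its camouflage-resistant, column-weighted objective
%%% (the weighted objective itself is not reproduced here).

import heapq
from collections import defaultdict

def densest_subgraph(edges):
    adj = defaultdict(set)
    for u, v in edges:
        adj[u].add(v); adj[v].add(u)
    nodes = set(adj)
    m = len(edges)
    heap = [(len(adj[u]), u) for u in nodes]
    heapq.heapify(heap)
    best, best_density = set(nodes), m / len(nodes)
    removed = set()
    while len(nodes) > 1:
        while True:                        # pop a node with current min degree
            d, u = heapq.heappop(heap)
            if u not in removed and d == len(adj[u]):
                break
        removed.add(u); nodes.discard(u)
        m -= len(adj[u])
        for w in adj.pop(u):               # update neighbors' degrees
            adj[w].discard(u)
            heapq.heappush(heap, (len(adj[w]), w))
        density = m / len(nodes)
        if density > best_density:         # remember the densest prefix
            best, best_density = set(nodes), density
    return best, best_density

edges = [(0, 1), (0, 2), (1, 2), (2, 3), (4, 5)]
print(densest_subgraph(edges))             # -> ({0, 1, 2}, 1.0)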

@Article{Anderson:2017:AHE,
  author =       "Ashton Anderson and Jon Kleinberg and Sendhil
                 Mullainathan",
  title =        "Assessing Human Error Against a Benchmark of
                 Perfection",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "45:1--45:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046947",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "An increasing number of domains are providing us with
                 detailed trace data on human decisions in settings
                 where we can evaluate the quality of these decisions
                 via an algorithm. Motivated by this development, an
                 emerging line of work has begun to consider whether we
                 can characterize and predict the kinds of decisions
                 where people are likely to make errors. To investigate
                 what a general framework for human error prediction
                 might look like, we focus on a model system with a rich
                 history in the behavioral sciences: the decisions made
                 by chess players as they select moves in a game. We
                 carry out our analysis at a large scale, employing
                 datasets with several million recorded games, and using
                 chess tablebases to acquire a form of ground truth for
                 a subset of chess positions that have been completely
                 solved by computers but remain challenging for even the
                 best players in the world. We organize our analysis
                 around three categories of features that we argue are
                 present in most settings where the analysis of human
                 error is applicable: the skill of the decision-maker,
                 the time available to make the decision, and the
                 inherent difficulty of the decision. We identify rich
                 structure in all three of these categories of features,
                 and find strong evidence that in our domain, features
                 describing the inherent difficulty of an instance are
                 significantly more powerful than features based on
                 skill or time.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "45",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2017:DCM,
  author =       "Yihan Wang and Shaoxu Song and Lei Chen and Jeffrey Xu
                 Yu and Hong Cheng",
  title =        "Discovering Conditional Matching Rules",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "46:1--46:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3070647",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Matching dependencies (MDs) have recently been
                 proposed to make data dependencies tolerant to various
                 information representations, and found useful in data
                 quality applications such as record matching. Instead
                 of the strict equality function used in traditional
                 dependency syntax (e.g., functional dependencies), MDs
                 specify constraints based on similarity and
                 identification. However, in practice, MDs may still be
                 too strict and applicable only in a subset of tuples in
                 a relation. We therefore study conditional matching
                 dependencies (CMDs), which bind matching dependencies
                 only in a certain part of a table, i.e., MDs
                 conditionally applicable in a subset of tuples.
                 Compared to MDs, CMDs have more expressive power that
                 enables them to satisfy wider application needs. In
                 this article, we study several important theoretical
                 and practical issues of CMDs, including irreducible
                 CMDs with respect to the implication, discovery of CMDs
                 from data, reliable CMDs agreed most by a relation,
                 approximate CMDs almost satisfied in a relation, and
                 finally applications of CMDs in record matching and
                 missing value repairing. Through an extensive
                 experimental evaluation on real datasets, we
                 demonstrate the efficiency of the proposed CMD
                 discovery algorithms and the effectiveness of CMDs in real
                 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "46",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Anagnostopoulos:2017:QDL,
  author =       "Christos Anagnostopoulos and Peter Triantafillou",
  title =        "Query-Driven Learning for Predictive Analytics of Data
                 Subspace Cardinality",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "47:1--47:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3059177",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Fundamental to many predictive analytics tasks is the
                 ability to estimate the cardinality (number of data
                 items) of multi-dimensional data subspaces, defined by
                 query selections over datasets. This is crucial for
                 data analysts dealing with, e.g., interactive data
                 subspace explorations, data subspace visualizations,
                 and in query processing optimization. However, in many
                 modern data systems, predictive analytics may be (i)
                 too costly money-wise, e.g., in clouds, (ii)
                 unreliable, e.g., in modern Big Data query engines,
                 where accurate statistics are difficult to
                 obtain/maintain, or (iii) infeasible, e.g., for privacy
                 issues. We contribute a novel, query-driven, function
                 estimation model of analyst-defined data subspace
                 cardinality. The proposed estimation model is highly
                 accurate in terms of prediction and accommodates the
                 well-known selection queries: multi-dimensional range
                 and distance-nearest neighbor (radius) queries. Our
                 function estimation model: (i) quantizes the vectorial
                 query space, by learning the analysts' access patterns
                 over a data space, (ii) associates query vectors with
                 their corresponding cardinalities of the
                 analyst-defined data subspaces, (iii) abstracts and
                 employs query vectorial similarity to predict the
                 cardinality of an unseen/unexplored data subspace, and
                 (iv) identifies and adapts to possible changes of the
                 query subspaces based on the theory of optimal
                 stopping. The proposed model is decentralized,
                 facilitating the scaling-out of such predictive
                 analytics queries. The research significance of the
                 model lies in that (i) it is an attractive solution
                 when data-driven statistical techniques are undesirable
                 or infeasible, (ii) it offers a scale-out,
                 decentralized training solution, (iii) it is applicable
                 to different selection query types, and (iv) it offers
                 a performance that is superior to that of data-driven
                 approaches.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "47",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wu:2017:LSO,
  author =       "Yue Wu and Steven C. H. Hoi and Tao Mei and Nenghai
                 Yu",
  title =        "Large-Scale Online Feature Selection for Ultra-High
                 Dimensional Sparse Data",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "48:1--48:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3070646",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Feature selection (FS) is an important technique in
                 machine learning and data mining, especially for
                 large-scale high-dimensional data. Most existing
                 studies have been restricted to batch learning, which
                 is often inefficient and poorly scalable when handling
                 big data in the real world. As real data may arrive
                 sequentially and continuously, batch learning has to
                 retrain the model for newly arriving data, which is
                 very computationally intensive. Online feature
                 selection (OFS) is a promising new paradigm that is
                 more efficient and scalable than batch learning
                 algorithms. However, existing online algorithms usually
                 fall short in efficacy. In this article,
                 we present a novel second-order OFS algorithm that is
                 simple yet effective, very fast and extremely scalable
                 to deal with large-scale ultra-high dimensional sparse
                 data streams. The basic idea is to exploit the
                 second-order information to choose the subset of
                 important features with high confidence weights. Unlike
                 existing OFS methods that often suffer from extra high
                 computational cost, we devise a novel algorithm with a
                 MaxHeap-based approach, which is not only more
                 effective than the existing first-order algorithms, but
                 also significantly more efficient and scalable. Our
                 extensive experiments validated that the proposed
                 technique achieves highly competitive accuracy as
                 compared with state-of-the-art batch FS methods,
                 while incurring computational cost that is orders of
                 magnitude lower. Impressively,
                 on a billion-scale synthetic dataset (1-billion
                 dimensions, 1-billion non-zero features, and 1-million
                 samples), the proposed algorithm takes less than 3
                 minutes to run on a single PC.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "48",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Costa:2017:MTA,
  author =       "Alceu Ferraz Costa and Yuto Yamaguchi and Agma Juci
                 Machado Traina and Caetano {Traina Jr.} and Christos
                 Faloutsos",
  title =        "Modeling Temporal Activity to Detect Anomalous
                 Behavior in Social Media",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "49:1--49:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3064884",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Social media has become a popular and important tool
                 for human communication. However, due to this
                 popularity, spam and the distribution of malicious
                 content by computer-controlled users, known as bots,
                 have become a widespread problem. At the same time, when
                 users use social media, they generate valuable data
                 that can be used to understand the patterns of human
                 communication. In this article, we focus on the
                 following important question: Can we identify and use
                 patterns of human communication to decide whether a
                 human or a bot controls a user? The first contribution
                 of this article is showing that the distribution of
                 inter-arrival times (IATs) between postings is
                 characterized by the following four patterns: (i)
                 heavy-tails, (ii) periodic-spikes, (iii) correlation
                 between consecutive values, and (iv) bimodality. As
                 our second contribution, we propose a mathematical
                 model named Act-M (Activity Model). We show that Act-M
                 can accurately fit the distribution of IATs from social
                 media users. Finally, we use Act-M to develop a method
                 that detects if users are bots based only on the timing
                 of their postings. We validate Act-M using data from
                 over 55 million postings from four social media
                 services: Reddit, Twitter, Stack-Overflow, and
                 Hacker-News. Our experiments show that Act-M provides a
                 more accurate fit to the data than existing models for
                 human dynamics. Additionally, when detecting bots,
                 Act-M provided a precision higher than 93\% and 77\%
                 with a sensitivity of 70\% for the Twitter and Reddit
                 datasets, respectively.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "49",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Vosoughi:2017:RGP,
  author =       "Soroush Vosoughi and Mostafa `Neo' Mohsenvand and Deb
                 Roy",
  title =        "Rumor Gauge: Predicting the Veracity of Rumors on
                 {Twitter}",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "50:1--50:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3070644",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The spread of malicious or accidental misinformation
                 in social media, especially in time-sensitive
                 situations, such as real-world emergencies, can have
                 harmful effects on individuals and society. In this
                 work, we developed models for automated verification of
                 rumors (unverified information) that propagate through
                 Twitter. To predict the veracity of rumors, we
                 identified salient features of rumors by examining
                 three aspects of information spread: linguistic style
                 used to express rumors, characteristics of people
                 involved in propagating information, and network
                 propagation dynamics. The veracity of a rumor (a
                 collection of tweets) is predicted from a time series
                 of these features using Hidden Markov Models. The
                 verification algorithm was trained and
                 tested on 209 rumors representing 938,806 tweets
                 collected from real-world events, including the 2013
                 Boston Marathon bombings, the 2014 Ferguson unrest, and
                 the 2014 Ebola epidemic, and many other rumors about
                 various real-world events reported on popular websites
                 that document public rumors. The algorithm was able to
                 correctly predict the veracity of 75\% of the rumors
                 faster than any other public source, including
                 journalists and law enforcement officials. The ability
                 to track rumors and predict their outcomes may have
                 practical applications for news consumers, financial
                 markets, journalists, and emergency services, and more
                 generally to help minimize the impact of false
                 information on Twitter.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "50",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Boutemine:2017:MCS,
  author =       "Oualid Boutemine and Mohamed Bouguessa",
  title =        "Mining Community Structures in Multidimensional
                 Networks",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "51:1--51:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3080574",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "We investigate the problem of community detection in
                 multidimensional networks, that is, networks where
                 entities engage in various interaction types
                 (dimensions) simultaneously. While some approaches have
                 been proposed to identify community structures in
                 multidimensional networks, there are a number of
                 problems still to solve. In fact, the majority of the
                 proposed approaches suffer from one or even more of the
                 following limitations: (1) difficulty detecting
                 communities in networks characterized by the presence
                 of many irrelevant dimensions, (2) lack of systematic
                 procedures to explicitly identify the relevant
                 dimensions of each community, and (3) dependence on a
                 set of user-supplied parameters, including the number
                 of communities, that require proper tuning. Most of
                 the existing approaches are inadequate for dealing with
                 these three issues in a unified framework. In this
                 paper, we develop a novel approach that is capable of
                 addressing the aforementioned limitations in a single
                 framework. The proposed approach allows automated
                 identification of communities and their sub-dimensional
                 spaces using a novel objective function and a
                 constrained label propagation-based optimization
                 strategy. By leveraging the relevance of dimensions at
                 the node level, the strategy aims to maximize the
                 number of relevant within-community links while keeping
                 track of the most relevant dimensions. A notable
                 feature of the proposed approach is that it is able to
                 automatically identify low dimensional community
                 structures embedded in a high dimensional space.
                 Experiments on synthetic and real multidimensional
                 networks illustrate the suitability of the new
                 method.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "51",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Algizawy:2017:RTL,
  author =       "Essam Algizawy and Tetsuji Ogawa and Ahmed El-Mahdy",
  title =        "Real-Time Large-Scale Map Matching Using Mobile Phone
                 Data",
  journal =      j-TKDD,
  volume =       "11",
  number =       "4",
  pages =        "52:1--52:??",
  month =        aug,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3046945",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Mon Jan 22 09:23:44 MST 2018",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "With the wide spread use of mobile phones, cellular
                 mobile big data is becoming an important resource that
                 provides a wealth of information with almost no cost.
                 However, the data generally suffers from relatively
                 high spatial granularity, limiting the scope of its
                 application. In this article, we consider, for the
                 first time, the utility of actual mobile big data for
                 map matching allowing for ``microscopic'' level traffic
                 analysis. The state-of-the-art in map matching
                 generally targets GPS data, which provides far denser
                 sampling and higher location resolution than the mobile
                 data. Our approach extends the typical Hidden-Markov
                 model used in map matching to accommodate highly
                 sparse location trajectories, exploit the large mobile
                 data volume to learn the model parameters, and exploit
                 the sparsity of the data to provide for real-time
                 Viterbi processing. We study an actual, anonymised
                 mobile trajectory data set of the city of Dakar,
                 Senegal, spanning a year, and generate a corresponding
                 road-level traffic density, at an hourly granularity,
                 for each mobile trajectory. We observed a relatively
                 high correlation between the generated traffic
                 intensities and corresponding values obtained by the
                 gravity and equilibrium models typically used in
                 mobility analysis, indicating the utility of the
                 approach as an alternative means for traffic
                 analysis.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "52",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{vanLeeuwen:2018:ETS,
  author =       "Matthijs van Leeuwen and Polo Chau and Jilles Vreeken
                 and Dafna Shahaf and Christos Faloutsos",
  title =        "Editorial: {TKDD} Special Issue on Interactive Data
                 Exploration and Analytics",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "1:1--1:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3181707",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "1",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Rayar:2018:VIS,
  author =       "Fr{\'e}d{\'e}ric Rayar and Sabine Barrat and Fatma
                 Bouali and Gilles Venturini",
  title =        "A Viewable Indexing Structure for the Interactive
                 Exploration of Dynamic and Large Image Collections",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "2:1--2:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3047011",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Thanks to the capturing devices cost reduction and the
                 advent of social networks, the size of image
                 collections is becoming extremely huge. Many works in
                 the literature have addressed the indexing of large
                 image collections for search purposes. However, there
                 is a lack of support for exploratory data mining. One
                 may want to wander around the images and experience
                 serendipity in the exploration process. Thus, effective
                 paradigms not only for organising but also for
                 visualising these image collections become necessary.
                 In this
                 article, we present a study to jointly index and
                 visualise large image collections. The work focuses on
                 satisfying three constraints. First, large image
                 collections, up to millions of images, shall be handled.
                 Second, dynamic collections, such as ever-growing
                 collections, shall be processed in an incremental way,
                 without reprocessing the whole collection at each
                 modification. Finally, an intuitive and interactive
                 exploration system shall be provided to the user to
                 allow him to easily mine image collections. To this
                 end, a data partitioning algorithm has been modified
                 and proximity graphs have been used to fit the
                 visualisation purpose. A custom web platform has been
                 implemented to visualise the hierarchical and
                 graph-based hybrid structure. The results of a user
                 evaluation we have conducted show that the exploration
                 of the collections is intuitive and smooth thanks to
                 the proposed structure. Furthermore, the scalability of
                 the proposed indexing method is proved using large
                 public image collections.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "2",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Makki:2018:AVV,
  author =       "Raheleh Makki and Eder Carvalho and Axel J. Soto and
                 Stephen Brooks and Maria Cristina {Ferreira De
                 Oliveira} and Evangelos Milios and Rosane Minghim",
  title =        "{ATR-Vis}: Visual and Interactive Information
                 Retrieval for Parliamentary Discussions in {Twitter}",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "3:1--3:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3047010",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The worldwide adoption of Twitter turned it into one
                 of the most popular platforms for content analysis as
                 it serves as a gauge of the public's feeling and
                 opinion on a variety of topics. This is particularly
                 true of political discussions and lawmakers' actions
                 and initiatives. Yet, one common but unrealistic
                 assumption is that the data of interest for analysis is
                 readily available in a comprehensive and accurate form.
                 Data need to be retrieved, but due to the brevity and
                 noisy nature of Twitter content, it is difficult to
                 formulate user queries that match relevant posts that
                 use different terminology without introducing a
                 considerable volume of unwanted content. This problem
                 is aggravated when the analysis must contemplate
                 multiple and related topics of interest, for which
                 comments are being concurrently posted. This article
                 presents Active Tweet Retrieval Visualization
                 (ATR-Vis), a user-driven visual approach for the
                 retrieval of Twitter content applicable to this
                 scenario. The method proposes a set of active retrieval
                 strategies to involve an analyst in such a way that a
                 major improvement in retrieval coverage and precision
                 is attained with minimal user effort. ATR-Vis enables
                 non-technical users to benefit from the aforementioned
                 active learning strategies by providing visual aids to
                 facilitate the requested supervision. This supports the
                 exploration of the space of potentially relevant
                 tweets, and affords a better understanding of the
                 retrieval results. We evaluate our approach in
                 scenarios in which the task is to retrieve tweets
                 related to multiple parliamentary debates within a
                 specific time span. We collected two Twitter datasets,
                 one associated with debates in the Canadian House of
                 Commons during a particular week in May 2014, and
                 another associated with debates in the Brazilian
                 Federal Senate during a selected week in May 2015. The
                 two use cases illustrate the effectiveness of ATR-Vis
                 for the retrieval of relevant tweets, while
                 quantitative results show that our approach achieves
                 high retrieval quality with a modest amount of
                 supervision. Finally, we evaluated our tool with three
                 external users who perform searching in social media as
                 part of their professional work.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "3",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Lim:2018:MEA,
  author =       "Yongsub Lim and Minsoo Jung and U. Kang",
  title =        "Memory-Efficient and Accurate Sampling for Counting
                 Local Triangles in Graph Streams: From Simple to
                 Multigraphs",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "4:1--4:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3022186",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "How can we estimate local triangle counts accurately
                 in a graph stream without storing the whole graph? How
                 can we handle duplicated edges in local triangle
                 counting for a graph stream? Local triangle counting,
                 which
                 computes the number of triangles attached to each node
                 in a graph, is a very important problem with wide
                 applications in social network analysis, anomaly
                 detection, web mining, and the like. In this article,
                 we propose algorithms for local triangle counting in a
                 graph stream based on edge sampling: Mascot for a
                 simple graph, and MultiBMascot and MultiWMascot for a
                 multigraph. To develop Mascot, we first present two
                 naive local triangle counting algorithms in a graph
                 stream, called Mascot-C and Mascot-A. Mascot-C is based
                 on constant edge sampling, and Mascot-A improves its
                 accuracy by utilizing more memory space. Mascot
                 achieves both accuracy and memory-efficiency of the two
                 algorithms by unconditional triangle counting for a new
                 edge, regardless of whether it is sampled or not.
                 Extending the idea to a multigraph, we develop two
                 algorithms MultiBMascot and MultiWMascot. MultiBMascot
                 enables local triangle counting on the corresponding
                 simple graph of a streamed multigraph without explicit
                 graph conversion; MultiWMascot considers repeated
                 occurrences of an edge as its weight and counts each
                 triangle as the product of its three edge weights. In
                 contrast to the existing algorithm that requires prior
                 knowledge of the target graph and appropriately set
                 parameters, our proposed algorithms require only one
                 parameter, the edge sampling probability. Through
                 extensive experiments, we show that for the same number
                 of edges sampled, Mascot provides the best accuracy
                 compared to the existing algorithm as well as Mascot-C
                 and Mascot-A. We also demonstrate that MultiBMascot on
                 a multigraph is comparable to Mascot-C on the
                 counterpart simple graph, and MultiWMascot becomes more
                 accurate for higher degree nodes. Thanks to Mascot, we
                 also discover interesting anomalous patterns in real
                 graphs, including core-peripheries in the web, a
                 bimodal call pattern in a phone call history, and
                 intensive collaboration in DBLP.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "4",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Shi:2018:VAB,
  author =       "Lei Shi and Hanghang Tong and Madelaine Daianu and
                 Feng Tian and Paul M. Thompson",
  title =        "Visual Analysis of Brain Networks Using Sparse
                 Regression Models",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "5:1--5:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3023363",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Studies of the human brain network are becoming
                 increasingly popular in the fields of neuroscience,
                 computer science, and neurology. Despite this rapidly
                 growing line of research, gaps remain at the
                 intersection of data analytics, interactive visual
                 representation, and human intelligence, all of which
                 are needed to advance our understanding of human brain
                 networks.
                 This article tackles this challenge by exploring the
                 design space of visual analytics. We propose an
                 integrated framework to orchestrate computational
                 models with comprehensive data visualizations on the
                 human brain network. The framework targets two
                 fundamental tasks: the visual exploration of
                 multi-label brain networks and the visual comparison
                 among brain networks across different subject groups.
                 During the first task, we propose a novel interactive
                 user interface to visualize sets of labeled brain
                 networks; in our second task, we introduce sparse
                 regression models to select discriminative features
                 from the brain network to facilitate the comparison.
                 Through user studies and quantitative experiments, both
                 methods are shown to greatly improve the visual
                 comparison performance. Finally, real-world case
                 studies with domain experts demonstrate the utility and
                 effectiveness of our framework to analyze
                 reconstructions of human brain connectivity maps. The
                 perceptually optimized visualization design and the
                 feature selection model calibration are shown to be the
                 key to our significant findings.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "5",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Galbrun:2018:MRS,
  author =       "Esther Galbrun and Pauli Miettinen",
  title =        "Mining Redescriptions with Siren",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3007212",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In many areas of science, scientists need to find
                 distinct common characterizations of the same objects
                 and, vice versa, to identify sets of objects that admit
                 multiple shared descriptions. For example, in biology,
                 an important task is to identify the bioclimatic
                 constraints that allow some species to survive, that
                 is, to describe geographical regions both in terms of
                 the fauna that inhabits them and of their bioclimatic
                 conditions. In data analysis, the task of automatically
                 generating such alternative characterizations is called
                 redescription mining. If a domain expert wants to use
                 redescription mining in his research, merely being able
                 to find redescriptions is not enough. He must also be
                 able to understand the redescriptions found, adjust
                 them to better match his domain knowledge, test
                 alternative hypotheses with them, and guide the mining
                 process toward results he considers interesting. To
                 facilitate these goals, we introduce Siren, an
                 interactive tool for mining and visualizing
                 redescriptions. Siren allows the user to obtain
                 redescriptions in an anytime fashion through
                 efficient, distributed mining, to examine the results
                 in various linked
                 visualizations, to interact with the results either
                 directly or via the visualizations, and to guide the
                 mining algorithm toward specific redescriptions. In
                 this article, we explain the features of Siren and why
                 they are useful for redescription mining. We also
                 propose two novel redescription mining algorithms that
                 improve the generalizability of the results compared to
                 the existing ones.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "6",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wu:2018:IDC,
  author =       "Hao Wu and Maoyuan Sun and Peng Mi and Nikolaj Tatti
                 and Chris North and Naren Ramakrishnan",
  title =        "Interactive Discovery of Coordinated Relationship
                 Chains with Maximum Entropy Models",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3047017",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Modern visual analytic tools promote human-in-the-loop
                 analysis but are limited in their ability to direct the
                 user toward interesting and promising directions of
                 study. This problem is especially acute when the
                 analysis task is exploratory in nature, e.g., the
                 discovery of potentially coordinated relationships in
                 massive text datasets. Such tasks are very common in
                 domains like intelligence analysis and security
                 forensics where the goal is to uncover surprising
                 coalitions bridging multiple types of relations. We
                 introduce new maximum entropy models to discover
                 surprising chains of relationships leveraging count
                 data about entity occurrences in documents. These
                 models are embedded in a visual analytic system called
                 MERCER (Maximum Entropy Relational Chain ExploRer) that
                 treats relationship bundles as first class objects and
                 directs the user toward promising lines of inquiry. We
                 demonstrate how user input can judiciously direct
                 analysis toward valid conclusions, whereas a purely
                 algorithmic approach could be led astray. Experimental
                 results on both synthetic and real datasets from the
                 intelligence community are presented.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "7",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Choo:2018:VVA,
  author =       "Jaegul Choo and Hannah Kim and Edward Clarkson and
                 Zhicheng Liu and Changhyun Lee and Fuxin Li and
                 Hanseung Lee and Ramakrishnan Kannan and Charles D.
                 Stolper and John Stasko and Haesun Park",
  title =        "{VisIRR}: a Visual Analytics System for Information
                 Retrieval and Recommendation for Large-Scale Document
                 Data",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3070616",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In this article, we present an interactive visual
                 information retrieval and recommendation system, called
                 VisIRR, for large-scale document discovery. VisIRR
                 effectively combines the paradigms of (1) a passive
                 pull through query processes for retrieval and (2) an
                 active push that recommends items of potential interest
                 to users based on their preferences. Equipped with an
                 efficient dynamic query interface against a large-scale
                 corpus, VisIRR organizes the retrieved documents into
                 high-level topics and visualizes them in a 2D space,
                 representing the relationships among the topics along
                 with their keyword summary. In addition, based on
                 interactive personalized preference feedback with
                 regard to documents, VisIRR provides document
                 recommendations from the entire corpus, which are
                 beyond the retrieved sets. Such recommended documents
                 are visualized in the same space as the retrieved
                 documents, so that users can seamlessly analyze both
                 existing and newly recommended ones. This article
                 presents novel computational methods, which make these
                 integrated representations and fast interactions
                 possible for a large-scale document corpus. We
                 illustrate how the system works by providing detailed
                 usage scenarios. Additionally, we present preliminary
                 user study results for evaluating the effectiveness of
                 the system.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "8",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Kamat:2018:SBA,
  author =       "Niranjan Kamat and Arnab Nandi",
  title =        "A Session-Based Approach to Fast-But-Approximate
                 Interactive Data Cube Exploration",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3070648",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "With the proliferation of large datasets, sampling has
                 become pervasive in data analysis. Sampling has
                 numerous benefits, from reducing the computation time
                 and cost to increasing the scope of interactive
                 analysis. A popular task in data science, well-suited
                 toward sampling, is the computation of
                 fast-but-approximate aggregations over sampled data.
                 Aggregation is a foundational block of data analysis,
                 with data cube being its primary construct. We observe
                 that such aggregation queries are typically issued in
                 an ad-hoc, interactive setting. In contrast to one-off
                 queries, a typical query session consists of a series
                 of quick queries, interspersed with the user inspecting
                 the results and formulating the next query. The
                 similarity between session queries opens up
                 opportunities for reusing computation of not just query
                 results, but also error estimates. Error estimates need
                 to be provided alongside sampled results for the
                 results to be meaningful. We propose Sesame, a rewrite
                 and caching framework that accelerates the entire
                 interactive session of aggregation queries over sampled
                 data. We focus on two unique and computationally
                 expensive aspects of this use case: query speculation
                 in the presence of sampling, and error computation, and
                 provide novel strategies for result and error reuse. We
                 demonstrate that our approach outperforms conventional
                 sampled aggregation techniques by at least an order of
                 magnitude, without modifying the underlying database.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "9",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Senin:2018:GID,
  author =       "Pavel Senin and Jessica Lin and Xing Wang and Tim
                 Oates and Sunil Gandhi and Arnold P. Boedihardjo and
                 Crystal Chen and Susan Frankenstein",
  title =        "{GrammarViz} 3.0: Interactive Discovery of
                 Variable-Length Time Series Patterns",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3051126",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "The problems of recurrent and anomalous pattern
                 discovery in time series, e.g., motifs and discords,
                 respectively, have received a lot of attention from
                 researchers in the past decade. However, since the
                 pattern search space is usually intractable, most
                 existing detection algorithms require that the patterns
                 have discriminative characteristics and their lengths
                 known in advance and provided as input, which is an
                 unreasonable requirement for many real-world problems.
                 In addition, patterns of similar structure but of
                 different lengths may co-exist in a time series.
                 Addressing these issues, we have developed algorithms
                 for variable-length time series pattern discovery that
                 are based on symbolic discretization and grammar
                 inference, two techniques whose combination enables the
                 structured reduction of the search space and discovery
                 of the candidate patterns in linear time. In this work,
                 we present GrammarViz 3.0, a software package that
                 provides implementations of proposed algorithms and
                 graphical user interface for interactive
                 variable-length time series pattern discovery. The
                 current version of the software provides an alternative
                 grammar inference algorithm that improves the time
                 series motif discovery workflow, and introduces an
                 experimental procedure for automated discretization
                 parameter selection that builds upon the minimum
                 cardinality maximum cover principle and aids time
                 series recurrent and anomalous pattern discovery.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "10",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Datta:2018:CVC,
  author =       "Srayan Datta and Eytan Adar",
  title =        "{CommunityDiff}: Visualizing Community Clustering
                 Algorithms",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3047009",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Community detection is an oft-used analytical function
                 of network analysis but can be a black art to apply in
                 practice. Grouping of related nodes is important for
                 identifying patterns in network datasets but is also
                 notoriously sensitive to input data and algorithm
                 selection. This is further complicated by the fact
                 that, depending on domain and use case, the ground
                 truth knowledge of the end-user can vary from none to
                 complete. In this work, we present CommunityDiff, an
                 interactive visualization system that combines
                 visualization and active learning (AL) to support the
                 end-user's analytical process. As the end-user
                 interacts with the system, a continuous refinement
                 process updates both the community labels and
                 visualizations. CommunityDiff features a mechanism for
                 visualizing ensemble spaces, weighted combinations of
                 algorithm output, that can identify patterns,
                 commonalities, and differences among multiple community
                 detection algorithms. Among other features,
                 CommunityDiff introduces an AL mechanism that visually
                 indicates uncertainty about community labels to focus
                 end-user attention and supports end-user control that
                 ranges from explicitly indicating the number of
                 expected communities to merging and splitting
                 communities. Based on this end-user input,
                 CommunityDiff dynamically recalculates communities. We
                 demonstrate the viability of our approach through a
                 study of the speed of end-user convergence on
                 satisfactory community
                 labels. As part of building CommunityDiff, we describe
                 a design process that can be adapted to other
                 Interactive Machine Learning applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "11",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Yang:2018:LIC,
  author =       "Yang Yang and Jie Tang and Juanzi Li",
  title =        "Learning to Infer Competitive Relationships in
                 Heterogeneous Networks",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3051127",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Detecting and monitoring competitors is fundamental to
                 a company to stay ahead in the global market. Existing
                 studies mainly focus on mining competitive
                 relationships within a single data source, while
                 competing information is usually distributed in
                 multiple networks. How to discover the underlying
                 patterns and utilize the heterogeneous knowledge to
                 avoid bias in this setting is a challenging
                 problem. In this article, we study the problem of
                 mining competitive relationships by learning across
                 heterogeneous networks. We use Twitter and patent
                 records as our data sources and statistically study the
                 patterns behind the competitive relationships. We find
                 that the two networks exhibit different but
                 complementary patterns of competitions. Overall, we
                 find that similar entities tend to be competitors, with
                 a probability 4 times higher than chance. On the
                 other hand, in the social network, we also find a
                 10-minute phenomenon: when two entities are mentioned
                 by the same user within 10 minutes, the likelihood of
                 them
                 being competitors is 25 times higher than chance. Based
                 on the discovered patterns, we propose a novel Topical
                 Factor Graph Model. Generally, our model defines a
                 latent topic layer to bridge the Twitter network and
                 patent network. It then employs a semi-supervised
                 learning algorithm to classify the relationships
                 between entities (e.g., companies or products). We test
                 the proposed model on two real data sets and the
                 experimental results validate the effectiveness of our
                 model, with an average of +46\% improvement over
                 alternative methods. Besides, we further demonstrate
                 that the competitive relationships inferred by our
                 proposed
                 model can be applied in the job-hopping prediction
                 problem by achieving an average of +10.7\%
                 improvement.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "12",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Wang:2018:PSM,
  author =       "Boyue Wang and Yongli Hu and Junbin Gao and Yanfeng
                 Sun and Baocai Yin",
  title =        "Partial Sum Minimization of Singular Values
                 Representation on {Grassmann} Manifolds",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "13:1--13:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092690",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Clustering is one of the fundamental topics in data
                 mining and pattern recognition. As a prospective
                 clustering method, subspace clustering has made
                 considerable progress in recent research, e.g.,
                 sparse subspace clustering (SSC) and low rank
                 representation (LRR). However, most existing subspace
                 clustering algorithms are designed for vectorial data
                 from linear spaces, thus not suitable for
                 high-dimensional data with intrinsic non-linear
                 manifold structure. For high-dimensional or manifold
                 data, little research has paid attention to clustering
                 problems. Clustering on manifolds aims to group
                 manifold-valued data into several clusters according
                 to a manifold-based similarity metric. This
                 article proposes an extended LRR model for
                 manifold-valued Grassmann data that incorporates prior
                 knowledge by minimizing the partial sum of singular
                 values instead of the nuclear norm, namely Partial Sum
                 minimization of Singular Values Representation
                 (GPSSVR). The new model not only enforces the global
                 structure of data in low rank, but also retains
                 important information by minimizing only smaller
                 singular values. To further maintain the local
                 structures among Grassmann points, we also integrate
                 the Laplacian penalty with GPSSVR. The proposed model
                 and algorithms are assessed on a public human face
                 dataset, some widely used human action video datasets
                 and a real scenery dataset. The experimental results
                 show that the proposed methods clearly outperform
                 other state-of-the-art methods.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "13",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Trevino:2018:DSE,
  author =       "Edgar S. Garc{\'\i}a Trevi{\~n}o and Muhammad Zaid
                 Hameed and Javier A. Barria",
  title =        "Data Stream Evolution Diagnosis Using Recursive
                 Wavelet Density Estimators",
  journal =      j-TKDD,
  volume =       "12",
  number =       "1",
  pages =        "14:1--14:??",
  month =        feb,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106369",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Data streams are a new class of data that is becoming
                 pervasively important in a wide range of applications,
                 ranging from sensor networks and environmental
                 monitoring to finance. In this article, we propose a
                 novel
                 framework for the online diagnosis of evolution of
                 multidimensional streaming data that incorporates
                 Recursive Wavelet Density Estimators into the context
                 of Velocity Density Estimation. In the proposed
                 framework changes in streaming data are characterized
                 by the use of local and global evolution coefficients.
                 In addition, to analyze changes in the correlation
                 structure of the data, we propose a recursive
                 implementation of the Pearson correlation coefficient
                 using exponential discounting. Two visualization tools,
                 namely temporal and spatial velocity profiles, are
                 extended in the context of the proposed framework.
                 The proposed method has three main advantages over
                 previous approaches: (1) the memory storage
                 required is minimal and independent of any window size;
                 (2) it has a significantly lower computational
                 complexity; and (3) it makes possible the fast
                 diagnosis of data evolution at all dimensions and at
                 relevant combinations of dimensions with only one pass
                 of the data. With the help of four examples, we
                 show the framework's relevance in a change detection
                 context and its potential capability for real world
                 applications.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "14",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Kaushal:2018:ETP,
  author =       "Vishal Kaushal and Manasi Patwardhan",
  title =        "Emerging Trends in Personality Identification Using
                 Online Social Networks --- a Literature Survey",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3070645",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Personality is a combination of all the
                 attributes (behavioral, temperamental, emotional, and
                 mental) that characterizes a unique individual. Ability
                 to identify personalities of people has always been of
                 great interest to researchers due to its
                 importance. It continues to find highly useful
                 applications in many domains. Owing to the increasing
                 popularity of online social networks, researchers have
                 started looking into the possibility of predicting a
                 user's personality from his online social networking
                 profile, which serves as a rich source of textual as
                 well as non-textual content published by users. In the
                 process of creating social networking profiles, users
                 reveal a lot about themselves both in what they share
                 and how they say it. Studies suggest that the online
                 social networking websites are, in fact, a relevant and
                 valid means of communicating personality. In this
                 article, we review these various studies reported in
                 literature toward identification of personality using
                 online social networks. To the best of our knowledge,
                 this is the first reported survey of its kind at the
                 time of submission. We hope that our contribution,
                 especially in summarizing the previous findings and in
                 identifying the directions for future research in this
                 area, will encourage researchers to do more work in
                 this budding field.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "15",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Pandove:2018:SRC,
  author =       "Divya Pandove and Shivan Goel and Rinkl Rani",
  title =        "Systematic Review of Clustering High-Dimensional and
                 Large Datasets",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132088",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Technological advancement has enabled us to store and
                 process huge amount of data in relatively short spans
                 of time. The nature of data is rapidly changing,
                 particularly its dimensionality is more commonly multi-
                 and high-dimensional. There is an immediate need to
                 expand our focus to include analysis of
                 high-dimensional and large datasets. Data analysis is
                 becoming a mammoth task, due to incremental increase in
                 data volume and complexity in terms of heterogony of
                 data. It is due to this dynamic computing environment
                 that the existing techniques either need to be modified
                 or discarded to handle new data in multiple
                 high-dimensions. Data clustering is a tool that is used
                 in many disciplines, including data mining, so that
                 meaningful knowledge can be extracted from seemingly
                 unstructured data. The aim of this article is to
                 understand the problem of clustering and various
                 approaches addressing this problem. This article
                 discusses the process of clustering from both
                 microviews (data treating) and macroviews (overall
                 clustering process). Different distance and similarity
                 measures, which form the cornerstone of effective data
                 clustering, are also identified. Further, an in-depth
                 analysis of different clustering approaches focused on
                 data mining, dealing with large-scale datasets is
                 given. These approaches are comprehensively compared to
                 bring out a clear differentiation among them. This
                 article also surveys the problem of high-dimensional
                 data and the existing approaches to it, which makes
                 the survey more relevant. It also explores the latest
                 trends in cluster
                 analysis, and the real-life applications of this
                 concept. This survey is exhaustive as it tries to cover
                 all the aspects of clustering in the field of data
                 mining.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "16",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Li:2018:LSC,
  author =       "Yixuan Li and Kun He and Kyle Kloster and David Bindel
                 and John Hopcroft",
  title =        "Local Spectral Clustering for Overlapping Community
                 Detection",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106370",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Large graphs arise in a number of contexts and
                 understanding their structure and extracting
                 information from them is an important research area.
                 Early algorithms for mining communities have focused on
                 global graph structure, and often run in time
                 proportional to the size of the entire graph. As we
                 explore networks with millions of vertices and find
                 communities of size in the hundreds, it becomes
                 important to shift our attention from macroscopic
                 structure to microscopic structure in large networks. A
                 growing body of work has been adopting local expansion
                 methods in order to identify communities from a few
                 exemplary seed members. In this article, we propose a
                 novel approach for finding overlapping communities
                 called Lemon (Local Expansion via Minimum One Norm).
                 Provided with a few known seeds, the algorithm finds
                 the community by performing a local spectral diffusion.
                 The core idea of Lemon is to use short random walks to
                 approximate an invariant subspace near a seed set,
                 which we refer to as local spectra. Local spectra can
                 be viewed as the low-dimensional embedding that
                 captures the nodes' closeness in the local network
                 structure. We show that Lemon's performance in
                 detecting communities is competitive with
                 state-of-the-art methods. Moreover, the running time
                 scales with the size of the community rather than that
                 of the entire graph. The algorithm is easy to implement
                 and is highly parallelizable. We further provide
                 theoretical analysis of the local spectral properties,
                 bounding the measure of tightness of extracted
                 community using the eigenvalues of graph Laplacian. We
                 thoroughly evaluate our approach using both synthetic
                 and real-world datasets across different domains, and
                 analyze the empirical variations when applying our
                 method to inherently different networks in practice.
                 In addition, we provide heuristics on how seed-set
                 quality and quantity affect performance.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "17",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
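
%%% A minimal Python sketch of the local spectral idea described in
%%% the Li:2018:LSC entry above: short random walks seeded at known
%%% members approximate an invariant subspace (``local spectra'')
%%% near the seed set.  This is an illustrative reading of the
%%% abstract, not the authors' Lemon implementation; the function
%%% name and parameters are assumptions, and A must be a dense
%%% adjacency matrix with no isolated nodes.
%%%
%%%     import numpy as np
%%%
%%%     def local_spectra(A, seeds, walk_len=4, dim=2):
%%%         # Row-stochastic transition matrix of the random walk.
%%%         n = A.shape[0]
%%%         P = A / A.sum(axis=1)[:, None]
%%%         # Start the walk spread uniformly over the seed set.
%%%         p = np.zeros(n)
%%%         p[list(seeds)] = 1.0 / len(seeds)
%%%         # Collect the probability vectors of a few short steps.
%%%         steps = []
%%%         for _ in range(walk_len):
%%%             p = p @ P
%%%             steps.append(p.copy())
%%%         # An orthonormal basis for their span is a crude local
%%%         # subspace; nodes with large mass in it are community
%%%         # candidates (Lemon instead solves a minimum-one-norm
%%%         # program over this subspace).
%%%         Q, _ = np.linalg.qr(np.array(steps).T)
%%%         return Q[:, :dim]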

@Article{Costa:2018:MOC,
  author =       "Gianni Costa and Riccardo Ortale",
  title =        "Mining Overlapping Communities and Inner Role
                 Assignments through {Bayesian} Mixed-Membership Models
                 of Networks with Context-Dependent Interactions",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "18:1--18:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106368",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Community discovery and role assignment have been
                 recently integrated into an unsupervised approach for
                 the exploratory analysis of overlapping communities and
                 inner roles in networks. However, the formation of ties
                 in these prototypical research efforts is not truly
                 realistic, since it does not account for a fundamental
                 aspect of link establishment in real-world networks,
                 i.e., the explicative reasons that cause interactions
                 among nodes. Such reasons can be interpreted as generic
                 requirements of nodes that are met by other nodes and
                 essentially pertain both to the nodes themselves and to
                 their interaction contexts (i.e., the respective
                 communities and roles). In this article, we present two
                 new model-based machine-learning approaches, wherein
                 community discovery and role assignment are seamlessly
                 integrated and simultaneously performed through
                 approximate posterior inference in Bayesian
                 mixed-membership models of directed networks. The
                 devised models account for the explicative reasons
                 governing link establishment in terms of node-specific
                 and contextual latent interaction factors. The former
                 are inherently characteristic of nodes, while the
                 latter are characterizations of nodes in the context of
                 the individual communities and roles. The generative
                 process of both models assigns nodes to communities
                 with respective roles and connects them through
                 directed links, which are probabilistically governed by
                 their node-specific and contextual interaction factors.
                 The difference between the proposed models lies in the
                 exploitation of the contextual interaction factors.
                 More precisely, in one model, the contextual
                 interaction factors have the same impact on link
                 generation. In the other model, the contextual
                 interaction factors are weighted by the extent of
                 involvement of the linked nodes in the respective
                 communities and roles. We develop MCMC algorithms
                 implementing approximate posterior inference and
                 parameter estimation within our models. Finally, we
                 conduct intensive comparative experimentation, which
                 demonstrates their superiority in community compactness
                 and link prediction on various real-world and synthetic
                 networks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "18",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Long:2018:PMS,
  author =       "Cheng Long and Raymond Chi-Wing Wong and Victor Junqiu
                 Wei",
  title =        "Profit Maximization with Sufficient Customer
                 Satisfactions",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3110216",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "In many commercial campaigns, we observe that there
                 exists a tradeoff between the number of customers
                 satisfied by the company and the profit gained. Merely
                 satisfying as many customers as possible or maximizing
                 the profit is not desirable. To this end, in this
                 article, we propose a new problem called
                 $k$-Satisfiability Assignment for Maximizing the Profit
                 ($k$-SAMP), where $k$ is a user parameter and a
                 non-negative integer. Given a set $P$ of products and a
                 set $O$ of customers, $k$-SAMP is to find an assignment
                 between $P$ and $O$ such that at least $k$ customers
                 are satisfied in the assignment and the profit incurred
                 by this assignment is maximized. Although we find that
                 this problem is closely related to two classic computer
                 science problems, namely maximum weight matching and
                 maximum matching, the techniques developed for these
                 classic problems cannot be adapted to our $k$-SAMP
                 problem. In this work, we design a novel algorithm
                 called Adjust for the $k$-SAMP problem. Given an
                 assignment $A$, Adjust iteratively increases the profit
                 of $A$ by adjusting some appropriate matches in $A$
                 while keeping at least $k$ customers satisfied in $A$.
                 We prove that Adjust returns a global optimum.
                 Extensive experiments were conducted that verified the
                 efficiency of Adjust.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "19",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Ramezani:2018:CDU,
  author =       "Maryam Ramezani and Ali Khodadadi and Hamid R.
                 Rabiee",
  title =        "Community Detection Using Diffusion Information",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3110215",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Community detection in social networks has become a
                 popular topic of research during the last decade. There
                 exist a variety of algorithms for modularizing the
                 network graph into different communities. However, they
                 mostly assume that partial or complete information
                 about the network graph is available, which is not
                 feasible in many cases. In this article, we focus on
                 detecting
                 communities by exploiting their diffusion information.
                 To this end, we utilize the Conditional Random Fields
                 (CRF) to discover the community structures. The
                 proposed method, community diffusion (CoDi), does not
                 require any prior knowledge about the network structure
                 or specific properties of communities. Furthermore, in
                 contrast to the structure-based community detection
                 methods, this method is able to identify the hidden
                 communities. The experimental results indicate
                 considerable improvements in detecting communities in
                 terms of accuracy, scalability, and real-cascade
                 information measures.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "20",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Chiasserini:2018:ACS,
  author =       "Carla-Fabiana Chiasserini and Michel Garetto and Emili
                 Leonardi",
  title =        "De-anonymizing Clustered Social Networks by
                 Percolation Graph Matching",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3127876",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Online social networks offer the opportunity to
                 collect a huge amount of valuable information about
                 billions of users. The analysis of this data by service
                 providers and unintended third parties are posing
                 serious treats to user privacy. In particular, recent
                 work has shown that users participating in more than
                 one online social network can be identified based only
                 on the structure of their links to other users. An
                 effective tool to de-anonymize social network users is
                 represented by graph matching algorithms. Indeed, by
                 exploiting a sufficiently large set of seed nodes, a
                 percolation process can correctly match almost all
                 nodes across the different social networks. In this
                 article, we show the crucial role of clustering, which
                 is a relevant feature of social network graphs (and
                 many other systems). Clustering has both the effect of
                 making matching algorithms more prone to errors, and
                 the potential to greatly reduce the number of seeds
                 needed to trigger percolation. We show these facts by
                 considering a fairly general class of random geometric
                 graphs with variable clustering level. We assume that
                 seeds can be identified in particular sub-regions of
                 the network graph, while no a priori knowledge about
                 the location of the other nodes is required. Under
                 these conditions, we show how clever algorithms can
                 achieve surprisingly good performance while limiting
                 the number of matching errors.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "21",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
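
%%% The percolation process that the Chiasserini:2018:ACS entry above
%%% relies on can be sketched generically in Python: starting from
%%% seed pairs, a candidate pair of nodes is matched as soon as it
%%% accumulates r already-matched neighbor pairs.  A hedged sketch of
%%% plain percolation graph matching, not the paper's clustered
%%% random-geometric analysis; graphs are adjacency dicts and all
%%% names are assumptions.
%%%
%%%     def percolation_match(G1, G2, seeds, r=2):
%%%         matched = dict(seeds)            # node in G1 -> node in G2
%%%         used = set(matched.values())
%%%         marks = {}                       # candidate pair -> mark count
%%%         frontier = list(matched.items())
%%%         while frontier:
%%%             u, v = frontier.pop()
%%%             # Every matched pair (u, v) adds one mark to each
%%%             # still-unmatched neighbor pair (x, y).
%%%             for x in G1[u]:
%%%                 if x in matched:
%%%                     continue
%%%                 for y in G2[v]:
%%%                     if y in used:
%%%                         continue
%%%                     marks[(x, y)] = marks.get((x, y), 0) + 1
%%%                     if marks[(x, y)] >= r:
%%%                         matched[x] = y   # percolate: match and expand
%%%                         used.add(y)
%%%                         frontier.append((x, y))
%%%                         break            # x is now matched
%%%         return matched
%%%
%%% The threshold r trades off error rate against the number of seeds
%%% needed for percolation, which is the effect the paper studies
%%% under clustering.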

@Article{Zhao:2018:JRL,
  author =       "Wayne Xin Zhao and Feifan Fan and Ji-Rong Wen and
                 Edward Y. Chang",
  title =        "Joint Representation Learning for Location-Based
                 Social Networks with Multi-Grained Sequential
                 Contexts",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "22:1--22:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3127875",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "This article studies the problem of learning effective
                 representations for Location-Based Social Networks
                 (LBSN), which is useful in many tasks such as location
                 recommendation and link prediction. Existing network
                 embedding methods mainly focus on capturing topology
                 patterns reflected in social connections, while
                 check-in sequences, the most important data type in
                 LBSNs, are not directly modeled by these models. In
                 this article, we propose a representation learning
                 method for LBSNs called JRLM++, which models
                 check-in sequences together with social connections. To
                 capture sequential relatedness, JRLM++ characterizes
                 two levels of sequential contexts, namely fine-grained
                 and coarse-grained contexts. We present a learning
                 algorithm tailored to the hierarchical architecture of
                 the proposed model. We conduct extensive experiments on
                 two important applications using real-world datasets.
                 The experimental results demonstrate the superiority of
                 our model. The proposed model can generate effective
                 representations for both users and locations in the
                 same embedding space, which can be further utilized to
                 improve multiple LBSN tasks.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "22",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}

@Article{Hu:2018:CFT,
  author =       "Guang-Neng Hu and Xin-Yu Dai and Feng-Yu Qiu and Rui
                 Xia and Tao Li and Shu-Jian Huang and Jia-Jun Chen",
  title =        "Collaborative Filtering with Topic and Social Latent
                 Factors Incorporating Implicit Feedback",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "23:1--23:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3127873",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Recommender systems (RSs) provide an effective way of
                 alleviating the information overload problem by
                 selecting personalized items for different users.
                 Latent-factor-based collaborative filtering (CF) has
                 become a popular approach for RSs due to its
                 accuracy and scalability. Recently, online social
                 networks and user-generated content provide diverse
                 sources for recommendation beyond ratings. Although
                 social matrix factorization (Social MF) and topic
                 matrix factorization (Topic MF) successfully exploit
                 social relations and item reviews, respectively, both
                 of them ignore some useful information. In this
                 article, we investigate the effective data fusion by
                 combining the aforementioned approaches. First, we
                 propose a novel model MR3 to jointly model three
                 sources of information (i.e., ratings, item reviews,
                 and social relations) effectively for rating prediction
                 by aligning the latent factors and hidden topics.
                 Second, we incorporate the implicit feedback from
                 ratings into the proposed model to enhance its
                 capability and to demonstrate its flexibility. We
                 achieve more accurate rating prediction on real-life
                 datasets over various state-of-the-art methods.
                 Furthermore, we measure the contribution from each of
                 the three data sources and the impact of implicit
                 feedback from ratings, followed by the sensitivity
                 analysis of hyperparameters. Empirical studies
                 demonstrate the effectiveness and efficacy of our
                 proposed model and its extension.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "23",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
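
%%% The data fusion in the Hu:2018:CFT entry above builds on
%%% latent-factor models that tie ratings to side information.  Below
%%% is a hedged sketch of the simplest such building block, a
%%% social-regularized matrix factorization, to make the ``aligning
%%% latent factors'' idea concrete; it is generic, not model MR3, and
%%% the social gradient term is simplified.
%%%
%%%     import numpy as np
%%%
%%%     def social_mf(R, S, k=8, lam=0.1, alpha=0.05, iters=200, lr=0.01):
%%%         # R: ratings with 0 for missing entries;
%%%         # S: row-normalized user-user trust matrix.
%%%         rng = np.random.default_rng(0)
%%%         U = rng.normal(0.0, 0.1, (R.shape[0], k))   # user factors
%%%         V = rng.normal(0.0, 0.1, (R.shape[1], k))   # item factors
%%%         mask = R > 0
%%%         for _ in range(iters):
%%%             E = mask * (U @ V.T - R)     # error on observed ratings
%%%             # Pull each user's factor toward the trust-weighted
%%%             # average of its neighbors (simplified gradient).
%%%             gU = E @ V + lam * U + alpha * (U - S @ U)
%%%             gV = E.T @ U + lam * V
%%%             U -= lr * gU
%%%             V -= lr * gV
%%%         return U, V                      # predict with U @ V.T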

@Article{Perozzi:2018:DCA,
  author =       "Bryan Perozzi and Leman Akoglu",
  title =        "Discovering Communities and Anomalies in Attributed
                 Graphs: Interactive Visual Exploration and
                 Summarization",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "24:1--24:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3139241",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a network with node attributes, how can we
                 identify communities and spot anomalies? How can we
                 characterize, describe, or summarize the network in a
                 succinct way? Community extraction requires a measure
                 of quality for connected subgraphs (e.g., social
                 circles). Existing subgraph measures, however, either
                 consider only the connectedness of nodes inside the
                 community and ignore the cross-edges at the boundary
                 (e.g., density) or only quantify the structure of the
                 community and ignore the node attributes (e.g.,
                 conductance). In this work, we focus on node-attributed
                 networks and introduce: (1) a new measure of subgraph
                 quality for attributed communities called normality,
                 (2) a community extraction algorithm that uses
                 normality to extract communities and a few
                 characterizing attributes per community, and (3) a
                 summarization and interactive visualization approach
                 for attributed graph exploration. More specifically,
                 (1) we first introduce a new measure to quantify the
                 normality of an attributed subgraph. Our normality
                 measure carefully utilizes structure and attributes
                 together to quantify both the internal consistency and
                 external separability. We then formulate an objective
                 function to automatically infer a few attributes
                 (called the ``focus'') and respective attribute
                 weights, so as to maximize the normality score of a
                 given subgraph. Most notably, unlike many other
                 approaches, our measure allows for many cross-edges as
                 long as they can be ``exonerated;'' i.e., either (i)
                 are expected under a null graph model, and/or (ii)
                 their boundary nodes do not exhibit the focus
                 attributes. Next, (2) we propose AMEN (for Attributed
                 Mining of Entity Networks), an algorithm that
                 simultaneously discovers the communities and their
                 respective focus in a given graph, with a goal to
                 maximize the total normality. Communities for which a
                 focus that yields high normality cannot be found are
                 considered low quality or anomalous. Last, (3) we
                 formulate a summarization task with a multi-criteria
                 objective, which selects a subset of the communities
                 that (i) cover the entire graph well, are (ii) high
                 quality and (iii) diverse in their focus attributes. We
                 further design an interactive visualization interface
                 that presents the communities to a user in an
                 interpretable, user-friendly fashion. The user can
                 explore all the communities, analyze various
                 algorithm-generated summaries, as well as devise their
                 own summaries interactively to characterize the network
                 in a succinct way. As the experiments on real-world
                 attributed graphs show, our proposed approaches
                 effectively find anomalous communities and outperform
                 several existing measures and methods, such as
                 conductance, density, OddBall, and SODA. We also
                 conduct extensive user studies to measure the
                 capability and efficiency that our approach provides to
                 the users toward network summarization, exploration,
                 and sensemaking.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "24",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
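
%%% A toy Python rendering of the normality intuition from the
%%% Perozzi:2018:DCA entry above: reward internal edges that exceed a
%%% configuration-model expectation and agree on the focus
%%% attributes, and penalize boundary edges whose endpoints agree on
%%% them.  This is a simplified reading of the abstract, not the
%%% paper's exact measure or the AMEN algorithm; all names are
%%% assumptions.
%%%
%%%     import numpy as np
%%%
%%%     def normality(A, X, C, w):
%%%         # A: 0/1 adjacency; X: binary node-attribute matrix;
%%%         # C: community node indices; w: focus attribute weights.
%%%         m = A.sum() / 2.0
%%%         d = A.sum(axis=1)
%%%         C = list(C)
%%%         out = [u for u in range(A.shape[0]) if u not in set(C)]
%%%         score = 0.0
%%%         for a, i in enumerate(C):
%%%             for j in C[a + 1:]:
%%%                 # Internal: surprise over the null model, weighted
%%%                 # by focus-attribute agreement of the endpoints.
%%%                 score += (A[i, j] - d[i] * d[j] / (2 * m)) \
%%%                          * np.dot(w, X[i] * X[j])
%%%             for b in out:
%%%                 # External: boundary edges hurt only when both
%%%                 # endpoints exhibit the focus attributes, so edges
%%%                 # to unrelated nodes are ``exonerated''.
%%%                 if A[i, b]:
%%%                     score -= np.dot(w, X[i] * X[b])
%%%         return score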

@Article{Bonab:2018:GGO,
  author =       "Hamed R. Bonab and Fazli Can",
  title =        "{GOOWE}: Geometrically Optimum and Online-Weighted
                 Ensemble Classifier for Evolving Data Streams",
  journal =      j-TKDD,
  volume =       "12",
  number =       "2",
  pages =        "25:1--25:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3139240",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:45 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Designing adaptive classifiers for an evolving data
                 stream is a challenging task due to the data size and
                 its dynamically changing nature. Combining individual
                 classifiers in an online setting, the ensemble
                 approach, is a well-known solution. It is possible that
                 a subset of classifiers in the ensemble outperforms
                 others in a time-varying fashion. However, optimum
                 weight assignment for component classifiers is a
                 problem, which is not yet fully addressed in online
                 evolving environments. We propose a novel data stream
                 ensemble classifier, called Geometrically Optimum and
                 Online-Weighted Ensemble (GOOWE), which assigns optimum
                 weights to the component classifiers using a sliding
                 window containing the most recent data instances. We
                 map vote scores of individual classifiers and true
                 class labels into a spatial environment. Based on the
                 Euclidean distance between vote scores and
                 ideal-points, and using the linear least squares (LSQ)
                 solution, we present a novel, dynamic, and online
                 weighting approach. While LSQ is used for batch mode
                 ensemble classifiers, it is the first time that we
                 adapt and use it for online environments by providing a
                 spatial modeling of online ensembles. In order to show
                 the robustness of the proposed algorithm, we use
                 real-world datasets and synthetic data generators using
                 the Massive Online Analysis (MOA) libraries. First, we
                 analyze the impact of our weighting system on
                 prediction accuracy through two scenarios. Second, we
                 compare GOOWE with eight state-of-the-art ensemble
                 classifiers in a comprehensive experimental
                 environment. Our experiments show that GOOWE provides
                 improved reactions to different types of concept drift
                 compared to our baselines. The statistical tests
                 indicate a significant improvement in accuracy, with
                 conservative time and memory requirements.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "25",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
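
%%% The least-squares weighting at the core of the Bonab:2018:GGO
%%% entry above reduces to an ordinary linear system: stack each
%%% window instance's per-class vote scores as rows and regress onto
%%% the one-hot ideal points.  A minimal numpy sketch under that
%%% reading of the abstract; GOOWE itself adds sliding-window
%%% maintenance, dynamic updates, and MOA integration, and the names
%%% here are assumptions.
%%%
%%%     import numpy as np
%%%
%%%     def lsq_weights(scores, labels):
%%%         # scores: (window, k, c) vote scores of k classifiers over
%%%         # c classes; labels: (window,) true class indices.
%%%         window, k, c = scores.shape
%%%         # Row (t, j): the k classifiers' scores for class j at time t.
%%%         A = scores.transpose(0, 2, 1).reshape(window * c, k)
%%%         ideal = np.zeros((window, c))
%%%         ideal[np.arange(window), labels] = 1.0   # ideal points
%%%         b = ideal.reshape(window * c)
%%%         # w minimizes ||A w - b||_2, i.e., the squared Euclidean
%%%         # distance between weighted votes and ideal points.
%%%         w, *_ = np.linalg.lstsq(A, b, rcond=None)
%%%         return w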

@Article{Xie:2018:ERP,
  author =       "Hong Xie and Richard T. B. Ma and John C. S. Lui",
  title =        "Enhancing Reputation via Price Discounts in E-Commerce
                 Systems: a Data-Driven Approach",
  journal =      j-TKDD,
  volume =       "12",
  number =       "3",
  pages =        "26:1--26:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154417",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:46 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Reputation systems have become an indispensable
                 component of modern E-commerce systems, as they help
                 buyers make informed decisions in choosing trustworthy
                 sellers. To attract buyers and increase the transaction
                 volume, sellers need to earn reasonably high reputation
                 scores. This process usually takes a substantial amount
                 of time. To accelerate this process, sellers can
                 provide price discounts to attract users, but the
                 underlying difficulty is that sellers have no prior
                 knowledge on buyers' preferences over price discounts.
                 In this article, we develop an online algorithm to
                 infer the optimal discount rate from data. We first
                 formulate an optimization framework to select the
                 optimal discount rate given buyers' discount
                 preferences, which is a tradeoff between the short-term
                 profit and the ramp-up time (for reputation). We then
                 derive the closed-form optimal discount rate, which
                 gives us key insights in applying a stochastic bandits
                 framework to infer the optimal discount rate from the
                 transaction data with regret upper bounds. We show that
                 the computational complexity of evaluating the
                 performance metrics is infeasibly high, and therefore,
                 we develop efficient randomized algorithms with
                 guaranteed performance to approximate them. Finally, we
                 conduct experiments on a dataset crawled from eBay.
                 Experimental results show that our framework can trade
                 60\% of the short-term profit for reducing the ramp-up
                 time by 40\%. This reduction in the ramp-up time can
                 increase the long-term profit of a seller by at least
                 20\%.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "26",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
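
%%% The Xie:2018:ERP entry above casts discount selection as a
%%% stochastic bandit problem.  As a hedged illustration of that
%%% framing (not the authors' algorithm or regret analysis), a
%%% textbook UCB1 loop over a grid of candidate discount rates looks
%%% like this; `observe' is a hypothetical callback returning the
%%% observed reward of offering a rate.
%%%
%%%     import math
%%%
%%%     def ucb_discount(rates, observe, rounds=1000):
%%%         n = [0] * len(rates)          # pulls per rate
%%%         mean = [0.0] * len(rates)     # empirical mean reward
%%%         for t in range(1, rounds + 1):
%%%             if t <= len(rates):
%%%                 a = t - 1             # try each rate once
%%%             else:
%%%                 a = max(range(len(rates)), key=lambda i:
%%%                         mean[i] + math.sqrt(2 * math.log(t) / n[i]))
%%%             r = observe(rates[a])
%%%             n[a] += 1
%%%             mean[a] += (r - mean[a]) / n[a]   # incremental mean
%%%         return rates[max(range(len(rates)), key=lambda i: mean[i])]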

@Article{Belcastro:2018:GRA,
  author =       "Loris Belcastro and Fabrizio Marozzo and Domenico
                 Talia and Paolo Trunfio",
  title =        "{G-RoI}: Automatic Region-of-Interest Detection Driven
                 by Geotagged Social Media Data",
  journal =      j-TKDD,
  volume =       "12",
  number =       "3",
  pages =        "27:1--27:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154411",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:46 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Geotagged data gathered from social media can be used
                 to discover interesting locations visited by users,
                 called Places-of-Interest (PoIs). Since a PoI is
                 generally identified by the geographical coordinates of
                 a single point, it is hard to match it with user
                 trajectories. Therefore, it is useful to define an
                 area, called Region-of-Interest (RoI), to represent
                 the boundaries of the PoI's area. RoI mining techniques
                 are aimed at discovering RoIs from PoIs and other data.
                 Existing RoI mining techniques are based on three main
                 approaches: predefined shapes, density-based
                 clustering, and grid-based aggregation. This article
                 proposes G-RoI, a novel RoI mining technique that
                 exploits the indications contained in geotagged social
                 media items to discover RoIs with a high accuracy.
                 Experiments performed over a set of PoIs in Rome and
                 Paris using social media geotagged data demonstrate
                 that G-RoI in most cases achieves better results than
                 existing techniques. In particular, the mean F$_1$
                 score is 0.34 higher than that obtained with the
                 well-known DBSCAN algorithm in Rome RoIs and 0.23
                 higher in Paris RoIs.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "27",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
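
%%% For contrast with G-RoI, the density-based baseline mentioned in
%%% the Belcastro:2018:GRA entry above is easy to reproduce: cluster
%%% the geotagged points with DBSCAN and take the convex hull of the
%%% largest cluster as the RoI.  A sketch of that baseline only (not
%%% of G-RoI), assuming scikit-learn and SciPy and at least three
%%% points in the winning cluster.
%%%
%%%     import numpy as np
%%%     from sklearn.cluster import DBSCAN
%%%     from scipy.spatial import ConvexHull
%%%
%%%     def roi_dbscan(points, eps=0.001, min_samples=10):
%%%         # points: (n, 2) array of (lat, lon) coordinates.
%%%         labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(points)
%%%         clustered = labels[labels >= 0]
%%%         if clustered.size == 0:
%%%             return None                  # no dense region found
%%%         biggest = np.bincount(clustered).argmax()
%%%         cluster = points[labels == biggest]
%%%         hull = ConvexHull(cluster)
%%%         return cluster[hull.vertices]    # RoI boundary polygon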

@Article{Shin:2018:FAF,
  author =       "Kijung Shin and Bryan Hooi and Christos Faloutsos",
  title =        "Fast, Accurate, and Flexible Algorithms for Dense
                 Subtensor Mining",
  journal =      j-TKDD,
  volume =       "12",
  number =       "3",
  pages =        "28:1--28:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154414",
  ISSN =         "1556-4681 (print), 1556-472X (electronic)",
  ISSN-L =       "1556-4681",
  bibdate =      "Tue Jan 29 17:18:46 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/tkdd.bib",
  abstract =     "Given a large-scale and high-order tensor, how can we
                 detect dense subtensors in it? Can we spot them in
                 near-linear time but with quality guarantees? Extensive
                 previous work has shown that dense subtensors, as well
                 as dense subgraphs, indicate anomalous or fraudulent
                 behavior (e.g., lockstep behavior in social networks).
                 However, available algorithms for detecting dense
                 subtensors are not satisfactory in terms of speed,
                 accuracy, and flexibility. In this work, we propose two
                 algorithms, called M-Zoom and M-Biz, for fast and
                 accurate dense-subtensor detection with various density
                 measures. M-Zoom gives a lower bound on the density of
                 detected subtensors, while M-Biz guarantees the local
                 optimality of detected subtensors. M-Zoom and M-Biz can
                 be combined, giving the following advantages: (1)
                 Scalable: scale near-linearly with all aspects of
                 tensors and are up to 114$ \times $ faster than
                 state-of-the-art methods with similar accuracy, (2)
                 Provably accurate: provide a guarantee on the lowest
                 density and local optimality of the subtensors they
                 find, (3) Flexible: support multi-subtensor detection
                 and size bounds as well as diverse density measures,
                 and (4) Effective: successfully detected edit wars and
                 bot activities in Wikipedia, and spotted network
                 attacks from a TCP dump with near-perfect accuracy (AUC
                 = 0.98).",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Knowl. Discov. Data",
  articleno =    "28",
  fjournal =     "ACM Transactions on Knowledge Discovery from Data
                 (TKDD)",
  journal-URL =  "https://dl.acm.org/loi/tkdd",
}
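
%%% The greedy mechanism behind the Shin:2018:FAF entry above can be
%%% sketched compactly: repeatedly peel the attribute value (slice)
%%% with the least mass and keep the densest snapshot seen.  This is
%%% an illustrative single-subtensor sketch using the
%%% arithmetic-average-mass density, not the authors' optimized
%%% M-Zoom/M-Biz code; names and data layout are assumptions.
%%%
%%%     def peel_dense_subtensor(entries, shape):
%%%         # entries: dict mapping index tuples to nonnegative counts.
%%%         cur = dict(entries)
%%%         dims = [set(range(s)) for s in shape]
%%%
%%%         def density(d, ds):
%%%             size = sum(len(v) for v in ds)
%%%             return sum(d.values()) / size if size else 0.0
%%%
%%%         best, best_den = dict(cur), density(cur, dims)
%%%         while sum(len(v) for v in dims) > len(shape):
%%%             # Mass of every remaining slice, per mode.
%%%             mass = [{v: 0.0 for v in dims[m]} for m in range(len(shape))]
%%%             for idx, val in cur.items():
%%%                 for m, v in enumerate(idx):
%%%                     mass[m][v] += val
%%%             # Peel the minimum-mass slice (never emptying a mode).
%%%             m, v = min(((m, v) for m in range(len(shape))
%%%                         for v in dims[m] if len(dims[m]) > 1),
%%%                        key=lambda mv: mass[mv[0]][mv[1]])
%%%             dims[m].discard(v)
%%%             cur = {i: x for i, x in cur.items() if i[m] != v}
%%%             den = density(cur, dims)
%%%             if den > best_den:
%%%                 best, best_den = dict(cur), den
%%%         return best, best_den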

@Article{Liang:2018:PRA,
  author =       "J