%%% (removed stray "Valid HTML 4.0! Valid CSS!" badge text -- an
%%% artifact of capturing this file from a Web page)
%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.21",
%%%     date            = "23 January 2020",
%%%     time            = "07:45:58 MST",
%%%     filename        = "jdiq.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "45241 7052 37648 356077",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "Journal of Data and Information Quality
%%%                       (JDIQ); bibliography",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        the ACM Journal of Data and Information
%%%                        Quality (JDIQ) (CODEN ????, ISSN 1936-1955),
%%%                        covering all journal issues from 2009 --
%%%                        date.
%%%
%%%                        At version 1.21, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2009 (  17)    2013 (   8)    2017 (  17)
%%%                             2010 (   6)    2014 (  11)    2018 (  34)
%%%                             2011 (   8)    2015 (  22)    2019 (  21)
%%%                             2012 (  15)    2016 (  14)    2020 (   6)
%%%
%%%                             Article:        179
%%%
%%%                             Total entries:  179
%%%
%%%                        The journal table of contents pages are at:
%%%
%%%                            http://www.acm.org/jdiq/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J1191
%%%                            https://dl.acm.org/loi/jdiq
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================
%%% Preamble: loads Beebe's shared author-name macro file and defines
%%% \TM as a superscripted trademark mark.  NOTE(review): the body uses
%%% the plain-TeX font switch \sc inside math mode -- presumably the
%%% consuming documents define/accept \sc; confirm before changing.
@Preamble{"\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% Bibliographer contact block, referenced by the acknowledgement
%%% field of every entry below (acknowledgement = ack-nhfb).
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
%%% Full journal name; every entry below cites it via the unquoted
%%% macro j-JDIQ so the spelling can be changed globally here.
@String{j-JDIQ                  = "Journal of Data and Information
                                  Quality (JDIQ)"}

%%% ====================================================================
%%% Bibliography entries:
%%% v1n1 (June 2009), article 1: inaugural-issue editorial.
%%% NOTE(review): unlike the sibling entries of this issue, no DOI
%%% field is present -- confirm whether ACM assigns one and add it.
@Article{Madnick:2009:EII,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Editorial for the Inaugural Issue of the {ACM Journal
                 of Data and Information Quality (JDIQ)}",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n1 (June 2009), article 2: survey/framework paper, with abstract.
@Article{Madnick:2009:OFD,
  author =       "Stuart E. Madnick and Richard Y. Wang and Yang W. Lee
                 and Hongwei Zhu",
  title =        "Overview and Framework for Data and Information
                 Quality Research",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1516680",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Awareness of data and information quality issues has
                  grown rapidly in light of the critical role played by
                  the quality of information in our data-intensive,
                  knowledge-based economy. Research in the past two
                  decades has produced a large body of data quality
                  knowledge and has expanded our ability to solve many
                  data and information quality problems. In this article,
                  we present an overview of the evolution and current
                  landscape of data and information quality research. We
                  introduce a framework to characterize the research
                  along two dimensions: topics and methods.
                  Representative papers are cited for purposes of
                  illustrating the issues addressed and the methods used.
                  We also identify and discuss challenges to be addressed
                  in future research.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n1 (June 2009), article 3; "{Bayesian}" is brace-protected
%%% against style recasing, per this file's convention.
@Article{Li:2009:BAE,
  author =       "Xiao-Bai Li",
  title =        "A {Bayesian} Approach for Estimating and Replacing
                 Missing Categorical Data",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515695",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "We propose a new approach for estimating and replacing
                  missing categorical data. With this approach, the
                  posterior probabilities of a missing attribute value
                  belonging to a certain category are estimated using the
                  simple Bayes method. Two alternative methods for
                  replacing the missing value are proposed: The first
                  replaces the missing value with the value having the
                  estimated maximum probability; the second uses a value
                  that is selected with probability proportional to the
                  estimated posterior distribution. The effectiveness of
                  the proposed approach is evaluated based on some
                  important data quality measures for data warehousing
                  and data mining. The results of the experimental study
                  demonstrate the effectiveness of the proposed
                  approach.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n1 (June 2009), article 4.  The captured abstract's opening word
%%% "Enterprizes" was a scrape typo, corrected to "Enterprises"
%%% (matching "international companies" usage later in the abstract).
@Article{Weber:2009:OSD,
  author =       "Kristin Weber and Boris Otto and Hubert {\"O}sterle",
  title =        "One Size Does Not Fit All---{A} Contingency Approach
                 to Data Governance",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515696",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Enterprises need Data Quality Management (DQM) to
                  respond to strategic and operational challenges
                  demanding high-quality corporate data. Hitherto,
                  companies have mostly assigned accountabilities for DQM
                  to Information Technology (IT) departments. They have
                  thereby neglected the organizational issues critical to
                  successful DQM. With data governance, however,
                  companies may implement corporate-wide accountabilities
                  for DQM that encompass professionals from business and
                  IT departments. This research aims at starting a
                  scientific discussion on data governance by
                  transferring concepts from IT governance and
                  organizational theory to the previously largely ignored
                  field of data governance. The article presents the
                  first results of a community action research project on
                  data governance comprising six international companies
                  from various industries. It outlines a data governance
                  model that consists of three components (data quality
                  roles, decision areas, and responsibilities), which
                  together form a responsibility assignment matrix. The
                  data governance model documents data quality roles and
                  their type of interaction with DQM activities. In
                  addition, the article describes a data governance
                  contingency model and demonstrates the influence of
                  performance strategy, diversification breadth,
                  organization structure, competitive strategy, degree of
                  process harmonization, degree of market regulation, and
                  decision-making style on data governance. Based on
                  these findings, companies can structure their specific
                  data governance model.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n1 (June 2009), article 5; author names abbreviated as published.
@Article{Heinrich:2009:PDM,
  author =       "B. Heinrich and M. Klier and M. Kaiser",
  title =        "A Procedure to Develop Metrics for Currency and its
                 Application in {CRM}",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515697",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Due to the importance of using up-to-date data in
                  information systems, this article analyzes how the
                  data-quality dimension currency can be quantified.
                  Based on several requirements (e.g., normalization and
                  interpretability) and a literature review, we design a
                  procedure to develop probability-based metrics for
                  currency which can be adjusted to the specific
                  characteristics of data attribute values. We evaluate
                  the presented procedure with regard to the requirements
                  and illustrate the applicability as well as its
                  practical benefit. In cooperation with a major German
                  mobile services provider, the procedure was applied in
                  the field of campaign management in order to improve
                  both success rates and profits.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n2 (September 2009), article 6: special-issue editorial letter.
@Article{Madnick:2009:ELS,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Editorial Letter for the Special Issue on Data Quality
                 in Databases and Information Systems",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577841",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n2 (September 2009), article 7: guest editorial.
@Article{Naumann:2009:GES,
  author =       "Felix Naumann and Louiqa Raschid",
  title =        "Guest Editorial for the Special Issue on Data Quality
                 in Databases",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577842",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n2 (September 2009), article 8; abstract uses {\em ...\/} emphasis
%%% and keywords are reproduced with ACM's original capitalization.
@Article{Dash:2009:MLN,
  author =       "Manoranjan Dash and Ayush Singhania",
  title =        "Mining in Large Noisy Domains",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577843",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In this article we address the issue of how to mine
                  efficiently in large and noisy data. We propose an
                  efficient sampling algorithm ({\em Concise\/}) as a
                  solution for large and noisy data. Concise is far more
                  superior than the Simple Random Sampling ({\em SRS\/})
                  in selecting a representative sample. Particularly when
                  the data is very large and noisy, Concise achieves the
                  maximum gain over SRS. The comparison is in terms of
                  their impact on subsequent data mining tasks,
                  specifically, classification, clustering, and
                  association rule mining. We compared Concise with a few
                  existing noise removal algorithms followed by SRS.
                  Although the accuracy of mining results are similar,
                  Concise spends very little time compared to the
                  existing algorithms because Concise has linear time
                  complexity.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "association rule mining; classification; clustering;
                  data mining; Information filtering; sampling; selection
                  process",
}

%%% v1n2 (September 2009), article 9.  The captured abstract dropped a
%%% word ("a decision to made"); restored as "a decision to be made".
@Article{Moustakides:2009:OSR,
  author =       "George V. Moustakides and Vassilios S. Verykios",
  title =        "Optimal Stopping: a Record-Linkage Approach",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577844",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Record-linkage is the process of identifying whether
                  two separate records refer to the same real-world
                  entity when some elements of the record's identifying
                  information (attributes) agree and others disagree.
                  Existing record-linkage decision methodologies use the
                  outcomes from the comparisons of the whole set of
                  attributes. Here, we propose an alternative scheme that
                  assesses the attributes sequentially, allowing for a
                  decision to be made at any attribute's comparison
                  stage, and thus before exhausting all available
                  attributes. The scheme we develop is optimum in that it
                  minimizes a well-defined average cost criterion while
                  the corresponding optimum solution can be easily mapped
                  into a decision tree to facilitate the record-linkage
                  decision process. Experimental results performed in
                  real datasets indicate the superiority of our
                  methodology compared to existing approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "duplicate detection; optimal stopping;
                  Record-linkage",
}

%%% v1n2 (September 2009), article 10; the \par plus blank line inside
%%% the abstract is this file's convention for a paragraph break.
@Article{Klein:2009:RDQ,
  author =       "A. Klein and W. Lehner",
  title =        "Representing Data Quality in Sensor Data Streaming
                 Environments",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577845",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Sensors in smart-item environments capture data about
                  product conditions and usage to support business
                  decisions as well as production automation processes. A
                  challenging issue in this application area is the
                  restricted quality of sensor data due to limited sensor
                  precision and sensor failures. Moreover, data stream
                  processing to meet resource constraints in streaming
                  environments introduces additional noise and decreases
                  the data quality. In order to avoid wrong business
                  decisions due to dirty data, quality characteristics
                  have to be captured, processed, and provided to the
                  respective business task. However, the issue of how to
                  efficiently provide applications with information about
                  data quality is still an open research problem.\par

                  In this article, we address this problem by presenting
                  a flexible model for the propagation and processing of
                  data quality. The comprehensive analysis of common data
                  stream processing operators and their impact on data
                  quality allows a fruitful data evaluation and
                  diminishes incorrect business decisions. Further, we
                  propose the data quality model control to adapt the
                  data quality granularity to the data stream
                  interestingness.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "data quality; Data stream processing; smart items",
}

%%% v1n2 (September 2009), article 11; two-paragraph abstract (\par).
@Article{Embury:2009:IDS,
  author =       "Suzanne M. Embury and Paolo Missier and Sandra Sampaio
                 and R. Mark Greenwood and Alun D. Preece",
  title =        "Incorporating Domain-Specific Information Quality
                 Constraints into Database Queries",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577846",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The range of information now available in queryable
                  repositories opens up a host of possibilities for new
                  and valuable forms of data analysis. Database query
                  languages such as SQL and XQuery offer a concise and
                  high-level means by which such analyses can be
                  implemented, facilitating the extraction of relevant
                  data subsets into either generic or bespoke data
                  analysis environments. Unfortunately, the quality of
                  data in these repositories is often highly variable.
                  The data is still useful, but only if the consumer is
                  aware of the data quality problems and can work around
                  them. Standard query languages offer little support for
                  this aspect of data management. In principle, however,
                  it should be possible to embed constraints describing
                  the consumer's data quality requirements into the query
                  directly, so that the query evaluator can take over
                  responsibility for enforcing them during query
                  processing.\par

                  Most previous attempts to incorporate information
                  quality constraints into database queries have been
                  based around a small number of highly generic quality
                  measures, which are defined and computed by the
                  information provider. This is a useful approach in some
                  application areas but, in practice, quality criteria
                  are more commonly determined by the user of the
                  information not by the provider. In this article, we
                  explore an approach to incorporating quality
                  constraints into database queries where the definition
                  of quality is set by the user and not the provider of
                  the information. Our approach is based around the
                  concept of a {\em quality view}, a configurable quality
                  assessment component into which domain-specific notions
                  of quality can be embedded. We examine how quality
                  views can be incorporated into XQuery, and draw from
                  this the language features that are required in general
                  to embed quality views into any query language. We also
                  propose some syntactic sugar on top of XQuery to
                  simplify the process of querying with quality
                  constraints.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "database query languages; Information quality; views;
                  XQuery",
}

%%% v1n2 (September 2009), article 12: call for papers (front matter).
@Article{Madnick:2009:CPS,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Call for Papers Special Issue on Healthcare
                 Information Quality: the Challenges and Opportunities
                 in Healthcare Systems and Services",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577847",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n3 (December 2009), article 13: editors' comments.
@Article{Madnick:2009:ECW,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Editors' Comments: Where the {JDIQ} Articles Come
                 From: Incubating Research in an Emerging Field",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659226",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% v1n3 (December 2009), article 14; author initials as published.
@Article{Sessions:2009:TMD,
  author =       "V. Sessions and M. Valtorta",
  title =        "Towards a Method for Data Accuracy Assessment
                 Utilizing a {Bayesian} Network Learning Algorithm",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659227",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This research develops a data quality algorithm
                  entitled the Accuracy Assessment Algorithm (AAA). This
                  is an extension of research in developing an
                  enhancement to a Bayesian Network (BN) learning
                  algorithm called the Data Quality (DQ) algorithm. This
                  new algorithm is concerned with estimating the accuracy
                  levels of a dataset by assessing the quality of the
                  data with no prior knowledge of the dataset. The AAA
                  and associated metrics were tested using two canonical
                  BNs and one large-scale medical network. The article
                  presents the results regarding the efficacy of the
                  algorithm and the implications for future research and
                  practice.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "accuracy levels; Bayesian networks; data quality
                  assessment; PC algorithm",
}

%%% v1n3 (December 2009), article 15; abstract reproduced with ACM's
%%% {\em ...\/} emphasis markup.
@Article{Even:2009:DAD,
  author =       "Adir Even and G. Shankaranarayanan",
  title =        "Dual Assessment of Data Quality in Customer
                 Databases",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659228",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Quantitative assessment of data quality is critical
                  for identifying the presence of data defects and the
                  extent of the damage due to these defects. Quantitative
                  assessment can help define realistic quality
                  improvement targets, track progress, evaluate the
                  impacts of different solutions, and prioritize
                  improvement efforts accordingly. This study describes a
                  methodology for quantitatively assessing both impartial
                  {\em and\/} contextual data quality in large datasets.
                  Impartial assessment measures the extent to which a
                  dataset is defective, independent of the context in
                  which that dataset is used. Contextual assessment, as
                  defined in this study, measures the extent to which the
                  presence of defects reduces a dataset's utility, the
                  benefits gained by using that dataset in a specific
                  context. The dual assessment methodology is
                  demonstrated in the context of Customer Relationship
                  Management (CRM), using large data samples from
                  real-world datasets. The results from comparing the two
                  assessments offer important insights for directing
                  quality maintenance efforts and prioritizing quality
                  improvement solutions for this dataset. The study
                  describes the steps and the computation involved in the
                  dual-assessment methodology and discusses the
                  implications for applying the methodology in other
                  business contexts and data environments.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "CRM; customer relationship management; databases; Data
                  quality; information value; total data quality
                  management",
}

@Article{Fisher:2009:AMP,
  author =       "Craig W. Fisher and Eitel J. M. Lauria and Carolyn C.
                 Matheus",
  title =        "An Accuracy Metric: Percentages, Randomness, and
                 Probabilities",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659229",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Practitioners and researchers regularly refer to error
                 rates or accuracy percentages of databases. The former
                 is the number of cells in error divided by the total
                 number of cells; the latter is the number of correct
                 cells divided by the total number of cells. However,
                 databases may have similar error rates (or accuracy
                 percentages) but differ drastically in the complexity
                 of their accuracy problems. A simple percent does not
                 provide information as to whether the errors are
                 systematic or randomly distributed throughout the
                 database. We expand the accuracy metric to include a
                 randomness measure and include a probability
                 distribution value. The proposed randomness check is
                 based on the Lempel--Ziv (LZ) complexity measure.
                 Through two simulation studies we show that the LZ
                 complexity measure can clearly differentiate as to
                 whether the errors are random or systematic. This
                 determination is a significant first step and is a
                 major departure from the percentage-alone technique.
                 Once it is determined that the errors are random, a
                 probability distribution, Poisson, is used to help
                 address various managerial questions.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "complexity; Data and information quality; randomness",
}

@Article{Ababneh:2009:CSE,
  author =       "Sufyan Ababneh and Rashid Ansari and Ashfaq Khokhar",
  title =        "Compensated Signature Embedding for Multimedia Content
                 Authentication",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659230",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "One of the main goals of digital content
                 authentication and preservation techniques is to
                 guarantee the originality and quality of the
                 information. In this article, robust watermarking is
                 used to embed content-based fragile signatures in
                 multimedia signals to achieve efficient authentication
                 without requiring any third-party reference or side
                 information. To overcome the signature alteration
                 caused by the embedding perturbation and other possible
                 encoding operations, a closed-form compensation
                 technique is proposed for ensuring signature
                 consistency by employing a Lagrangian-based approach. A
                 minimum distortion criterion is used to ensure signal
                 quality. The effectiveness of the proposed approach is
                 investigated with simulations of examples of image
                 authentication in which signatures are designed to
                 reveal tamper localization. Results using quantitative
                 performance criteria show successful authentication
                 over a range of robustness in embedding watermarks
                 using both QIM-DM and spread-spectrum techniques. A
                 comparison with two iterative compensation schemes is
                 also presented.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "compensated signature embedding; Content
                 authentication; watermarking",
}

@Article{Madnick:2010:ECA,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "{Editors}' Comments: {ACM Journal of Data and
                 Information Quality (JDIQ)} is alive and well!",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805287",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Tremblay:2010:UDM,
  author =       "Monica Chiarini Tremblay and Kaushik Dutta and Debra
                 Vandermeer",
  title =        "Using Data Mining Techniques to Discover Bias Patterns
                 in Missing Data",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805288",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In today's data-rich environment, decision makers draw
                 conclusions from data repositories that may contain
                 data quality problems. In this context, missing data is
                 an important and known problem, since it can seriously
                 affect the accuracy of conclusions drawn. Researchers
                 have described several approaches for dealing with
                 missing data, primarily attempting to infer values or
                 estimate the impact of missing data on conclusions.
                 However, few have considered approaches to characterize
                 patterns of bias in missing data, that is, to determine
                 the specific attributes that predict the missingness of
                 data values. Knowledge of the specific systematic bias
                 patterns in the incidence of missing data can help
                 analysts more accurately assess the quality of
                 conclusions drawn from data sets with missing data.
                 This research proposes a methodology to combine a
                 number of Knowledge Discovery and Data Mining
                 techniques, including association rule mining, to
                 discover patterns in related attribute values that help
                 characterize these bias patterns. We demonstrate the
                 efficacy of our proposed approach by applying it on a
                 demo census dataset seeded with biased missing data.
                 The experimental results show that our approach was
                 able to find seeded biases and filter out most seeded
                 noise.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "Data quality; missing data; pattern discovery",
}

@Article{Jensen:2010:JCI,
  author =       "Matthew L. Jensen and Judee K. Burgoon and Jay F.
                 {Nunamaker, Jr.}",
  title =        "Judging the Credibility of Information Gathered from
                 Face-to-Face Interactions",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805289",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "One of the most pernicious threats to information
                 quality comes through perpetration of deception by
                 information suppliers. Deception undermines many
                 critical dimensions of information quality, such as
                 accuracy, completeness, and believability. Despite this
                 threat, information gatherers are ill equipped to
                 assess the credibility of information suppliers. This
                 work presents a prototype system that examines messages
                 gathered during direct, face-to-face information
                 gathering. The system unobtrusively identifies kinesic
                 and linguistic features that may indicate deception in
                 information suppliers' messages. System use was found
                 to significantly improve assessment ability in
                 between-subjects and within-subjects tests. The
                 improved ability to accurately assess credibility
                 during face-to-face interactions should yield higher
                 information quality.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "Credibility assessment; deception detection;
                 decision-aids; human-computer interaction; information
                 veracity; kinesics; linguistics",
}

@Article{Meda:2010:DDF,
  author =       "Hema S. Meda and Anup Kumar Sen and Amitava Bagchi",
  title =        "On Detecting Data Flow Errors in Workflows",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805290",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "When designing a business workflow, it is customary
                 practice to create the control flow structure first and
                 to ensure its correctness. Information about the flow
                 of data is introduced subsequently into the workflow
                 and its correctness is independently verified. Improper
                 specification of data requirements of tasks and XOR
                 splits can cause problems such as wrong branching at
                 XOR splits and the failure of tasks to execute. Here we
                 present a graph traversal algorithm called GTforDF for
                 detecting data flow errors in both nested and
                 unstructured workflows, and illustrate its operation on
                 realistic examples. Two of these have interconnected
                 loops and are free of control flow errors, and the
                 third one is an unstructured loop-free workflow. Our
                 approach extends and generalizes data flow verification
                 methods that have been recently proposed. It also makes
                 use of the concept of corresponding pairs lately
                 introduced in control flow verification. It thus has
                 the potential for development into a unified
                 algorithmic procedure for the concurrent detection of
                 control flow and data flow errors. The correctness of
                 the algorithm has been proved theoretically. It has
                 also been tested experimentally on many examples.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "Corresponding pair; Data flow errors; Workflow
                 management",
}

@Article{Magnani:2010:SUM,
  author =       "Matteo Magnani and Danilo Montesi",
  title =        "A Survey on Uncertainty Management in Data
                 Integration",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805291",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In the last few years, uncertainty management has come
                 to be recognized as a fundamental aspect of data
                 integration. It is now accepted that it may not be
                 possible to remove uncertainty generated during data
                 integration processes and that uncertainty in itself
                 may represent a source of relevant information. Several
                 issues, such as the aggregation of uncertain mappings
                 and the querying of uncertain mediated schemata, have
                 been addressed by applying well-known uncertainty
                 management theories. However, several problems lie
                 unresolved. This article sketches an initial picture of
                 this highly active research area; it details existing
                 works in the light of a homogeneous framework, and
                 identifies and discusses the leading issues awaiting
                 solutions.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "Data integration; uncertainty",
}

@Article{Talburt:2010:CPS,
  author =       "John R. Talburt and Stuart E. Madnick and Yang W.
                 Lee",
  title =        "Call for Papers: Special Issue on Entity Resolution",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1805286.1805292",
  ISSN =         "1936-1955",
  bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Madnick:2011:ESN,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Editorial: In Search of Novel Ideas and Solutions with
                 a Broader Context of Data Quality in Mind",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1891879.1891880",
  ISSN =         "1936-1955",
  bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Blake:2011:EID,
  author =       "Roger Blake and Paul Mangiameli",
  title =        "The Effects and Interactions of Data Quality and
                 Problem Complexity on Classification",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1891879.1891881",
  ISSN =         "1936-1955",
  bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gelman:2011:GGA,
  author =       "Irit Askira Gelman",
  title =        "{GIGO} or not {GIGO}: The Accuracy of Multi-Criteria
                 Satisficing Decisions",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1891879.1891882",
  ISSN =         "1936-1955",
  bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2011:GBN,
  author =       "Xiaoming Fan and Jianyong Wang and Xu Pu and Lizhu
                 Zhou and Bing Lv",
  title =        "On Graph-Based Name Disambiguation",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1891879.1891883",
  ISSN =         "1936-1955",
  bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ngugi:2011:TBI,
  author =       "Benjamin Ngugi and Beverly K. Kahn and Marilyn
                 Tremaine",
  title =        "Typing Biometrics: Impact of Human Learning on
                 Performance Quality",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "2",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1891879.1891884",
  ISSN =         "1936-1955",
  bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Madnick:2011:ENC,
  author =       "Stuart E. Madnick and Yang W. Lee",
  title =        "Editorial Notes: Classification and Assessment of
                 Large Amounts of Data: Examples in the Healthcare
                 Industry and Collaborative Digital Libraries",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "3",
  pages =        "12:1--12:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2063504.2063505",
  ISSN =         "1936-1955",
  bibdate =      "Thu Dec 15 09:41:55 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lauria:2011:CBT,
  author =       "Eitel J. M. Laur{\'\i}a and Alan D. March",
  title =        "Combining {Bayesian} Text Classification and Shrinkage
                 to Automate Healthcare Coding: a Data Quality
                 Analysis",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "3",
  pages =        "13:1--13:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2063504.2063506",
  ISSN =         "1936-1955",
  bibdate =      "Thu Dec 15 09:41:55 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Dalip:2011:AAD,
  author =       "Daniel Hasan Dalip and Marcos Andr{\'e}
                 Gon{\c{c}}alves and Marco Cristo and P{\'a}vel Calado",
  title =        "Automatic Assessment of Document Quality in {Web}
                 Collaborative Digital Libraries",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2063504.2063507",
  ISSN =         "1936-1955",
  bibdate =      "Thu Dec 15 09:41:55 MST 2011",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Muller:2012:IDQ,
  author =       "Heiko M{\"u}ller and Johann-Christoph Freytag and Ulf
                 Leser",
  title =        "Improving data quality by source analysis",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "4",
  pages =        "15:1--15:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2107536.2107538",
  ISSN =         "1936-1955",
  bibdate =      "Fri Mar 16 15:01:48 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In many domains, data cleaning is hampered by our
                 limited ability to specify a comprehensive set of
                 integrity constraints to assist in identification of
                 erroneous data. An alternative approach to improve data
                 quality is to exploit different data sources that
                 contain information about the same set of objects. Such
                 overlapping sources highlight hot-spots of poor data
                 quality through conflicting data values and immediately
                 provide alternative values for conflict resolution. In
                 order to derive a dataset of high quality, we can merge
                 the overlapping sources based on a quality assessment
                 of the conflicting values. The quality of the resulting
                 dataset, however, is highly dependent on our ability to
                  assess the quality of conflicting values effectively.
                 The main objective of this article is to introduce
                 methods that aid the developer of an integrated system
                 over overlapping, but contradicting sources in the task
                 of improving the quality of data. Value conflicts
                 between contradicting sources are often systematic,
                 caused by some characteristic of the different sources.
                 Our goal is to identify such systematic differences and
                 outline data patterns that occur in conjunction with
                 them. Evaluated by an expert user, the regularities
                 discovered provide insights into possible conflict
                 reasons and help to assess the quality of inconsistent
                 values. The contributions of this article are two
                 concepts of systematic conflicts: contradiction
                 patterns and minimal update sequences. Contradiction
                 patterns resemble a special form of association rules
                 that summarize characteristic data properties for
                 conflict occurrence. We adapt existing association rule
                 mining algorithms for mining contradiction patterns.
                 Contradiction patterns, however, view each class of
                 conflicts in isolation, sometimes leading to largely
                 overlapping patterns. Sequences of set-oriented update
                 operations that transform one data source into the
                 other are compact descriptions for all regular
                 differences among the sources. We consider minimal
                 update sequences as the most likely explanation for
                 observed differences between overlapping data sources.
                 Furthermore, the order of operations within the
                 sequences point out potential dependencies between
                 systematic differences. Finding minimal update
                 sequences, however, is beyond reach in practice. We
                 show that the problem already is NP-complete for a
                 restricted set of operations. In the light of this
                 intractability result, we present heuristics that lead
                 to convincing results for all examples we considered.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gelman:2012:BMC,
  author =       "Irit Askira Gelman",
  title =        "Biases in multi-criteria, satisficing decisions due to
                  data errors",
  journal =      j-JDIQ,
  volume =       "2",
  number =       "4",
  pages =        "16:1--16:??",
  month =        feb,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2107536.2107539",
  ISSN =         "1936-1955",
  bibdate =      "Fri Mar 16 15:01:48 MDT 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This inquiry centers on an asymmetry, or bias, in the
                 accuracy of multi-criteria, conjunctive, and
                 disjunctive decisions, which originates from
                 fundamental properties of the logical conjunction and
                 disjunction operations. A mathematical-statistical
                 analysis indicates that, as we keep adding criteria to
                 a multi-criteria conjunctive or disjunctive decision
                 rule, errors in the data produce decision errors
                 asymmetrically. As a result, in conjunctive decisions,
                 the probability of a false negative increases while the
                 probability of a false positive decreases. In contrast,
                 in disjunctive decisions, as we keep adding criteria,
                 the probability of a false positive increases while
                 that of a false negative decreases. For instance, in a
                 conjunctive business decision rule, the probability of
                 overlooking a bargain can be far greater than the
                 probability of misjudging an unattractive offer to be a
                 good one. A series of Monte Carlo simulations validates
                 the analytical findings and explores the contribution
                 of several additional factors.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sachdeva:2012:SIS,
  author =       "Shelly Sachdeva and Subhash Bhalla",
  title =        "Semantic interoperability in standardized electronic
                 health record databases",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        apr,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2166788.2166789",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:12 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Different clinics and hospitals have their own
                 information systems to maintain patient data. This
                 hinders the exchange of data among systems (and
                 organizations). Hence there is a need to provide
                 standards for data exchange. In digitized form, the
                 individual patient's medical record can be stored,
                 retrieved, and shared over a network through
                 enhancement in information technology. Thus, electronic
                 health records (EHRs) should be standardized,
                 incorporating semantic interoperability. A subsequent
                 step requires that healthcare professionals and
                 patients get involved in using the EHRs, with the help
                 of technological developments. This study aims to
                 provide different approaches in understanding some
                 current and challenging concepts in health informatics.
                 Successful handling of these challenges will lead to
                 improved quality in healthcare by reducing medical
                 errors, decreasing costs, and enhancing patient care.
                 The study is focused on the following goals: (1)
                 understanding the role of EHRs; (2) understanding the
                 need for standardization to improve quality; (3)
                 establishing interoperability in maintaining EHRs; (4)
                 examining a framework for standardization and
                 interoperability (the openEHR architecture); (5)
                 identifying the role of archetypes for knowledge-based
                 systems; and (6) understanding the difficulties in
                  querying EHR data.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Brown:2012:DQT,
  author =       "Steven Brown and Trent S. Rosenbloom and Shawn P.
                 Hardenbrook and Terry Clark and Elliot Fielstein and
                 Peter Elkin and Ted Speroff",
  title =        "Documentation quality and time costs: a randomized
                 controlled trial of structured entry versus dictation",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        apr,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2166788.2166790",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:12 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The Department of Veterans Affairs (VA) performs over
                 800,000 disability exams and distributes over
                 \$37 billion in disability benefits per year.
                 VA developed and deployed a computer-based disability
                 exam documentation system in order to improve exam
                 report quality and timeliness. We conducted a
                 randomized controlled trial comparing joint disability
                 examinations supported by computerized templates to the
                 examinations documented via dictation, to determine if
                 the system met the intended goals or had unintended
                 consequences. Consenting veterans were randomized to
                 undergo exams documented using computerized templates
                 or via dictation. We compared exam report quality,
                 documentation time costs, encounter length, total time
                 to fulfill an exam request with a finalized exam
                 report, and veteran satisfaction. Computer-based
                 templates resulted in disability exam reports that had
                 higher quality scores (p = 0.042) and were returned to
                 the requesting office faster than exam reports created
                 via dictation (p = 0.02). Documentation time and veteran
                 satisfaction were similar for both the documentation
                 techniques. Encounter length was significantly longer
                 for the template group. Computer-based templates
                 impacted the VA disability evaluation system by
                 improving report quality scores and production time and
                 lengthening encounter times. Oversight bodies have
                 called for mandated use of computer-based templates
                 nationwide. We believe mandates regarding use of health
                 information technology should be guided by data
                 regarding its positive and negative impacts.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sunyaev:2012:SCD,
  author =       "Ali Sunyaev and Dmitry Chornyi",
  title =        "Supporting chronic disease care quality: Design and
                 implementation of a health service and its integration
                 with electronic health records",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "2",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2184442.2184443",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:12 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Chronic medical conditions take a huge toll on lives
                 of a growing number of people and are a major
                 contributor to the rising costs in healthcare. As
                 patients are increasingly willing to take an active
                 part in managing their conditions, chronic disease
                 self-management programs and information systems that
                 support them are recognized for their potential to
                 improve the quality of healthcare delivery. These
                 programs often rely on recording longitudinal patient
                 data and analyzing it. Therefore, maintaining
                 appropriate data quality is important for
                 self-management programs to be efficient and safe. We
                 designed and implemented a prototype of a health
                 self-management service for chronically ill people. It
                 is a distributed application that supports patients
                 with diabetes at tracking their blood glucose levels.
                 The main design goals were usability, extensibility,
                 security, and interoperability. The system integrates
                 with the Microsoft HealthVault and Google Health
                 personal health record platforms. It utilizes
                 industry-strength storage and security mechanisms, is
                 scalable, and as a result, can be used to gather,
                 securely store, and analyze patient data over long
                 periods of time. In this article we examine how
                 software information technology can support chronic
                 disease self-management and its impact on the quality
                 of patient data. Furthermore, we describe the
                 requirements that drove the system's development, its
                 architecture, and design decisions.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Elizabeth:2012:NSA,
  author =       "D. Shiloah Elizabeth and H. Khanna Nehemiah and C.
                 Sunil Retmin Raj and A. Kannan",
  title =        "A novel segmentation approach for improving diagnostic
                 accuracy of {CAD} systems for detecting lung cancer
                 from chest computed tomography images",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "2",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2184442.2184444",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:12 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Segmentation of lung tissue is an important and
                 challenging task in any computer aided diagnosis
                 system. The accuracy of the segmentation subsystem
                 determines the performance of the other subsystems in
                 any computer aided diagnosis system based on image
                 analysis. We propose a novel technique for segmentation
                 of lung tissue from computed tomography of the chest.
                 Manual segmentation of lung parenchyma becomes
                 difficult with an enormous volume of images. The goal
                 of this work is to present an automated approach to
                 segmentation of lung parenchyma from the rest of the
                 chest CT image. The approach involves the conventional
                 optimal thresholding technique and operations based on
                 convex edge and centroid properties of the lung region.
                 The segmentation technique proposed in this article can
                 be used to preprocess lung images given to a computer
                 aided diagnosis system for diagnosis of lung disorders.
                 This improves the diagnostic performance of the system.
                 This has been tested by using it in a computer aided
                 diagnosis system that was used for detection of lung
                 cancer from chest computed tomography images. The
                 results obtained show that the lungs can be correctly
                 segmented even in the presence of peripheral pathology
                 bearing regions; pathology bearing regions that could
                 not be detected using a CAD system that applies optimal
                 thresholding could be detected using a CAD system using
                 our proposed approach for segmentation of lungs.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Yakout:2012:EPA,
  author =       "Mohamed Yakout and Mikhail J. Atallah and Ahmed
                 Elmagarmid",
  title =        "Efficient and Practical Approach for Private Record
                 Linkage",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "3",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287714.2287715",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:13 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Record linkage is used to associate entities from
                 multiple data sources. For example, two organizations
                 contemplating a merger may want to know how common
                 their customer bases are so that they may better assess
                 the benefits of the merger. Another example is a
                 database of people who are forbidden from a certain
                 activity by regulators, may need to be compared to a
                 list of people engaged in that activity. The autonomous
                 entities who wish to carry out the record matching
                 computation are often reluctant to fully share their
                 data; they fear losing control over its subsequent
                 dissemination and usage, or they want to insure privacy
                 because the data is proprietary or confidential, and/or
                 they are cautious simply because privacy laws forbid
                 its disclosure or regulate the form of that disclosure.
                 In such cases, the problem of carrying out the linkage
                 computation without full data exchange has been called
                 private record linkage. Previous private record linkage
                 techniques have made use of a third party. We provide
                 efficient techniques for private record linkage that
                 improve on previous work in that (1) our techniques
                 make no use of a third party, and (2) they achieve much
                 better performance than previous schemes in terms of
                 their execution time while maintaining acceptable
                 quality of output compared to nonprivacy settings. Our
                 protocol consists of two phases. The first phase
                 primarily produces candidate record pairs for matching,
                 by carrying out a very fast (but not accurate) matching
                 between such pairs of records. The second phase is a
                 novel protocol for efficiently computing distances
                 between each candidate pair (without any expensive
                 cryptographic operations such as modular
                 exponentiations). Our experimental evaluation of our
                 approach validates these claims.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Yang:2012:ECD,
  author =       "Yanjuan Yang and Michael Mannino",
  title =        "An Experimental Comparison of a Document Deception
                 Detection Policy using Real and Artificial Deception",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "3",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2287714.2287716",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:13 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Developing policies to screen documents for deception
                 is often hampered by the cost of data collection and
                 the inability to evaluate policy alternatives due to
                 lack of data. To lower data collection costs and
                 increase the amount of data, artificially generated
                 deception data can be used, but the impact of using
                 artificially generated deception data is not well
                 understood. This article studies the impact of
                 artificially generated deception on document screening
                 policies. The deception and truth data were collected
                 from financial aid applications, a document-centric
                 area with limited resources for screening. Real
                 deception was augmented with artificial data generated
                 by noise and deception generation models. Using the
                 real data and artificially generated data, we designed
                 an innovative experiment with deception type and
                 deception rate as factors, and harmonic mean and cost
                 as outcome variables. We used two budget models (fixed
                 and variable) typically employed by financial aid
                 offices to measure the cost of noncompliance in
                 financial aid applications. The analysis included an
                 evaluation of a common policy for deception screening
                 using both fixed and varying screening rates. The
                 results of the experiment provided evidence of similar
                 performance of screening policy with real and
                 artificial deception, suggesting the possibility of
                 using artificially generated deception to reduce the
                 costs associated with obtaining training data.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Robb:2012:INU,
  author =       "David A. Robb and Paul L. Bowen and A. Faye Borthick
                 and Fiona H. Rohde",
  title =        "Improving New Users' Query Performance: Deterring
                 Premature Stopping of Query Revision with Information
                 for Forming Ex Ante Expectations",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "4",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348828.2348829",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:14 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "As the volume of data in organizational databases
                 grows, organizations are seeking to use this data to
                 improve organizational success. To this end, users are
                 being asked to query these databases to provide
                 information to help answer questions posed by key
                 management personnel. Users who have had extensive
                 experience with an organization's data can often detect
                 the presence of errors in their queries when query
                 results do not correspond to their ex ante
                 expectations. New users, however, are less familiar
                 with the data they will be querying. Having no, or
                 limited, ex ante expectations for query results, new
                 users may be unaware that the result produced by their
                 query is incorrect. Unwarranted confidence in the
                 correctness of their queries predisposes these users to
                 stop looking for query errors even when their queries
                 still contain errors. This behavior, premature stopping
                 of query revision, prompts investigating whether new
                 users' query performance would improve if they were not
                 only provided with, but used, readily available
                 information to form ex ante expectations. Our results
                 demonstrated a threshold effect in new users heeding
                 information for forming ex ante expectations. That is,
                 the mere availability of information for forming ex
                 ante expectations made no difference in query
                 performance. When admonishing users to heed ex ante
                 information, however, there was an associated increase
                 in the accuracy of their queries. These results suggest
                 that users unfamiliar with a particular database might
                 make fewer query errors if they not only received
                 readily available information but were then prompted to
                 use the information to form ex ante expectations for
                 query results.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Varol:2012:HMA,
  author =       "Cihan Varol and Coskun Bayrak",
  title =        "Hybrid Matching Algorithm for Personal Names",
  journal =      j-JDIQ,
  volume =       "3",
  number =       "4",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2348828.2348830",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:14 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib;
                 http://www.math.utah.edu/pub/tex/bib/spell.bib",
  abstract =     "Companies acquire personal information from phone,
                 World Wide Web, or email in order to sell or send an
                 advertisement about their product. However, when this
                 information is acquired, moved, copied, or edited, the
                 data may lose its quality. Often, the use of data
                 administrators or a tool that has limited capabilities
                 to correct the mistyped information can cause many
                 problems. Moreover, most of the correction techniques
                 are particularly implemented for the words used in
                 daily conversations. Since personal names have
                 different characteristics compared to general text, a
                 hybrid matching algorithm (PNRS) which employs phonetic
                 encoding, string matching and statistical facts to
                 provide a possible candidate for misspelled names is
                 developed. At the end, the efficiency of the proposed
                 algorithm is compared with other well known spelling
                 correction techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{ODonoghue:2012:ISI,
  author =       "John O'Donoghue and Jane Grimson and Katherine
                 Seelman",
  title =        "Introduction to the Special Issue on Information
                 Quality: The Challenges and Opportunities in Healthcare
                 Systems and Services",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2378016.2378017",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:14 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Collins:2012:CGF,
  author =       "Claire Collins and Kelly Janssens",
  title =        "Creating a General (Family) Practice Epidemiological
                 Database in {Ireland} --- Data Quality Issue
                 Management",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2378016.2378018",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:14 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In Ireland, while detailed information is available
                 regarding hospital attendance, little is known
                 regarding general (family) practice attendance.
                 However, it is conservatively estimated that there are
                 almost nine times as many general practice encounters
                 as there are hospital encounters each year in
                 Ireland. This represents a very significant gap in
                 health information. Indeed, general practice has been
                 shown in other countries to be an important and rich
                 source of information about the health of the
                 population, their behaviors and their utilization of
                 health services. Funded by the Health Information and
                 Quality Authority (HIQA), the Irish College of General
                 Practitioners (ICGP) undertook a feasibility study of
                 diagnostic coding of routinely entered patient data and
                 the creation of a national general practice morbidity
                 and epidemiological database (GPMED project). This
                 article outlines the process of data quality issue
                 management undertaken. The study's findings suggest
                 that the quality of data collection and reporting
                 structures available in general practice throughout
                 Ireland at the outset of this project were not adequate
                 to permit the creation of a database of sufficient
                 quality for service planning and policy or
                 epidemiological research. Challenges include the dearth
                 of a minimum standard of data recorded in consultations
                 by GPs and the absence of the digital data recording
                 and exporting infrastructure within Irish patient
                 management software systems. In addition, there is at
                 present a lack of recognition regarding the value of
                 such data for patient management and service
                 planning---including importantly, data collectors who
                 do not fully accept the merit of maintaining data,
                 which has a direct consequence for data quality. The
                 work of this project has substantial implications for
                 the data available to the health sector in Ireland and
                 contributes to the knowledge base internationally
                 regarding general practice morbidity data.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cure:2012:IDQ,
  author =       "Olivier Cur{\'e}",
  title =        "Improving the Data Quality of Drug Databases using
                 Conditional Dependencies and Ontologies",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2378016.2378019",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:14 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Many health care systems and services exploit drug
                 related information stored in databases. The poor data
                 quality of these databases, e.g. inaccuracy of drug
                 contraindications, can lead to catastrophic
                 consequences for the health condition of patients.
                 Hence it is important to ensure their quality in terms
                 of data completeness and soundness. In the database
                 domain, standard Functional Dependencies (FDs) and
                 INclusion Dependencies (INDs), have been proposed to
                 prevent the insertion of incorrect data. But they are
                 generally not expressive enough to represent a
                 domain-specific set of constraints. To this end,
                 conditional dependencies, i.e. standard dependencies
                 extended with tableau patterns containing constant
                 values, have been introduced and several methods have
                 been proposed for their discovery and representation.
                 The quality of drug databases can be considerably
                 improved by their usage. Moreover, pharmacology
                 information is inherently hierarchical and many
                 standards propose graph structures to represent them,
                 e.g. the Anatomical Therapeutic Chemical classification
                 (ATC) or OpenGalen's terminology. In this article, we
                 emphasize that the technologies of the Semantic Web are
                 adapted to represent these hierarchical structures,
                 i.e. in RDFS and OWL. We also present a solution for
                 representing conditional dependencies using a query
                 language defined for these graph oriented structures,
                 namely SPARQL. The benefits of this approach are
                 interoperability with applications and ontologies of
                 the Semantic Web as well as a reasoning-based query
                 execution solution to clean underlying databases.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{McNaull:2012:DIQ,
  author =       "James McNaull and Juan Carlos Augusto and Maurice
                 Mulvenna and Paul McCullagh",
  title =        "Data and Information Quality Issues in Ambient
                 Assisted Living Systems",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2378016.2378020",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:14 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Demographic aging, as a result of people living for
                 longer, has put an increased burden on health and
                 social care provision across most of the economies of
                 the developed and developing world. In order to cope
                 with the greater numbers of older people, together with
                 increasing prevalence of chronic diseases, governments
                 are looking to new ways to provide care and support to
                 older people and their care providers. A growing trend
                 is where health and social care providers are moving
                 towards the use of assisted living technologies to
                 provide care and assistance in the home. In this
                 article, the research area of Ambient Assisted Living
                 (AAL) systems is examined and the data, information and
                 the higher-level contextual knowledge quality issues in
                 relation to these systems, is discussed. Lack of
                 quality control may result in an AAL system providing
                 assistance and support based upon incorrect data,
                 information and knowledge inputs, and this may have a
                 detrimental effect on the person making use of the
                 system. We propose a model whereby contextual knowledge
                 gained during the AAL system's reasoning cycle can be
                 fed back to aid in further quality checking at the
                 various architectural layers, and a realistic AAL
                 scenario is provided to support this. Future research
                 should be conducted in these areas, with the
                 requirement of building quality criteria into the
                 design and implementation of AAL systems.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{ODonoghue:2012:DMW,
  author =       "John O'Donoghue and John Herbert",
  title =        "Data Management within {mHealth} Environments: Patient
                 Sensors, Mobile Devices, and Databases",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        oct,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2378016.2378021",
  ISSN =         "1936-1955",
  bibdate =      "Thu Nov 8 18:27:14 MST 2012",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Pervasive environments generate large quantities of
                 data, originating from backend servers, portable
                 devices, and wireless mobile sensors. Pervasive sensing
                 devices that monitor properties of the environment
                 (including human beings) can be a large data source.
                 Unprocessed datasets may include data that is faulty
                 and irrelevant, and data that is important and useful.
                 If not managed correctly the large amount of data from
                 a data-rich pervasive environment may result in
                 information overload or delivery of incorrect
                 information. Context-sensitive quality data management
                 aims to gather, verify, process, and manage the
                 multiple data sources in a pervasive environment in
                 order to deliver high quality, relevant information to
                 the end-user. Managing the quality of data from
                 different sources, correlating related data, and making
                 use of context, are all essential in providing end
                 users with accurate and meaningful data in real time.
                 This requirement is especially true for critical
                 applications such as in a medical environment. This
                 article presents the Data Management System (DMS)
                 architecture. It is designed to deliver quality data
                 service to its users. The DMS architecture employs an
                 agent-based middleware to intelligently and effectively
                 manage all pervasive data sources, and to make use of
                 context to deliver relevant information to the
                 end-user. Two of the DMS components are presented: (1)
                 data validation and (2) data consistency. The DMS
                 components have been rigorously evaluated using various
                 medical-based test cases. This article demonstrates a
                 careful, precise approach to data based on the quality
                 of the data and the context of its use. It emphasises
                 the DMS architecture and the role of software agents in
                 providing quality data management.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Talburt:2013:SIE,
  author =       "John R. Talburt",
  title =        "Special Issue on Entity Resolution Overview: The
                 Criticality of Entity Resolution in Data and
                 Information Quality",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "2",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435221.2435222",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Song:2013:DIE,
  author =       "Dezhao Song and Jeff Heflin",
  title =        "Domain-Independent Entity Coreference for Linking
                 Ontology Instances",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "2",
  pages =        "7:1--7:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435221.2435223",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The objective of entity coreference is to determine if
                 different mentions (e.g., person names, place names,
                 database records, ontology instances, etc.) refer to
                 the same real-world object. Entity coreference
                 algorithms can be used to detect duplicate database
                 records and to determine if two Semantic Web instances
                 represent the same underlying real-world entity. The key
                 issues in developing an entity coreference algorithm
                 include how to locate context information and how to
                 utilize the context appropriately. In this article, we
                 present a novel entity coreference algorithm for
                 ontology instances. For scalability reasons, we select
                 a neighborhood of each instance from an RDF graph. To
                 determine the similarity between two instances, our
                 algorithm computes the similarity between comparable
                 property values in the neighborhood graphs. The
                 similarity of distinct URIs and blank nodes is computed
                 by comparing their outgoing links. In an attempt to
                 reduce the impact of distant nodes on the final
                 similarity measure, we explore a distance-based
                 discounting approach. To provide the best possible
                 domain-independent matches, we propose an approach to
                 compute the discriminability of triples in order to
                 assign weights to the context information. We evaluated
                 our algorithm using different instance categories from
                 five datasets. Our experiments show that the best
                 results are achieved by including both our discounting
                 and triple discrimination approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Nuray-Turan:2013:ACS,
  author =       "Rabia Nuray-Turan and Dmitri V. Kalashnikov and Sharad
                 Mehrotra",
  title =        "Adaptive Connection Strength Models for
                 Relationship-Based Entity Resolution",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435221.2435224",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Entity Resolution (ER) is a data quality challenge
                 that deals with ambiguous references in data and whose
                 task is to identify all references that co-refer. Due
                 to practical significance of the ER problem, many
                 creative ER techniques have been proposed in the past,
                 including those that analyze relationships that exist
                 among entities in data. Such approaches view the
                 database as an entity-relationship graph, where direct
                 and indirect relationships correspond to paths in the
                 graph. These techniques rely on measuring the
                 connection strength among various nodes in the graph by
                 using a connection strength (CS) model. While such
                 approaches have demonstrated significant advantage over
                 traditional ER techniques, currently they also have a
                 significant limitation: the CS models that they use are
                 intuition-based fixed models that tend to behave well
                 in general, but are very generic and not tuned to a
                 specific domain, leading to suboptimal result quality.
                 Hence, in this article we propose an approach that
                 employs supervised learning to adapt the connection
                 strength measure to the given domain using the
                 available past/training data. The adaptive approach has
                 several advantages: it increases both the quality and
                 efficiency of ER and it also minimizes the domain
                 analyst participation needed to tune the CS model to
                 the given domain. The extensive empirical evaluation
                 demonstrates that the proposed approach reaches up to
                 8\% higher accuracy than the graph-based ER methods
                 that use fixed and intuition-based CS models.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Panse:2013:IHU,
  author =       "Fabian Panse and Maurice van Keulen and Norbert
                 Ritter",
  title =        "Indeterministic Handling of Uncertain Decisions in
                 Deduplication",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435221.2435225",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In current research and practice, deduplication is
                 usually considered as a deterministic approach in which
                 database tuples are either declared to be duplicates or
                 not. In ambiguous situations, however, it is often not
                 completely clear-cut, which tuples represent the same
                 real-world entity. In deterministic approaches, many
                 realistic possibilities may be ignored, which in turn
                 can lead to false decisions. In this article, we
                 present an indeterministic approach for deduplication
                 by using a probabilistic target model including
                 techniques for proper probabilistic interpretation of
                 similarity matching results. Thus, instead of deciding
                 for one of the most likely situations, all realistic
                 situations are modeled in the resultant data. This
                 approach minimizes the negative impact of false
                 decisions. Moreover, the deduplication process becomes
                 almost fully automatic and human effort can be largely
                 reduced. To increase applicability, we introduce
                 several semi-indeterministic methods that heuristically
                 reduce the set of indeterministically handled decisions
                 in several meaningful ways. We also describe a
                 full-indeterministic method for theoretical and
                 presentational reasons.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zhou:2013:GLC,
  author =       "Yinle Zhou and Eric Nelson and Fumiko Kobayashi and
                 John R. Talburt",
  title =        "A Graduate-Level Course on Entity Resolution and
                 Information Quality: a Step toward {ER} Education",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "2",
  pages =        "10:1--10:??",
  month =        mar,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2435221.2435226",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This article discusses the topics, approaches, and
                 lessons learned in teaching a graduate-level course
                 covering entity resolution (ER) and its relationship to
                 information quality (IQ). The course surveys a broad
                 spectrum of ER topics and activities including entity
                 reference extraction, entity reference preparation,
                 entity reference resolution techniques, entity identity
                 management, and entity relationship analysis. The
                 course content also attempts to balance aspects of ER
                 theory with practical application through a series of
                 laboratory exercises coordinated with the lecture
                 topics. As an additional teaching aid, a configurable,
                 open-source entity resolution engine (OYSTER) was
                 developed that allows students to experiment with
                 different types of ER architectures including
                 merge-purge, record linking, identity resolution, and
                 identity capture.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cao:2013:NAD,
  author =       "Lan Cao and Hongwei Zhu",
  title =        "Normal accidents: Data quality problems in
                 {ERP}-enabled manufacturing",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "3",
  pages =        "11:1--11:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:05 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The efficient operation of Enterprise Resource
                 Planning (ERP) systems largely depends on data quality.
                 ERP can improve data quality and information sharing
                 within an organization. It can also pose challenges to
                 data quality. While it is well known that data quality
                 is important in ERP systems, most existing research has
                 focused on identifying the factors affecting the
                 implementation and the business values of ERP. With
                 normal accident theory as a theoretical lens, we
                 examine data quality problems in ERP using a case study
                 of a large, fast-growing multinational manufacturer
                 headquartered in China. Our findings show that
                 organizations that have successfully implemented ERP
                 can still experience certain data quality problems. We
                 identify major data quality problems in data
                 production, storage and maintenance, and utilization
                 processes. We also analyze the causes of these data
                 quality problems by linking them to certain
                 characteristics of ERP systems within an organizational
                 context. Our analysis shows that problems resulting
                 from the tight coupling effects and the complexity of
                 ERP-enabled manufacturing systems can be inevitable.
                 This study will help researchers and practitioners
                 formulate data management strategies that are effective
                 in the presence of certain ``normal'' data quality
                 problems.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Biran:2013:CII,
  author =       "Dov Biran and Michael H. Zack and Richard J. Briotta",
  title =        "Competitive intelligence and information quality: a
                 game-theoretic perspective",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "3",
  pages =        "12:1--12:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:05 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "To better understand a competitor's tactical and
                 strategic plans, companies need to take a closer look
                 at competitive intelligence or they risk missing
                 lucrative opportunities. Because of this there is a
                 growing interest in competitive intelligence and
                 intelligence information gathering systems (IIS). This
                 article uses game-theoretic concepts to develop an
                 analytic framework to assess the value of deploying a
                 competitive intelligence gathering information system.
                 Modeling the competitive environment as a game provides
                 a useful approach to study and evaluate competitive
                 strategies given diverse assumptions about the quality
                 of the information known by the players. When
                 determining the value of deploying an IIS, decision
                 makers need to examine three components of the
                 competitive environment: the competitive rules of the
                 game, the state of player knowledge, and the
                 reliability of the information gathered. This framework
                 focuses on competitive environments where the players'
                 state of knowledge (i.e., common versus covert
                 knowledge) and the reliability of the information
                 generated are essential to the decision making process.
                 The article concludes with implications for research
                 and practice.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Joglekar:2013:AAD,
  author =       "Nitin R. Joglekar and Edward G. Anderson and G.
                 Shankaranarayanan",
  title =        "Accuracy of aggregate data in distributed project
                 settings: Model, analysis and implications",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        may,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "1936-1955",
  bibdate =      "Sat Jun 22 12:13:05 MDT 2013",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "We examine the management of data accuracy in
                 inter-organizational data exchanges using the context
                 of distributed software projects. Organizations
                 typically manage projects by outsourcing portions of
                 the project to partners. Managing a portfolio of such
                 projects requires sharing data regarding the status of
                 work-in-progress residing with the partners and
                 estimates of these projects' completion times.
                 Portfolio managers use these data to assign projects to
                 be outsourced to partners. These data are rarely
                 accurate. Unless these data are filtered, inaccuracies
                 can lead to myopic and expensive sourcing decisions. We
                 develop a model that uses project-status data to
                 identify an optimal assignment of projects to be
                 outsourced. This model permits corruption of
                 project-status data. We use this model to compute the
                 costs of using perfect versus inaccurate project-status
                 data and show that the costs of deviation from optimal
                 are sizable when the inaccuracy in the data is
                 significant. We further propose a filter to correct
                 inaccurate project-status data and generate an estimate
                 of true progress. With this filter, depending on the
                 relative magnitudes of errors, we show that accuracy of
                 project-status data can be improved and the associated
                 economic benefit is significant. We illustrate the
                 improvement in accuracy and associated economic benefit
                 by instantiating the model and the filter. We further
                 elaborate on how the model parameters may be estimated
                 and used in practice.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Raschid:2014:E,
  author =       "Louiqa Raschid",
  title =        "Editorial",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "4",
  pages =        "14:1--14:??",
  month =        may,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2579167",
  ISSN =         "1936-1955",
  bibdate =      "Tue May 27 16:54:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Wijnhoven:2014:VBF,
  author =       "Fons Wijnhoven and Chintan Amrit and Pim Dietz",
  title =        "Value-Based File Retention: File Attributes as File
                 Value and Information Waste Indicators",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "4",
  pages =        "15:1--15:??",
  month =        may,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567656",
  ISSN =         "1936-1955",
  bibdate =      "Tue May 27 16:54:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Several file retention policy methods propose that a
                 file retention policy should be based on file value.
                 Though such a retention policy might increase the value
                 of accessible files, the method to arrive at such a
                 policy is under-researched. This article discusses how
                 one can arrive at a method for developing file
                 retention policies based on the use values of files.
                 The method's applicability is initially assessed
                 through a case study at Capgemini, Netherlands. In the
                 case study, we hypothesize that one can develop a file
                 retention policy by testing causal relations between
                 file attributes (as used by file retention methods) and
                 the use value of files. Unfortunately, most file
                 attributes used by file retention methods have a weak
                 correlation with file value, resulting in the
                 conclusion that these methods do not well select out
                 high- and low-value files. This would imply the
                 ineffectiveness of the used attributes in our study or
                 errors in our conceptualization of file value. We
                 continue with the last possibility and develop
                 indicators for file utility (with low utility being
                 waste). With this approach we were able to detect waste
                 files, in a sample of files, with an accuracy of 80\%.
                 We therefore not only suggest further research in
                 information waste detection as part of a file retention
                 policy, but also to further explore other file
                 attributes that could better predict file value and
                 file utility.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2014:IBR,
  author =       "Wenfei Fan and Shuai Ma and Nan Tang and Wenyuan Yu",
  title =        "Interaction between Record Matching and Data
                 Repairing",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        may,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567657",
  ISSN =         "1936-1955",
  bibdate =      "Tue May 27 16:54:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Central to a data cleaning system are record matching
                 and data repairing. Matching aims to identify tuples
                 that refer to the same real-world object, and repairing
                 is to make a database consistent by fixing errors in
                 the data by using integrity constraints. These are
                 typically treated as separate processes in current data
                 cleaning systems, based on heuristic solutions. This
                 article studies a new problem in connection with data
                 cleaning, namely the interaction between record
                 matching and data repairing. We show that repairing can
                 effectively help us identify matches, and vice versa.
                 To capture the interaction, we provide a uniform
                 framework that seamlessly unifies repairing and
                 matching operations to clean a database based on
                 integrity constraints, matching rules, and master data.
                 We give a full treatment of fundamental problems
                 associated with data cleaning via matching and
                 repairing, including the static analyses of constraints
                 and rules taken together, and the complexity,
                 termination, and determinism analyses of data cleaning.
                 We show that these problems are hard, ranging from
                 NP-complete or coNP-complete, to PSPACE-complete.
                 Nevertheless, we propose efficient algorithms to clean
                 data via both matching and repairing. The algorithms
                 find deterministic fixes and reliable fixes based on
                 confidence and entropy analyses, respectively, which
                 are more accurate than fixes generated by heuristics.
                 Heuristic fixes are produced only when deterministic or
                 reliable fixes are unavailable. We experimentally
                 verify that our techniques can significantly improve
                 the accuracy of record matching and data repairing that
                 are taken as separate processes, using real-life and
                 synthetic data.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Martin:2014:MAE,
  author =       "Nigel Martin and Alexandra Poulovassilis and Jianing
                 Wang",
  title =        "A Methodology and Architecture Embedding Quality
                 Assessment in Data Integration",
  journal =      j-JDIQ,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        may,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2567663",
  ISSN =         "1936-1955",
  bibdate =      "Tue May 27 16:54:25 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data integration aims to combine heterogeneous
                 information sources and to provide interfaces for
                 accessing the integrated resource. Data integration is
                 a collaborative task that may involve many people with
                 different degrees of experience, knowledge of the
                 application domain, and expectations relating to the
                 integrated resource. It may be difficult to determine
                 and control the quality of an integrated resource due
                 to these factors. In this article, we propose a data
                 integration methodology that has embedded within it
                 iterative quality assessment and improvement of the
                 integrated resource. We also propose an architecture
                 for the realisation of this methodology. The quality
                 assessment is based on an ontology representation of
                 different users' quality requirements and of the main
                 elements of the integrated resource. We use description
                 logic as the formal basis for reasoning about users'
                 quality requirements and for validating that an
                 integrated resource satisfies these requirements. We
                 define quality factors and associated metrics which
                 enable the quality of alternative global schemas for an
                 integrated resource to be assessed quantitatively, and
                 hence the improvement which results from the refinement
                 of a global schema following our methodology to be
                 measured. We evaluate our approach through a
                 large-scale real-life case study in biological data
                 integration in which an integrated resource is
                 constructed from three autonomous proteomics data
                 sources.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Naumann:2014:E,
  author =       "Felix Naumann",
  title =        "Editorial",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "1--2",
  pages =        "1:1--1:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2648781",
  ISSN =         "1936-1955",
  bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Talburt:2014:IQR,
  author =       "John Talburt and Therese L. Williams and Thomas C.
                 Redman and David Becker",
  title =        "Information quality research challenge: Predicting and
                 quantifying the impact of social issues on information
                 quality programs",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "1--2",
  pages =        "2:1--2:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629603",
  ISSN =         "1936-1955",
  bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Rahm:2014:DPC,
  author =       "Erhard Rahm",
  title =        "Discovering product counterfeits in online shops: a
                 big data integration challenge",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "1--2",
  pages =        "3:1--3:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629605",
  ISSN =         "1936-1955",
  bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Christen:2014:CPP,
  author =       "Peter Christen and Dinusha Vatsalan and Vassilios S.
                 Verykios",
  title =        "Challenges for privacy preservation in data
                 integration",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "1--2",
  pages =        "4:1--4:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629604",
  ISSN =         "1936-1955",
  bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Techniques for integrating data from diverse sources
                 have attracted significant interest in recent years.
                 Much of today's data collected by businesses and
                 governments are about people, and integrating such data
                 across organizations can raise privacy concerns.
                 Various techniques that preserve privacy during data
                 integration have been developed, but several challenges
                 persist that need to be solved before such techniques
                 become useful in practical applications. We elaborate
                 on these challenges and discuss research directions.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Vogel:2014:RGA,
  author =       "Tobias Vogel and Arvid Heise and Uwe Draisbach and
                 Dustin Lange and Felix Naumann",
  title =        "Reach for gold: an annealing standard to evaluate
                 duplicate detection results",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "1--2",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629687",
  ISSN =         "1936-1955",
  bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Duplicates in a database are one of the prime causes
                 of poor data quality and are at the same time among the
                 most difficult data quality problems to alleviate. To
                 detect and remove such duplicates, many commercial and
                 academic products and methods have been developed. The
                 evaluation of such systems is usually in need of
                 pre-classified results. Such gold standards are often
                 expensive to come by (much manual classification is
                 necessary), not representative (too small or too
                 synthetic), and proprietary and thus preclude
                 repetition (company-internal data). This lament has
                 been uttered in many papers and even more paper
                 reviews. The proposed annealing standard is a
                 structured set of duplicate detection results, some of
                 which are manually verified and some of which are
                 merely validated by many classifiers. As more and more
                 classifiers are evaluated against the annealing
                 standard, more and more results are verified and
                 validation becomes more and more confident. We formally
                 define gold, silver, and the annealing standard and
                 their maintenance. Experiments show how quickly an
                 annealing standard converges to a gold standard.
                 Finally, we provide an annealing standard for 750,000
                 CDs to the duplicate detection community.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2014:CRD,
  author =       "Wenfei Fan and Floris Geerts and Nan Tang and Wenyuan
                 Yu",
  title =        "Conflict resolution with data currency and
                 consistency",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "1--2",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2631923",
  ISSN =         "1936-1955",
  bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This article introduces a new approach for conflict
                 resolution: given a set of tuples pertaining to the
                 same entity, it identifies a single tuple in which each
                 attribute has the latest and consistent value in the
                 set. This problem is important in data integration,
                 data cleaning, and query answering. It is, however,
                 challenging since in practice, reliable time stamps are
                 often absent, among other things. We propose a model
                 for conflict resolution by specifying data currency in
                 terms of partial currency orders and currency
                 constraints and by enforcing data consistency with
                 constant conditional functional dependencies. We show
                 that identifying data currency orders helps us repair
                 inconsistent data, and vice versa. We investigate a
                 number of fundamental problems associated with conflict
                 resolution and establish their complexity. In addition,
                 we introduce a framework and develop algorithms for
                 conflict resolution by integrating data currency and
                 consistency inferences into a single process and by
                 interacting with users. We experimentally verify the
                 accuracy and efficiency of our methods using real-life
                 and synthetic data.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Glowalla:2014:PDD,
  author =       "Paul Glowalla and Ali Sunyaev",
  title =        "Process-driven data quality management: a critical
                 review on the application of process modeling
                 languages",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "1--2",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2629568",
  ISSN =         "1936-1955",
  bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data quality is critical to organizational success. In
                 order to improve and sustain data quality in the long
                 term, process-driven data quality management (PDDQM)
                 seeks to redesign processes that create or modify data.
                 Consequently, process modeling is mandatory for PDDQM.
                 Current research examines process modeling languages
                 with respect to representational capabilities. However,
                 there is a gap, since process modeling languages for
                 PDDQM are not considered. We address this research gap
                 by providing a synthesis of the varying applications of
                 process modeling languages for PDDQM. We conducted a
                 keyword-based literature review in conferences as well
                 as 74 high-ranked information systems and computer
                 science journals, reviewing 1,555 articles from 1995
                 onwards. For practitioners, it is possible to integrate
                 the quality perspective within broadly applied process
                 models. For further research, we derive
                 representational requirements for PDDQM that should be
                 integrated within existing process modeling languages.
                 However, there is a need for further representational
                 analysis to examine the adequacy of upcoming process
                 modeling languages. New or enhanced process modeling
                 languages may substitute for PDDQM-specific process
                 modeling languages and facilitate development of a
                 broadly applicable and accepted process modeling
                 language for PDDQM.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Belhajjame:2015:E,
  author =       "Khalid Belhajjame and Domenico Beneventano and Laure
                 Berti-Equille and James Cheney and Victor Cuevas and
                 Tom {De Nies} and Helena Galhardas and Ashish Gehani
                 and Boris Glavic and Paul Groth and Olaf Hartig and
                 Scott Jensen and Andrea Maurino and Gianni Mecca and
                 Ren{\'e}e Miller and Luc Moreau and Mourad Ouzzani and
                 Jaehong Park",
  title =        "Editorial",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "3",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2692312",
  ISSN =         "1936-1955",
  bibdate =      "Tue Mar 3 14:42:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cheah:2015:PQA,
  author =       "You-Wei Cheah and Beth Plale",
  title =        "Provenance Quality Assessment Methodology and
                 Framework",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "3",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2665069",
  ISSN =         "1936-1955",
  bibdate =      "Tue Mar 3 14:42:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data provenance, a form of metadata describing the
                 life cycle of a data product, is crucial in the sharing
                 of research data. Research data, when shared over
                 decades, requires recipients to make a determination of
                 both use and trust. That is, can they use the data?
                 More importantly, can they trust it? Knowing the data
                 are of high quality is one factor to establishing
                 fitness for use and trust. Provenance can be used to
                 assert the quality of the data, but the quality of the
                 provenance must be known as well. We propose a
                 framework for assessing the quality of data provenance.
                 We identify quality issues in data provenance,
                 establish key quality dimensions, and define a
                 framework of analysis. We apply the analysis framework
                 to synthetic and real-world provenance.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Herschel:2015:HAA,
  author =       "Melanie Herschel",
  title =        "A Hybrid Approach to Answering Why-Not Questions on
                 Relational Query Results",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "3",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2665070",
  ISSN =         "1936-1955",
  bibdate =      "Tue Mar 3 14:42:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In analyzing and debugging data transformations, or
                 more specifically relational queries, a subproblem is
                 to understand why some data are not part of the query
                 result. This problem has recently been addressed from
                 different perspectives for various fragments of
                 relational queries. The different perspectives yield
                 different yet complementary explanations of such
                 missing answers. This article first aims at unifying
                 the different approaches by defining a new type of
                 explanation, called hybrid explanation, that
                 encompasses the variety of previously defined types of
                 explanations. This solution goes beyond simply forming
                 the union of explanations produced by different
                 algorithms and is shown to be able to explain a larger
                 set of missing answers. Second, we present Conseil, an
                 algorithm to generate hybrid explanations. Conseil is
                 also the first algorithm to handle nonmonotonic
                 queries. Experiments on efficiency and explanation
                 quality show that Conseil is comparable and even
                 outperforms previous algorithms. This article extends a
                 previous short conference paper by providing proofs,
                 additional theorems, and a detailed discussion of each
                 step of the Conseil algorithm. It also significantly
                 extends the experimental evaluation on efficiency and
                 explanation quality.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chong:2015:SID,
  author =       "Stephen Chong and Christian Skalka and Jeffrey A.
                 Vaughan",
  title =        "Self-Identifying Data for Fair Use",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "3",
  pages =        "11:1--11:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687422",
  ISSN =         "1936-1955",
  bibdate =      "Tue Mar 3 14:42:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Public-use earth science datasets are a useful
                 resource with the unfortunate feature that their
                 provenance is easily disconnected from their content.
                 ``Fair-use policies'' typically associated with these
                 datasets require appropriate attribution of providers
                 by users, but sound and complete attribution is
                 difficult if provenance information is lost. To address
                 this, we introduce a technique to directly associate
                 provenance information with sensor datasets. Our
                 technique is similar to traditional watermarking but is
                 intended for application to unstructured time-series
                 datasets. Our approach is potentially imperceptible
                 given sufficient margins of error in datasets and is
                 robust to a number of benign but likely transformations
                 including truncation, rounding, bit-flipping, sampling,
                 and reordering. We provide algorithms for both one-bit
                 and blind mark checking and show how our system can be
                 adapted to various data representation types. Our
                 algorithms are probabilistic in nature and are
                 characterized by both combinatorial and empirical
                 analyses. Mark embedding can be applied at any point in
                 the data life cycle, allowing adaptation of our scheme
                 to social or scientific concerns.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Baillie:2015:QPA,
  author =       "Chris Baillie and Peter Edwards and Edoardo Pignotti",
  title =        "{QUAL}: a Provenance-Aware Quality Model",
  journal =      j-JDIQ,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        feb,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700413",
  ISSN =         "1936-1955",
  bibdate =      "Tue Mar 3 14:42:39 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In this article, we present a model for quality
                 assessment over linked data. This model has been
                 designed to align with emerging standards for
                 provenance on the Web to enable agents to reason about
                 data provenance when performing quality assessment. The
                 model also enables quality assessment provenance to be
                 represented, thus allowing agents to make decisions
                 about reuse of existing assessments. We also discuss
                 the development of an OWL ontology as part of a
                 software framework to support reasoning about data
                 quality and assessment reuse. Finally, we evaluate this
                 framework using two real-world case studies derived
                 from transport and invasive-species monitoring
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Attenberg:2015:BMC,
  author =       "Joshua Attenberg and Panos Ipeirotis and Foster
                 Provost",
  title =        "Beat the Machine: Challenging Humans to Find a
                 Predictive Model's ``Unknown Unknowns''",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2700832",
  ISSN =         "1936-1955",
  bibdate =      "Thu Mar 5 07:53:50 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "We present techniques for gathering data that expose
                 errors of automatic predictive models. In certain
                 common settings, traditional methods for evaluating
                 predictive models tend to miss rare but important
                 errors --- most importantly, cases for which the model
                 is confident of its prediction (but wrong). In this
                 article, we present a system that, in a game-like
                 setting, asks humans to identify cases that will cause
                 the predictive model-based system to fail. Such
                 techniques are valuable in discovering problematic
                 cases that may not reveal themselves during the normal
                 operation of the system and may include cases that are
                 rare but catastrophic. We describe the design of the
                 system, including design iterations that did not quite
                 work. In particular, the system incentivizes humans to
                 provide examples that are difficult for the model to
                 handle by providing a reward proportional to the
                 magnitude of the predictive model's error. The humans
                 are asked to ``Beat the Machine'' and find cases where
                 the automatic model (``the Machine'') is wrong.
                 Experiments show that the humans using Beat the Machine
                 identify more errors than do traditional techniques for
                 discovering errors in predictive models, and, indeed,
                 they identify many more errors where the machine is
                 (wrongly) confident it is correct. Furthermore, those
                 cases the humans identify seem to be not simply
                 outliers, but coherent areas missed completely by the
                 model. Beat the Machine identifies the ``unknown
                 unknowns.'' Beat the Machine has been deployed at an
                 industrial scale by several companies. The main impact
                 has been that firms are changing their perspective on
                 and practice of evaluating predictive models. ``There
                 are known knowns. These are things we know that we
                 know. There are known unknowns. That is to say, there
                 are things that we know we don't know. But there are
                 also unknown unknowns. There are things we don't know
                 we don't know.'' --- Donald Rumsfeld",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Alonso:2015:CLQ,
  author =       "Omar Alonso",
  title =        "Challenges with Label Quality for Supervised
                 Learning",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2724721",
  ISSN =         "1936-1955",
  bibdate =      "Thu Mar 5 07:53:50 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Organizations that develop and use technologies around
                 information retrieval, machine learning, recommender
                 systems, and natural language processing depend on
                 labels for engineering and experimentation. These
                 labels, usually gathered via human computation, are
                 used in machine-learned models for prediction and
                 evaluation purposes. In such scenarios, collecting
                 high-quality labels is a very important part of the
                 overall process. We elaborate on these challenges and
                 discuss research directions.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lukyanenko:2015:IQR,
  author =       "Roman Lukyanenko and Jeffrey Parsons",
  title =        "Information Quality Research Challenge: Adapting
                 Information Quality Principles to User-Generated
                 Content",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2723166",
  ISSN =         "1936-1955",
  bibdate =      "Thu Mar 5 07:53:50 MST 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Naumann:2015:E,
  author =       "Felix Naumann",
  title =        "Editorial",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "4:1--4:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2762716",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Varshney:2015:DCD,
  author =       "Kush R. Varshney and Dennis Wei and Karthikeyan
                 Natesan Ramamurthy and Aleksandra Mojsilovi{\'c}",
  title =        "Data Challenges in Disease Response: The 2014 {Ebola}
                 Outbreak and Beyond",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "5:1--5:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742550",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Barnaghi:2015:CQD,
  author =       "Payam Barnaghi and Maria Bermudez-Edo and Ralf
                 T{\"o}njes",
  title =        "Challenges for Quality of Data in Smart Cities",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "6:1--6:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2747881",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Grant:2015:CLT,
  author =       "Christan Earl Grant and Daisy Zhe Wang",
  title =        "A Challenge for Long-Term Knowledge Base Maintenance",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "7:1--7:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2738044",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sha:2015:DQC,
  author =       "Kewei Sha and Sherali Zeadally",
  title =        "Data Quality Challenges in Cyber-Physical Systems",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "8:1--8:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2740965",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gennari:2015:CQT,
  author =       "Rosella Gennari and Sara Tonelli and Pierpaolo
                 Vittorini",
  title =        "Challenges in Quality of Temporal Data --- Starting
                 with Gold Standards",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2736699",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Basole:2015:DAC,
  author =       "Rahul C. Basole and Mark L. Braunstein and Jimeng
                 Sun",
  title =        "Data and Analytics Challenges for a Learning
                 Healthcare System",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2755489",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Todoran:2015:MEI,
  author =       "Ion-George Todoran and Laurent Lecornu and Ali
                 Khenchaf and Jean-Marc {Le Caillec}",
  title =        "A Methodology to Evaluate Important Dimensions of
                 Information Quality in Systems",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2744205",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Assessing the quality of the information proposed by
                 an information system has become one of the major
                 research topics in the last two decades. A quick
                 literature survey shows that a significant number of
                 information quality frameworks are proposed in
                 different domains of application: management
                 information systems, web information systems,
                 information fusion systems, and so forth.
                 Unfortunately, they do not provide a feasible
                 methodology that is both simple and intuitive to be
                 implemented in practice. In order to address this need,
                 we present in this article a new information quality
                 methodology. Our methodology makes use of existing
                 frameworks and proposes a three-step process capable of
                 tracking the quality changes through the system. In the
                 first step and as a novelty compared to existing
                 studies, we propose decomposing the information system
                 into its elementary modules. Having access to each
                 module allows us to locally define the information
                 quality. Then, in the second step, we model each
                 processing module by a quality transfer function,
                 capturing the module's influence over the information
                 quality. In the third step, we make use of the previous
                 two steps in order to estimate the quality of the
                 entire information system. Thus, our methodology allows
                 informing the end-user on both output quality and local
                 quality. The proof of concept of our methodology has
                 been carried out considering two applications: an
                 automatic target recognition system and a diagnosis
                 coding support system.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zarraga-Rodriguez:2015:EID,
  author =       "Marta Zarraga-Rodriguez and M. Jesus Alvarez",
  title =        "Experience: Information Dimensions Affecting
                 Employees' Perceptions Towards Being Well Informed",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "2--3",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2774223",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Information is a strategic company resource, but there
                 is no consensus in the literature regarding the set of
                 dimensions to be considered when measuring the quality
                 of the information. Most measures of information
                 quality depend on user perception. Using multiple
                 correlation analysis, we obtain a model that allows us
                 to explain how information quality dimensions influence
                 information consumers' overall feeling of being well
                 informed. A set of dimensions that any measure of
                 information quality should at least include is
                 proposed. This exploratory study reports the results of
                 a research survey among managers of companies committed
                 to quality management within the framework of a Total
                 Quality Management (TQM) model, which is an
                 information-intensive management model.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bartoli:2015:DQC,
  author =       "Alberto Bartoli and Andrea {De Lorenzo} and Eric
                 Medvet and Fabiano Tarlao",
  title =        "Data Quality Challenge: Toward a Tool for String
                 Processing by Examples",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "4",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786983",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ahlers:2015:DCQ,
  author =       "Dirk Ahlers and John Krogstie",
  title =        "Document and Corpus Quality Challenges for Knowledge
                 Management in Engineering Enterprises",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "4",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818379",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ramadan:2015:DSN,
  author =       "Banda Ramadan and Peter Christen and Huizhi Liang and
                 Ross W. Gayler",
  title =        "Dynamic Sorted Neighborhood Indexing for Real-Time
                 Entity Resolution",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "4",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2816821",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Real-time Entity Resolution (ER) is the process of
                 matching query records in subsecond time with records
                 in a database that represent the same real-world
                 entity. Indexing techniques are generally used to
                 efficiently extract a set of candidate records from the
                 database that are similar to a query record, and that
                 are to be compared with the query record in more
                 detail. The sorted neighborhood indexing method, which
                 sorts a database and compares records within a sliding
                 window, has been successfully used for ER of large
                 static databases. However, because it is based on
                 static sorted arrays and is designed for batch ER that
                 resolves all records in a database rather than
                 resolving those relating to a single query record, this
                 technique is not suitable for real-time ER on dynamic
                 databases that are constantly updated. We propose a
                 tree-based technique that facilitates dynamic indexing
                 based on the sorted neighborhood method, which can be
                 used for real-time ER, and investigate both static and
                 adaptive window approaches. We propose an approach to
                 reduce query matching times by precalculating the
                 similarities between attribute values stored in
                 neighboring tree nodes. We also propose a multitree
                 solution where different sorting keys are used to
                 reduce the effects of errors and variations in
                 attribute values on matching quality by building
                 several distinct index trees. We experimentally
                 evaluate our proposed techniques on large real
                 datasets, as well as on synthetic data with different
                 data quality characteristics. Our results show that as
                 the index grows, no appreciable increase occurs in both
                 record insertion and query times, and that using
                 multiple trees gives noticeable improvements on
                 matching quality with only a small increase in query
                 time. Compared to earlier indexing techniques for
                 real-time ER, our approach achieves significantly
                 reduced indexing and query matching times while
                 maintaining high matching accuracy.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Coletti:2015:DCH,
  author =       "Paolo Coletti and Maurizio Murgia",
  title =        "Design and Construction of a Historical Financial
                 Database of the {Italian} Stock Market 1973--2011",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "4",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2822898",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This article presents the technical aspects of
                 designing and building a historical database of the
                 Italian Stock Market. The database contains daily
                 market data from 1973 to 2011 and is constructed by
                 merging two main digital sources and several other
                 hand-collected data sources. We analyzed and developed
                 semiautomatic tools to deal with problems related to
                 time-series matchings, quality of data, and numerical
                 errors. We also developed a concatenation structure to
                 allow the handling of company name changes, mergers,
                 and spin-offs without artificially altering numerical
                 series. At the same time, we maintained the
                 transparency of the historical information on each
                 individual company listed. Thanks to the overlapping of
                 digital and hand-collected data, the completed database
                 has a very high level of detail and accuracy. The
                 dataset is particularly suited for any empirical
                 research in financial economics and for more
                 practically oriented numerical applications and
                 forecasting simulations.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Missier:2015:CSI,
  author =       "Paolo Missier",
  title =        "Corrigendum to the Special Issue Editorial in {JDIQ}
                 Volume 5, Issue 3",
  journal =      j-JDIQ,
  volume =       "6",
  number =       "4",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2821019",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chapman:2016:CQD,
  author =       "Adriane P. Chapman and Arnon Rosenthal and Len
                 Seligman",
  title =        "The Challenge of ``Quick and Dirty'' Information
                 Quality",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "1:1--1:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2834123",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Millar:2016:DQC,
  author =       "Jeremy R. Millar and Douglas D. Hodson and Gilbert L.
                 Peterson and Darryl K. Ahner",
  title =        "Data Quality Challenges in Distributed
                 Live-Virtual-Constructive Test Environments",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "2:1--2:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2850420",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lukyanenko:2016:IQR,
  author =       "Roman Lukyanenko",
  title =        "Information Quality Research Challenge: Information
                 Quality in the Age of Ubiquitous Digital
                 Intermediation",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "3:1--3:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2856038",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "As information technology becomes an integral part of
                 daily life, increasingly, people understand the world
                 around them by turning to digital sources as opposed to
                 directly interacting with objects in the physical
                 world. This has ushered in the age of Ubiquitous
                 Digital Intermediation (UDI). With the explosion of
                 UDI, the scope of Information Quality (IQ) research is
                 due to expand dramatically as the challenge becomes to
                 capture the wealth and nuances of human experience.
                 This article presents three key changes to the IQ
                 landscape brought about by UDI, including expansion of
                 the scope of traditional IQ dimensions, digital to
                 physical mapping challenge, and the increased need to
                 manage content authenticity. UDI generates many novel
                 questions and opportunities for the IQ research
                 community.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zhu:2016:DSC,
  author =       "Hongwei Zhu and Yang W. Lee and Arnon S. Rosenthal",
  title =        "Data Standards Challenges for Interoperable and
                 Quality Data",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2903723",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ulbricht:2016:CCD,
  author =       "Robert Ulbricht and Hilko Donker and Claudio Hartmann
                 and Martin Hahmann and Wolfgang Lehner",
  title =        "Challenges for Context-Driven Time Series
                 Forecasting",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2896822",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Predicting time series is a crucial task for
                 organizations, since decisions are often based on
                 uncertain information. Many forecasting models are
                 designed from a generic statistical point of view.
                 However, each real-world application requires
                 domain-specific adaptations to obtain high-quality
                 results. All such specifics are summarized by the term
                 of context. In contrast to current approaches, we want
                 to integrate context as the primary driver in the
                 forecasting process. We introduce context-driven time
                 series forecasting focusing on two exemplary domains:
                 renewable energy and sparse sales data. In view of
                 this, we discuss the challenge of context integration
                 in the individual process steps.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ceolin:2016:CUR,
  author =       "Davide Ceolin and Paul Groth and Valentina Maccatrozzo
                 and Wan Fokkink and Willem Robert {Van Hage} and
                 Archana Nottamkandath",
  title =        "Combining User Reputation and Provenance Analysis for
                 Trust Assessment",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2818382",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Trust is a broad concept that in many systems is often
                 reduced to user reputation alone. However, user
                 reputation is just one way to determine trust. The
                 estimation of trust can be tackled from other
                 perspectives as well, including by looking at
                 provenance. Here, we present a complete pipeline for
                 estimating the trustworthiness of artifacts given their
                 provenance and a set of sample evaluations. The
                 pipeline is composed of a series of algorithms for (1)
                 extracting relevant provenance features, (2) generating
                 stereotypes of user behavior from provenance features,
                 (3) estimating the reputation of both stereotypes and
                 users, (4) using a combination of user and stereotype
                 reputations to estimate the trustworthiness of
                 artifacts, and (5) selecting sets of artifacts to
                 trust. These algorithms rely on the W3C PROV
                 recommendations for provenance and on evidential
                 reasoning by means of subjective logic. We evaluate the
                 pipeline over two tagging datasets: tags and
                 evaluations from the Netherlands Institute for Sound
                 and Vision's Waisda? video tagging platform, as well as
                 crowdsourced annotations from the Steve.Museum project.
                 The approach achieves up to 85\% precision when
                 predicting tag trustworthiness. Perhaps more
                 importantly, the pipeline provides satisfactory results
                 using relatively little evidence through the use of
                 provenance.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Christen:2016:ADA,
  author =       "Peter Christen and Ross W. Gayler and Khoi-Nguyen Tran
                 and Jeffrey Fisher and Dinusha Vatsalan",
  title =        "Automatic Discovery of Abnormal Values in Large
                 Textual Databases",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2889311",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Textual databases are ubiquitous in many application
                 domains. Examples of textual data range from names and
                 addresses of customers to social media posts and
                 bibliographic records. With online services,
                 individuals are increasingly required to enter their
                 personal details, for example, when purchasing products
                 online or registering for government services, while
                 many social network and e-commerce sites allow users to
                 post short comments. Many online sites leave open the
                 possibility for people to enter unintended or malicious
                 abnormal values, such as names with errors, bogus
                 values, profane comments, or random character
                 sequences. In other applications, such as online
                 bibliographic databases or comparative online shopping
                 sites, databases are increasingly populated in (semi-)
                 automatic ways through Web crawls. This practice can
                 result in low quality data being added automatically
                 into a database. In this article, we develop three
                 techniques to automatically discover abnormal
                 (unexpected or unusual) values in large textual
                 databases. Following recent work in categorical outlier
                 detection, our assumption is that ``normal'' values are
                 those that occur frequently in a database, while an
                 individual abnormal value is rare. Our techniques are
                 unsupervised and address the challenge of discovering
                 abnormal values as an outlier detection problem. Our
                 first technique is a basic but efficient q-gram set
                 based technique, the second is based on a probabilistic
                 language model, and the third employs morphological
                 word features to train a one-class support vector
                 machine classifier. Our aim is to investigate and
                 develop techniques that are fast, efficient, and
                 automatic. The output of our techniques can help in the
                 development of rule-based data cleaning and information
                 extraction systems, or be used as training data for
                 further supervised data cleaning procedures. We
                 evaluate our techniques on four large real-world
                 datasets from different domains: two US voter
                 registration databases containing personal details, the
                 2013 KDD Cup dataset of bibliographic records, and the
                 SNAP Memetracker dataset of phrases from social
                 networking sites. Our results show that our techniques
                 can efficiently and automatically discover abnormal
                 textual values, allowing an organization to conduct
                 efficient data exploration, and improve the quality of
                 their textual databases without the need of requiring
                 explicit training data.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Aiken:2016:ESD,
  author =       "Peter Aiken",
  title =        "{EXPERIENCE}: Succeeding at Data Management---{BigCo}
                 Attempts to Leverage Data",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "1--2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2893482",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In a manner similar to most organizations, BigCompany
                 (BigCo) was determined to benefit strategically from
                 its widely recognized and vast quantities of data.
                 (U.S. government agencies make regular visits to BigCo
                 to learn from its experiences in this area.) When faced
                 with an explosion in data volume, increases in
                 complexity, and a need to respond to changing
                 conditions, BigCo struggled to respond using a
                 traditional, information technology (IT) project-based
                 approach to address these challenges. As BigCo was not
                 data knowledgeable, it did not realize that traditional
                 approaches could not work. Two full years into the
                 initiative, BigCo was far from achieving its initial
                 goals. How much more time, money, and effort would be
                 required before results were achieved? Moreover, could
                 the results be achieved in time to support a larger,
                 critical, technology-driven challenge that also
                 depended on solving the data challenges? While these
                 questions remain unaddressed, these considerations
                 increase our collective understanding of data assets as
                 separate from IT projects. Only by reconceiving data as
                 a strategic asset can organizations begin to address
                 these new challenges. Transformation to a data-driven
                 culture requires far more than technology, which
                 remains just one of three required ``stool legs''
                 (people and process being the other two). Seven
                 prerequisites to effectively leveraging data are
                 necessary, but insufficient awareness exists in most
                 organizations---hence, the widespread misfires in these
                 areas, especially when attempting to implement the
                 so-called big data initiatives. Refocusing on
                 foundational data management practices is required for
                 all organizations, regardless of their organizational
                 or data strategies.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chiang:2016:UDC,
  author =       "Fei Chiang and Siddharth Sitaramachandran",
  title =        "Unifying Data and Constraint Repairs",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "3",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2883616",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Integrity constraints play an important role in data
                 design. However, in an operational database, they may
                 not be enforced for many reasons. Hence, over time,
                 data may become inconsistent with respect to the
                 constraints. To manage this, several approaches have
                 proposed techniques to repair the data by finding
                 minimal or lowest cost changes to the data that make it
                 consistent with the constraints. Such techniques are
                 appropriate for applications where only the data
                 changes, but schemas and their constraints remain
                 fixed. In many modern applications, however,
                 constraints may evolve over time as application or
                 business rules change, as data are integrated with new
                 data sources or as the underlying semantics of the data
                 evolves. In such settings, when an inconsistency
                 occurs, it is no longer clear if there is an error in
                 the data (and the data should be repaired) or if the
                 constraints have evolved (and the constraints should be
                 repaired). In this work, we present a novel unified
                 cost model that allows data and constraint repairs to
                 be compared on an equal footing. We consider repairs
                 over a database that is inconsistent with respect to a
                 set of rules, modeled as functional dependencies (FDs).
                 FDs are the most common type of constraint and are
                 known to play an important role in maintaining data
                 quality. We propose modifications to the data and to
                 the FDs such that the data and the constraints are
                 better aligned. We evaluate the quality and scalability
                 of our repair algorithms over synthetic and real
                 datasets. The results show that our repair algorithms
                 not only scale well for large datasets but also are
                 able to accurately capture and correct inconsistencies
                 and accurately decide when a data repair versus a
                 constraint repair is best.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Maltese:2016:SAC,
  author =       "Vincenzo Maltese and Fausto Giunchiglia",
  title =        "Search and Analytics Challenges in Digital Libraries
                 and Archives",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "3",
  pages =        "10:1--10:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2939377",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gelernter:2016:COE,
  author =       "J. Gelernter and J. Jha",
  title =        "Challenges in Ontology Evaluation",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "3",
  pages =        "11:1--11:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2935751",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Berti-Equille:2016:VBD,
  author =       "Laure Berti-Equille and Mouhamadou Lamine Ba",
  title =        "Veracity of Big Data: Challenges of Cross-Modal Truth
                 Discovery",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "3",
  pages =        "12:1--12:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2935753",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v7 n3 (September 2016), article 13; includes abstract.
@Article{Haralabopoulos:2016:CIC,
  author =       "Giannis Haralabopoulos and Ioannis Anagnostopoulos and
                 Sherali Zeadally",
  title =        "The Challenge of Improving Credibility of
                 User-Generated Content in Online Social Networks",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "3",
  pages =        "13:1--13:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2899003",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In every environment of information exchange,
                 Information Quality (IQ) is considered one of the most
                 important issues. Studies in Online Social Networks
                 (OSNs) analyze a number of related subjects that span
                 both theoretical and practical aspects, from data
                 quality identification and simple attribute
                 classification to quality assessment models for various
                 social environments. Among several factors that affect
                 information quality in online social networks is the
                 credibility of user-generated content. To address this
                 challenge, some proposed solutions include
                 community-based evaluation and labeling of
                 user-generated content in terms of accuracy, clarity,
                 and timeliness, along with well-established real-time
                 data mining techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v7 n3 (September 2016), article 14; includes abstract.
@Article{DUrso:2016:EGD,
  author =       "Ciro D'Urso",
  title =        "{EXPERIENCE}: Glitches in Databases, How to Ensure
                 Data Quality by Outlier Detection Techniques",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2950109",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Enterprise's archives are inevitably affected by the
                 presence of data quality problems (also called
                 glitches). This article proposes the application of a
                 new method to analyze the quality of datasets stored in
                 the tables of a database, with no knowledge of the
                 semantics of the data and without the need to define
                 repositories of rules. The proposed method is based on
                 proper revisions of different approaches for outlier
                 detection that are combined to boost overall
                 performance and accuracy. A novel transformation
                 algorithm is conceived that treats the items in
                 database tables as data points in real coordinate space
                 of n dimensions, so that fields containing dates and
                 fields containing text are processed to calculate
                 distances between those data points. The implementation
                 of an iterative approach ensures that global and local
                 outliers are discovered even if they are subject,
                 primarily in datasets with multiple outliers or
                 clusters of outliers, to masking and swamping effects.
                 The application of the method to a set of archives,
                 some of which have been studied extensively in the
                 literature, provides very promising experimental
                 results and outperforms the application of a single
                 other technique. Finally, a list of future research
                 directions is highlighted.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n2 (February 2017), article 6.
@Article{Labouseur:2017:IDD,
  author =       "Alan G. Labouseur and Carolyn C. Matheus",
  title =        "An Introduction to Dynamic Data Quality Challenges",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "2",
  pages =        "6:1--6:??",
  month =        feb,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2998575",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:27 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n2 (February 2017), article 7.
@Article{Becker:2017:CTD,
  author =       "Christoph Becker and Kresimir Duretec and Andreas
                 Rauber",
  title =        "The Challenge of Test Data Quality in Data
                 Processing",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "2",
  pages =        "7:1--7:??",
  month =        feb,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012004",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:27 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n2 (February 2017), article 8.
@Article{Ferro:2017:RCI,
  author =       "Nicola Ferro",
  title =        "Reproducibility Challenges in Information Retrieval
                 Evaluation",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "2",
  pages =        "8:1--8:??",
  month =        feb,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3020206",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:27 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n2 (February 2017), article 9; includes abstract.
%%% NOTE: restored em dashes (---) in the abstract, which had been
%%% flattened to single hyphens ("quality-content-toward",
%%% "have-and have not-been") during extraction.
@Article{Shankaranarayanan:2017:CCE,
  author =       "G. Shankaranarayanan and Roger Blake",
  title =        "From Content to Context: The Evolution and Growth of
                 Data Quality Research",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996198",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:27 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Research in data and information quality has made
                 significant strides over the last 20 years. It has
                 become a unified body of knowledge incorporating
                 techniques, methods, and applications from a variety of
                 disciplines including information systems, computer
                 science, operations management, organizational
                 behavior, psychology, and statistics. With
                 organizations viewing ``Big Data'', social media data,
                 data-driven decision-making, and analytics as critical,
                 data quality has never been more important. We believe
                 that data quality research is reaching the threshold of
                 significant growth and a metamorphosis from focusing on
                 measuring and assessing data quality --- content ---
                 toward a focus on usage and context. At this stage, it
                 is vital to understand the identity of this research
                 area in order to recognize its current state and to
                 effectively identify an increasing number of research
                 opportunities within. Using Latent Semantic Analysis
                 (LSA) to analyze the abstracts of 972 peer-reviewed
                 journal and conference articles published over the
                 past 20 years, this article contributes by identifying
                 the core topics and themes that define the identity of
                 data quality research. It further explores their
                 trends over time, pointing to the data quality
                 dimensions that have --- and have not --- been
                 well-studied, and offering insights into topics that
                 may provide significant opportunities in this area.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n2 (February 2017), article 10; includes abstract.
@Article{Goldberg:2017:PIS,
  author =       "Sean Goldberg and Daisy Zhe Wang and Christan Grant",
  title =        "A Probabilistically Integrated System for
                 Crowd-Assisted Text Labeling and Extraction",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012003",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:27 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The amount of text data has been growing exponentially
                 in recent years, giving rise to automatic information
                 extraction methods that store text annotations in a
                 database. The current state-of-the-art structured
                 prediction methods, however, are likely to contain
                 errors and it is important to be able to manage the
                 overall uncertainty of the database. On the other hand,
                 the advent of crowdsourcing has enabled humans to aid
                 machine algorithms at scale. In this article, we
                 introduce pi-CASTLE, a system that optimizes and
                 integrates human and machine computing as applied to a
                 complex structured prediction problem involving
                 Conditional Random Fields (CRFs). We propose strategies
                 grounded in information theory to select a token
                 subset, formulate questions for the crowd to label, and
                 integrate these labelings back into the database using
                 a method of constrained inference. On both a text
                 segmentation task over academic citations and a named
                 entity recognition task over tweets we show an order of
                 magnitude improvement in accuracy gain over baseline
                 methods.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n3--4 (July 2017), article 11.
@Article{Woodall:2017:DRC,
  author =       "Philip Woodall",
  title =        "The Data Repurposing Challenge: New Pressures from
                 Data Analytics",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "3--4",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3022698",
  ISSN =         "1936-1955",
  bibdate =      "Mon Oct 2 09:44:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n3--4 (July 2017), article 12.
@Article{Markovic:2017:CQS,
  author =       "Milan Markovic and Peter Edwards",
  title =        "The Challenge of Quality in Social Computation",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "3--4",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3041762",
  ISSN =         "1936-1955",
  bibdate =      "Mon Oct 2 09:44:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n3--4 (July 2017), article 13; includes abstract.
%%% Also indexed in the companion spell.bib bibliography.
@Article{Al-Hussaini:2017:EIB,
  author =       "Leena Al-Hussaini",
  title =        "Experience: Insights into the Benchmarking Data of
                 {Hunspell} and {Aspell} Spell Checkers",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "3--4",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092700",
  ISSN =         "1936-1955",
  bibdate =      "Mon Oct 2 09:44:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib;
                 http://www.math.utah.edu/pub/tex/bib/spell.bib",
  abstract =     "Hunspell is a morphological spell checker and
                 automatic corrector for Macintosh 10.6 and later
                 versions. Aspell is a general spell checker and
                 automatic corrector for the GNU operating system. In
                 this experience article, we present a benchmarking
                 study of the performance of Hunspell and Aspell. Ginger
                 is a general grammatical spell checker that is used as
                 a baseline to compare the performance of Hunspell and
                 Aspell. A benchmark dataset was carefully selected to
                 be a mixture of different error types at different word
                 length levels. Further, the benchmarking data are from
                 very bad spellers and will challenge any spell checker.
                 The extensive study described in this work will
                 characterize the respective softwares and benchmarking
                 data from multiple perspectives and will consider many
                 error statistics. Overall, Hunspell can correct 415/469
                 words and Aspell can correct 414/469 words. The
                 baseline Ginger can correct 279/469 words. We recommend
                 this dataset as the preferred benchmark dataset for
                 evaluating newly developed ``isolated word'' spell
                 checkers.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n3--4 (July 2017), article 14; includes abstract.
%%% NOTE: corrected title casing "Data flows" -> "Data Flows" to match
%%% the published article title.
@Article{Abdellaoui:2017:QSD,
  author =       "Sabrina Abdellaoui and Fahima Nader and Rachid
                 Chalal",
  title =        "{QDflows}: a System Driven by Knowledge Bases for
                 Designing Quality-Aware Data Flows",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "3--4",
  pages =        "14:1--14:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3064173",
  ISSN =         "1936-1955",
  bibdate =      "Mon Oct 2 09:44:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In the big data era, data integration is becoming
                 increasingly important. It is usually handled by data
                 flows processes that extract, transform, and clean data
                 from several sources, and populate the data integration
                 system (DIS). Designing data flows is facing several
                 challenges. In this article, we deal with data quality
                 issues such as (1) specifying a set of quality rules,
                 (2) enforcing them on the data flow pipeline to detect
                 violations, and (3) producing accurate repairs for the
                 detected violations. We propose QDflows, a system for
                 designing quality-aware data flows that considers the
                 following as input: (1) a high-quality knowledge base
                 (KB) as the global schema of integration, (2) a set of
                 data sources and a set of validated users'
                 requirements, (3) a set of defined mappings between
                 data sources and the KB, and (4) a set of quality rules
                 specified by users. QDflows uses an ontology to design
                 the DIS schema. It offers the ability to define the DIS
                 ontology as a module of the knowledge base, based on
                 validated users' requirements. The DIS ontology model
                 is then extended with multiple types of quality rules
                 specified by users. QDflows extracts and transforms
                 data from sources to populate the DIS. It detects
                 violations of quality rules enforced on the data flows,
                 constructs repair patterns, searches for horizontal and
                 vertical matches in the knowledge base, and performs an
                 automatic repair when possible or generates possible
                 repairs. It interactively involves users to validate
                 the repair process before loading the clean data into
                 the DIS. Using real-life and synthetic datasets, the
                 DBpedia and Yago knowledge bases, we experimentally
                 evaluate the generality, effectiveness, and efficiency
                 of QDflows. We also showcase an interactive tool
                 implementing our system.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n3--4 (July 2017), article 15; includes abstract.
@Article{St-Maurice:2017:ECS,
  author =       "Justin St-Maurice and Catherine Burns",
  title =        "An Exploratory Case Study to Understand Primary Care
                 Users and Their Data Quality Tradeoffs",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "3--4",
  pages =        "15:1--15:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3058750",
  ISSN =         "1936-1955",
  bibdate =      "Mon Oct 2 09:44:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Primary care data is an important part of the evolving
                 healthcare ecosystem. Generally, users in primary care
                 are expected to provide excellent patient care and
                 record high-quality data. In practice, users must
                 balance sets of priorities regarding care and data. The
                 goal of this study was to understand data quality
                 tradeoffs between timeliness, validity, completeness,
                 and use among primary care users. As a case study, data
                 quality measures and metrics are developed through a
                 focus group session with managers. After calculating
                 and extracting measurements of data quality from six
                 years of historic data, each measure was modeled with
                 logit binomial regression to show correlations,
                 characterize tradeoffs, and investigate data quality
                 interactions. Measures and correlations for
                 completeness, use, and timeliness were calculated for
                 196,967 patient encounters. Based on the analysis,
                 there was a positive relationship between validity and
                 completeness, and a negative relationship between
                 timeliness and use. Use of data and reductions in entry
                 delay were positively associated with completeness and
                 validity. Our results suggest that if users are not
                 provided with sufficient time to record data as part of
                 their regular workflow, they will prioritize spending
                 available time with patients. As a measurement of a
                 primary care system's effectiveness, the negative
                 correlation between use and timeliness points to a
                 self-reinforcing relationship that provides users with
                 little external value. In the future, additional data
                 can be generated from comparable organizations to test
                 several new hypotheses about primary care users.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v8 n3--4 (July 2017), article 16; includes abstract.
@Article{Wang:2017:DDR,
  author =       "Jiannan Wang and Nan Tang",
  title =        "Dependable Data Repairing with Fixing Rules",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "3--4",
  pages =        "16:1--16:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3041761",
  ISSN =         "1936-1955",
  bibdate =      "Mon Oct 2 09:44:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "One of the main challenges that data-cleaning systems
                 face is to automatically identify and repair data
                 errors in a dependable manner. Though data dependencies
                 (also known as integrity constraints) have been widely
                 studied to capture errors in data, automated and
                 dependable data repairing on these errors has remained
                 a notoriously difficult problem. In this work, we
                 introduce an automated approach for dependably
                 repairing data errors, based on a novel class of fixing
                 rules. A fixing rule contains an evidence pattern, a
                 set of negative patterns, and a fact value. The heart
                 of fixing rules is deterministic: given a tuple, the
                 evidence pattern and the negative patterns of a fixing
                 rule are combined to precisely capture which attribute
                 is wrong, and the fact indicates how to correct this
                 error. We study several fundamental problems associated
                 with fixing rules and establish their complexity. We
                 develop efficient algorithms to check whether a set of
                 fixing rules are consistent and discuss approaches to
                 resolve inconsistent fixing rules. We also devise
                 efficient algorithms for repairing data errors using
                 fixing rules. Moreover, we discuss approaches on how to
                 generate a large number of fixing rules from examples
                 or available knowledge bases. We experimentally
                 demonstrate that our techniques outperform other
                 automated algorithms in terms of the accuracy of
                 repairing data errors, using both real-life and
                 synthetic data.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v9 n1 (October 2017), article 1; includes abstract.
@Article{Marcheggiani:2017:ELQ,
  author =       "Diego Marcheggiani and Fabrizio Sebastiani",
  title =        "On the Effects of Low-Quality Training Data on
                 Information Extraction from Clinical Reports",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106235",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In the last five years there has been a flurry of work
                 on information extraction from clinical documents, that
                 is, on algorithms capable of extracting, from the
                 informal and unstructured texts that are generated
                 during everyday clinical practice, mentions of concepts
                 relevant to such practice. Many of these research works
                 are about methods based on supervised learning, that
                 is, methods for training an information extraction
                 system from manually annotated examples. While a lot of
                 work has been devoted to devising learning methods that
                 generate more and more accurate information extractors,
                 no work has been devoted to investigating the effect of
                 the quality of training data on the learning process
                 for the clinical domain. Low quality in training data
                 often derives from the fact that the person who has
                 annotated the data is different from the one against
                 whose judgment the automatically annotated data must be
                 evaluated. In this article, we test the impact of such
                 data quality issues on the accuracy of information
                 extraction systems as applied to the clinical domain.
                 We do this by comparing the accuracy deriving from
                 training data annotated by the authoritative coder
                 (i.e., the one who has also annotated the test data and
                 by whose judgment we must abide) with the accuracy
                 deriving from training data annotated by a different
                 coder, equally expert in the subject matter. The
                 results indicate that, although the disagreement
                 between the two coders (as measured on the training
                 set) is substantial, the difference is (surprisingly
                 enough) not always statistically significant. While the
                 dataset used in the present work originated in a
                 clinical context, the issues we study in this work are
                 of more general interest.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v9 n1 (October 2017), article 2; includes abstract.
%%% NOTE: corrected abstract typo "Haffman" -> "Huffman", matching the
%%% two other "Huffman encoding tree" occurrences in the same abstract.
@Article{Basheer:2017:CBQ,
  author =       "Aseel Basheer and Kewei Sha",
  title =        "Cluster-Based Quality-Aware Adaptive Data Compression
                 for Streaming Data",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3122863",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Wireless sensor networks (WSNs) are widely applied in
                 data collection applications. Energy efficiency is one
                 of the most important design goals of WSNs. In this
                 article, we examine the tradeoffs between the energy
                 efficiency and the data quality. First, four attributes
                 used to evaluate data quality are formally defined.
                 Then, we propose a novel data compression algorithm,
                 Quality-Aware Adaptive data Compression (QAAC), to
                 reduce the amount of data communication to save energy.
                 QAAC utilizes an adaptive clustering algorithm to build
                 clusters from dataset; then a code for each cluster is
                 generated and stored in a Huffman encoding tree. The
                 encoding algorithm encodes the original dataset based
                 on the Huffman encoding tree. An improvement algorithm
                 is also designed to reduce the information loss when
                 data are compressed. After the encoded data, the
                 Huffman encoding tree and parameters used in the
                 improvement algorithm have been received at the sink, a
                 decompression algorithm is used to retrieve the
                 approximation of the original dataset. The performance
                 evaluation shows that QAAC is efficient and achieves a
                 much higher compression ratio than lossy and lossless
                 compression algorithms, while it has much smaller
                 information loss than lossy compression algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v9 n1 (October 2017), article 3.
@Article{Corsar:2017:COD,
  author =       "David Corsar and Peter Edwards",
  title =        "Challenges of Open Data Quality: More Than Just
                 License, Format, and Customer Support",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3110291",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v9 n1 (October 2017), article 4.
@Article{El-Mawass:2017:DQC,
  author =       "Nour El-Mawass and Saad Alaboodi",
  title =        "Data Quality Challenges in Social Spam Research",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3090057",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ v9 n1 (October 2017), article 5.
@Article{Chen:2017:IQC,
  author =       "Min Chen and Roman Lukyanenko and Monica Chiarini
                 Tremblay",
  title =        "Information Quality Challenges in Shared Healthcare
                 Decision Making",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3090056",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(1), article no. 6; a "Challenge Paper" per its title.
@Article{Arbuckle:2017:CPC,
  author =       "Peter Arbuckle and Ezra Kahn and Adam Kriesberg",
  title =        "Challenge Paper: Challenges to Sharing Data and Models
                 for Life Cycle Assessment",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "6:1--6:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106236",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(2), article no. 7; the braced span in the title protects the proper-noun/date text from style recasing.
@Article{Raschid:2018:ECJ,
  author =       "Louiqa Raschid",
  title =        "{Editor-in-Chief (January 2014--May 2017)} Farewell
                 Report",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3143313",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(2), article no. 8; {JDIQ Editor-in-Chief} braced to preserve capitalization under sentence-casing styles.
@Article{Catarci:2018:FNJ,
  author =       "Tiziana Catarci",
  title =        "Foreword from the New {JDIQ Editor-in-Chief}",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3143316",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(2), article no. 9.
@Article{Truong:2018:CEQ,
  author =       "Hong-Linh Truong and Aitor Murguzur and Erica Yang",
  title =        "Challenges in Enabling Quality of Analytics in the
                 Cloud",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3138806",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(2), article no. 10; {eTextbook} braced in the title to keep its camel case; abstract included.
@Article{Koh:2018:ELA,
  author =       "Kyu Han Koh and Eric Fouh and Mohammed F. Farghally
                 and Hossameldin Shahin and Clifford A. Shaffer",
  title =        "Experience: Learner Analytics Data Quality for an
                 {eTextbook} System",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148240",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "We present lessons learned related to data collection
                 and analysis from 5 years of experience with the
                 eTextbook system OpenDSA. The use of such cyberlearning
                 systems is expanding rapidly in both formal and
                 informal educational settings. Although the precise
                 issues related to any such project are idiosyncratic
                 based on the data collection technology and goals of
                 the project, certain types of data collection problems
                 will be common. We begin by describing the nature of
                 the data transmitted between the student's client
                 machine and the database server, and our initial
                 database schema for storing interaction log data. We
                 describe many problems that we encountered, with the
                 nature of the problems categorized as syntactic-level
                 data collection issues, issues with relating events to
                 users, or issues with tracking users over time.
                 Relating events to users and tracking the time spent on
                 tasks are both prerequisites to converting
                 syntactic-level interaction streams to semantic-level
                 behavior needed for higher-order analysis of the data.
                 Finally, we describe changes made to our database
                 schema that helped to resolve many of the issues that
                 we had encountered. These changes help advance our
                 ultimate goal of encouraging a change from ineffective
                 learning behavior by students to more productive
                 behavior.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(2), article no. 11.
%%% NOTE(review): author field gives initials only ("C. Cappiello" etc.), unlike the
%%% full given names used by the other entries in this file --- verify the full names
%%% against the ACM DL before expanding them.
@Article{Cappiello:2018:VDQ,
  author =       "C. Cappiello and C. Cerletti and C. Fratto and B.
                 Pernici",
  title =        "Validating Data Quality Actions in Scoring Processes",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3141248",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data quality has gained momentum among organizations
                 upon the realization that poor data quality might cause
                 failures and/or inefficiencies, thus compromising
                 business processes and application results. However,
                 enterprises often adopt data quality assessment and
                 improvement methods based on practical and empirical
                 approaches without conducting a rigorous analysis of
                 the data quality issues and outcome of the enacted data
                 quality improvement practices. In particular, data
                 quality management, especially the identification of
                 the data quality dimensions to be monitored and
                 improved, is performed by knowledge workers on the
                 basis of their skills and experience. Control methods
                 are therefore designed on the basis of expected and
                 evident quality problems; thus, these methods may not
                 be effective in dealing with unknown and/or unexpected
                 problems. This article aims to provide a methodology,
                 based on fault injection, for validating the data
                 quality actions used by organizations. We show how it
                 is possible to check whether the adopted techniques
                 properly monitor the real issues that may damage
                 business processes. At this stage, we focus on scoring
                 processes, i.e., those in which the output represents
                 the evaluation or ranking of a specific object. We show
                 the effectiveness of our proposal by means of a case
                 study in the financial risk management area.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(2), article no. 12; abstract included.
@Article{Heinrich:2018:RDQ,
  author =       "Bernd Heinrich and Diana Hristova and Mathias Klier
                 and Alexander Schiller and Michael Szubartowicz",
  title =        "Requirements for Data Quality Metrics",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148238",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data quality and especially the assessment of data
                 quality have been intensively discussed in research and
                 practice alike. To support an economically oriented
                 management of data quality and decision making under
                 uncertainty, it is essential to assess the data quality
                 level by means of well-founded metrics. However, if not
                 adequately defined, these metrics can lead to wrong
                 decisions and economic losses. Therefore, based on a
                 decision-oriented framework, we present a set of five
                 requirements for data quality metrics. These
                 requirements are relevant for a metric that aims to
                 support an economically oriented management of data
                 quality and decision making under uncertainty. We
                 further demonstrate the applicability and efficacy of
                 these requirements by evaluating five data quality
                 metrics for different data quality dimensions.
                 Moreover, we discuss practical implications when
                 applying the presented requirements.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(3), article no. 13; special-issue editorial (no abstract).
@Article{Geerts:2018:ESI,
  author =       "Floris Geerts and Paolo Missier and Norman Paton",
  title =        "Editorial: Special Issue on Improving the Veracity and
                 Value of Big Data",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "3",
  pages =        "13:1--13:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3174791",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(3), article no. 14; the abstract contains inline TeX math (Datalog$^\pm $), which must stay as written.
@Article{Bertossi:2018:OMD,
  author =       "Leopoldo Bertossi and Mostafa Milani",
  title =        "Ontological Multidimensional Data Models and
                 Contextual Data Quality",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "3",
  pages =        "14:1--14:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148239",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data quality assessment and data cleaning are
                 context-dependent activities. Motivated by this
                 observation, we propose the Ontological
                 Multidimensional Data Model (OMD model), which can be
                 used to model and represent contexts as logic-based
                 ontologies. The data under assessment are mapped into
                 the context for additional analysis, processing, and
                 quality data extraction. The resulting contexts allow
                 for the representation of dimensions, and
                 multidimensional data quality assessment becomes
                 possible. At the core of a multidimensional context, we
                 include a generalized multidimensional data model and a
                 Datalog$^\pm $ ontology with provably good properties
                 in terms of query answering. These main components are
                 used to represent dimension hierarchies, dimensional
                 constraints, and dimensional rules and define
                 predicates for quality data specification. Query
                 answering relies on and triggers navigation through
                 dimension hierarchies and becomes the basic tool for
                 the extraction of quality data. The OMD model is
                 interesting per se beyond applications to data quality.
                 It allows for a logic-based and computationally
                 tractable representation of multidimensional data,
                 extending previous multidimensional data models with
                 additional expressive power and functionalities.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(3), article no. 15; note the TeX special character {\"\i} in "na{\"\i}ve" in the abstract.
@Article{Mountantonakis:2018:SMM,
  author =       "Michalis Mountantonakis and Yannis Tzitzikas",
  title =        "Scalable Methods for Measuring the Connectivity and
                 Quality of Large Numbers of Linked Datasets",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "3",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3165713",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Although the ultimate objective of Linked Data is
                 linking and integration, it is not currently evident
                 how connected the current Linked Open Data (LOD) cloud
                 is. In this article, we focus on methods, supported by
                 special indexes and algorithms, for performing
                 measurements related to the connectivity of more than
                 two datasets that are useful in various tasks including
                 (a) Dataset Discovery and Selection; (b) Object
                 Coreference, i.e., for obtaining complete information
                 about a set of entities, including provenance
                 information; (c) Data Quality Assessment and
                 Improvement, i.e., for assessing the connectivity
                 between any set of datasets and monitoring their
                 evolution over time, as well as for estimating data
                 veracity; (d) Dataset Visualizations; and various other
                 tasks. Since it would be prohibitively expensive to
                 perform all these measurements in a na{\"\i}ve way, in
                 this article, we introduce indexes (and their
                 construction algorithms) that can speed up such tasks.
                 In brief, we introduce (i) a namespace-based prefix
                 index, (ii) a sameAs catalog for computing the
                 symmetric and transitive closure of the owl:sameAs
                 relationships encountered in the datasets, (iii) a
                 semantics-aware element index (that exploits the
                 aforementioned indexes), and, finally, (iv) two
                 lattice-based incremental algorithms for speeding up
                 the computation of the intersection of URIs of any set
                 of datasets. For enhancing scalability, we propose
                 parallel index construction algorithms and parallel
                 lattice-based incremental algorithms, we evaluate the
                 achieved speedup using either a single machine or a
                 cluster of machines, and we provide insights regarding
                 the factors that affect efficiency. Finally, we report
                 measurements about the connectivity of the (billion
                 triples-sized) LOD cloud that have never been carried
                 out so far.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(3), article no. 16.
%%% Fix: subtitle article capitalized ("An Exploratory Analysis") to match the Title
%%% Case used by every other title in this file (cf. "Experience: Learner ...",
%%% "{SPMDL}: Software ...", "{InfoClean}: Protecting ...").
@Article{Esteves:2018:TVA,
  author =       "Diego Esteves and Anisa Rula and Aniketh Janardhan
                 Reddy and Jens Lehmann",
  title =        "Toward Veracity Assessment in {RDF} Knowledge Bases:
                 An Exploratory Analysis",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "3",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3177873",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Among different characteristics of knowledge bases,
                 data quality is one of the most relevant to maximize
                 the benefits of the provided information. Knowledge
                 base quality assessment poses a number of big data
                 challenges such as high volume, variety, velocity, and
                 veracity. In this article, we focus on answering
                 questions related to the assessment of the veracity of
                 facts through Deep Fact Validation (DeFacto), a triple
                 validation framework designed to assess facts in RDF
                 knowledge bases. Despite current developments in the
                 research area, the underlying framework faces many
                 challenges. This article pinpoints and discusses these
                 issues and conducts a thorough analysis of its
                 pipeline, aiming at reducing the error propagation
                 through its components. Furthermore, we discuss recent
                 developments related to this fact validation as well as
                 describing advantages and drawbacks of state-of-the-art
                 models. As a result of this exploratory analysis, we
                 give insights and directions toward a better
                 architecture to tackle the complex task of
                 fact-checking in knowledge bases.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(3), article no. 17; abstract compares the CD-HIT and UCLUST clustering tools.
@Article{Chen:2018:CAS,
  author =       "Qingyu Chen and Yu Wan and Xiuzhen Zhang and Yang Lei
                 and Justin Zobel and Karin Verspoor",
  title =        "Comparative Analysis of Sequence Clustering Methods
                 for Deduplication of Biological Databases",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "3",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3131611",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The massive volumes of data in biological sequence
                 databases provide a remarkable resource for large-scale
                 biological studies. However, the underlying data
                 quality of these resources is a critical concern. A
                 particular challenge is duplication, in which multiple
                 records have similar sequences, creating a high level
                 of redundancy that impacts database storage, curation,
                 and search. Biological database deduplication has two
                 direct applications: for database curation, where
                 detected duplicates are removed to improve curation
                 efficiency, and for database search, where detected
                 duplicate sequences may be flagged but remain available
                 to support analysis. Clustering methods have been
                 widely applied to biological sequences for database
                 deduplication. Since an exhaustive all-by-all pairwise
                 comparison of sequences cannot scale for a high volume
                 of data, heuristic approaches have been recruited, such
                 as the use of simple similarity thresholds. In this
                 article, we present a comparison between CD-HIT and
                 UCLUST, the two best-known clustering tools for
                 sequence database deduplication. Our contributions
                 include a detailed assessment of the redundancy
                 remaining after deduplication, application of standard
                 clustering evaluation metrics to quantify the cohesion
                 and separation of the clusters generated by each
                 method, and a biological case study that assesses
                 intracluster function annotation consistency to
                 demonstrate the impact of these factors on a practical
                 application of the sequence clustering methods. Our
                 results show that the trade-off between efficiency and
                 accuracy becomes acute when low threshold values are
                 used and when cluster sizes are large. This evaluation
                 leads to practical recommendations for users for more
                 effective uses of the sequence clustering tools for
                 deduplication.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(4), article no. 18; a "Challenge Paper" per its title (no abstract).
@Article{Gal:2018:CPD,
  author =       "Avigdor Gal and Arik Senderovich and Matthias
                 Weidlich",
  title =        "Challenge Paper: Data Quality Issues in Queue Mining",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "4",
  pages =        "18:1--18:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3165712",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(4), article no. 19; accented author name uses a BibTeX special character (S{\"o}ren).
@Article{Musyaffa:2018:EOF,
  author =       "Fathoni A. Musyaffa and Christiane Engels and
                 Maria-Esther Vidal and Fabrizio Orlandi and S{\"o}ren
                 Auer",
  title =        "Experience: Open Fiscal Datasets, Common Issues, and
                 Recommendations",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "4",
  pages =        "19:1--19:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3190576",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Public administrations are continuously publishing
                 open data, increasing the amount of government open
                 data over time. The published data includes budgets and
                 spending as part of fiscal data; publishing these data
                 is an important part of transparent and accountable
                 governance. However, open fiscal data should also meet
                 open data publication guidelines. When requirements in
                 data guidelines are not met, effective data analysis
                 over published datasets cannot be performed
                 effectively. In this article, we present Open Fiscal
                 Data Publication (OFDP), a framework to assess the
                 quality of open fiscal datasets. We also present an
                 extensive open fiscal data assessment and common data
                 quality issues found; additionally, open fiscal data
                 publishing guidelines are presented. We studied and
                 surveyed main quality factors for open fiscal datasets.
                 Moreover, the collected quality factors have been
                 scored according to the results of a questionnaire to
                 score quality factors within the OFDP assessment
                 framework. We gather and comprehensively analyze a
                 representative set of 77 fiscal datasets from several
                 public administrations across different regions at
                 different levels (e.g., supranational, national,
                 municipality). We characterize quality issues commonly
                 arising in these datasets. Our assessment shows that
                 there are many quality factors in fiscal data
                 publication that still need to be taken care of so that
                 the data can be analyzed effectively. Our proposed
                 guidelines allow for publishing open fiscal data where
                 these quality issues are avoided.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(4), article no. 20; {SPMDL} braced in the title to keep the acronym uppercase.
@Article{Alshayeb:2018:SSP,
  author =       "Mohammad Alshayeb and Yasser Shaaban and Jarallah
                 Al-Ghamdi",
  title =        "{SPMDL}: Software Product Metrics Definition
                 Language",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "4",
  pages =        "20:1--20:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3185049",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Software metrics are becoming more acceptable measures
                 for software quality assessment. However, there is no
                 standard form to represent metric definitions, which
                 would be useful for metrics exchange and customization.
                 In this article, we propose the Software Product
                 Metrics Definition Language (SPMDL). We develop an
                 XML-based description language to define software
                 metrics in a precise and reusable form. Metric
                 definitions in SPMDL are based on meta-models extracted
                 from either source code or design artifacts, such as
                 the Dagstuhl Middle Meta-model, with support for
                 various abstraction levels. The language defines
                 several flexible computation mechanisms, such as
                 extended Object Constraint Language queries and
                 predefined graph operations on the meta-model. SPMDL
                 provides an unambiguous description of the metric
                 definition; it is also easy to use and is extensible.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(4), article no. 21.
@Article{Ashish:2018:MRB,
  author =       "Naveen Ashish and Arihant Patawari",
  title =        "Machine Reading of Biomedical Data Dictionaries",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "4",
  pages =        "21:1--21:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3177874",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This article describes an approach for the automated
                 reading of biomedical data dictionaries. Automated
                 reading is the process of extracting element details
                 for each of the data elements from a data dictionary in
                 a document format (such as PDF) to a completely
                 structured representation. A structured representation
                 is essential if the data dictionary metadata are to be
                 used in applications such as data integration and also
                 in evaluating the quality of the associated data. We
                 present an approach and implemented solution for the
                 problem, considering different formats of data
                 dictionaries. We have a particular focus on the most
                 challenging format with a machine-learning
                 classification solution to the problem using
                 conditional random field classifiers. We present an
                 evaluation using several actual data dictionaries,
                 demonstrating the effectiveness of our approach.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

%%% JDIQ 9(4), article no. 22; {InfoClean} braced in the title to preserve its capitalization.
@Article{Chiang:2018:IPS,
  author =       "Fei Chiang and Dhruv Gairola",
  title =        "{InfoClean}: Protecting Sensitive Information in Data
                 Cleaning",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "4",
  pages =        "22:1--22:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3190577",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data quality has become a pervasive challenge for
                 organizations as they wrangle with large, heterogeneous
                 datasets to extract value. Given the proliferation of
                 sensitive and confidential information, it is crucial
                 to consider data privacy concerns during the data
                 cleaning process. For example, in medical database
                 applications, varying levels of privacy are enforced
                 across the attribute values. Attributes such as a
                 patient's country or city of residence may be less
                 sensitive than the patient's prescribed medication.
                 Traditional data cleaning techniques assume the data is
                 openly accessible, without considering the differing
                 levels of information sensitivity. In this work, we
                 take the first steps toward a data cleaning model that
                 integrates privacy as part of the data cleaning
                 process. We present a privacy-aware data cleaning
                 framework that differentiates the information content
                 among the attribute values during the data cleaning
                 process to resolve data inconsistencies while
                 minimizing the amount of information disclosed. Our
                 data repair algorithm includes a set of data disclosure
                 operations that considers the information content of
                 the underlying attribute values, while maximizing data
                 utility. Our evaluation using real datasets shows that
                 our algorithm scales well, and achieves improved
                 performance and comparable repair accuracy against
                 existing data cleaning solutions.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bertino:2018:ACE,
  author =       "Elisa Bertino and Mohammad R. Jahanshahi",
  title =        "Adaptive and Cost-Effective Collection of High-Quality
                 Data for Critical Infrastructure and Emergency
                 Management in Smart Cities---Framework and Challenges",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "1",
  pages =        "1:1--1:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3190579",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Flores:2018:IQA,
  author =       "Javier Flores and Jun Sun",
  title =        "Information Quality Awareness and Information Quality
                 Practice",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "1",
  pages =        "2:1--2:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3182182",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Healthcare organizations increasingly rely on
                 electronic information to optimize their operations.
                 Information of high diversity from various sources
                 accentuate the relevance and importance of information
                 quality (IQ). The quality of information needs to be
                 improved to support a more efficient and reliable
                 utilization of healthcare information systems (IS).
                 This can only be achieved through the implementation of
                 initiatives followed by most users across an
                 organization. The purpose of this study is to examine
                 how awareness of IS users about IQ issues would affect
                 their IQ behavior. Based on multiple theoretical
                 frameworks, it is hypothesized that different aspects
                 of user motivation mediate the relationship between the
                 awareness on both beneficial and problematic situations
                 and IQ practice inclination. In addition, social
                 influence and facilitating condition moderate the
                 relationship between IQ practice inclination and overt
                 IQ practice. The theoretical and practical implications
                 of findings are discussed, especially how to enhance IQ
                 compliance in the healthcare settings.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bors:2018:VIC,
  author =       "Christian Bors and Theresia Gschwandtner and Simone
                 Kriglstein and Silvia Miksch and Margit Pohl",
  title =        "Visual Interactive Creation, Customization, and
                 Analysis of Data Quality Metrics",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3190578",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "During data preprocessing, analysts spend a
                 significant part of their time and effort profiling the
                 quality of the data along with cleansing and
                 transforming the data for further analysis. While
                 quality metrics---ranging from general to domain-specific
                 measures---support assessment of the quality of a
                 dataset, there are hardly any approaches to visually
                 support the analyst in customizing and applying such
                 metrics. Yet, visual approaches could facilitate users'
                 involvement in data quality assessment. We present
                 MetricDoc, an interactive environment for assessing
                 data quality that provides customizable, reusable
                 quality metrics in combination with immediate visual
                 feedback. Moreover, we provide an overview
                 visualization of these quality metrics along with error
                 visualizations that facilitate interactive navigation
                 of the data to determine the causes of quality issues
                 present in the data. In this article, we describe the
                 architecture, design, and evaluation of MetricDoc,
                 which underwent several design cycles, including
                 heuristic evaluation and expert reviews as well as a
                 focus group with data quality, human-computer
                 interaction, and visual analytics experts.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zhang:2018:ASB,
  author =       "Han Zhang and Shawndra Hill and David Rothschild",
  title =        "Addressing Selection Bias in Event Studies with
                 General-Purpose Social Media Panels",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "1",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3185048",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data from Twitter have been employed in prior research
                 to study the impacts of events. Conventionally,
                 researchers use keyword-based samples of tweets to
                 create a panel of Twitter users who mention
                 event-related keywords during and after an event.
                 However, the keyword-based sampling is limited in its
                 objectivity dimension of data and information quality.
                 First, the technique suffers from selection bias since
                 users who discuss an event are already more likely to
                 discuss event-related topics beforehand. Second, there
                 are no viable control groups for comparison to a
                 keyword-based sample of Twitter users. We propose an
                 alternative sampling approach to construct panels of
                 users defined by their geolocation. Geolocated panels
                 are exogenous to the keywords in users' tweets,
                 resulting in less selection bias than the keyword panel
                 method. Geolocated panels allow us to follow
                 within-person changes over time and enable the creation
                 of comparison groups. We compare different panels in
                 two real-world settings: response to mass shootings and
                 TV advertising. We first show the strength of the
                 selection biases of keyword panels. Then, we
                 empirically illustrate how geolocated panels reduce
                 selection biases and allow meaningful comparison groups
                 regarding the impact of the studied events. We are the
                 first to provide a clear, empirical example of how a
                 better panel selection design, based on an exogenous
                 variable such as geography, both reduces selection bias
                 compared to the current state of the art and increases
                 the value of Twitter research for studying events.
                 While we advocate for the use of a geolocated panel, we
                 also discuss its weaknesses and application scenario
                 seriously. This article also calls attention to the
                 importance of selection bias in impacting the
                 objectivity of social media data.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Puentes:2018:CQE,
  author =       "John Puentes and Pedro Merino Laso and David Brosset",
  title =        "The Challenge of Quality Evaluation in Fraud
                 Detection",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "2",
  pages =        "5:1--5:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3228341",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3228341",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bertino:2018:CAC,
  author =       "Elisa Bertino and Amani Abu Jabal and Seraphin Calo
                 and Dinesh Verma and Christopher Williams",
  title =        "The Challenge of Access Control Policies Quality",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "2",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209668",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3209668",
  abstract =     "Access Control policies allow one to control data
                 sharing among multiple subjects. For high assurance
                 data security, it is critical that such policies be fit
                 for their purpose. In this paper we introduce the
                 notion of ``policy quality'' and elaborate on its many
                 dimensions, such as consistency, completeness, and
                 minimality. We introduce a framework supporting the
                 analysis of policies with respect to the introduced
                 quality dimensions and elaborate on research
                 challenges, including policy analysis for large-scale
                 distributed systems, assessment of policy correctness,
                 and analysis of policies expressed in richer policy
                 models.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Karanja:2018:CPT,
  author =       "Evanson Mwangi Karanja and Shedden Masupe and Mandu
                 Gasennelwe-Jeffrey",
  title =        "Challenge Paper: Towards Open Datasets for {Internet
                 of Things} Malware",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "2",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3230669",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Koumarelas:2018:EEA,
  author =       "Ioannis Koumarelas and Axel Kroschk and Clifford
                 Mosley and Felix Naumann",
  title =        "Experience: Enhancing Address Matching with Geocoding
                 and Similarity Measure Selection",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3232852",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Given a query record, record matching is the problem
                 of finding database records that represent the same
                 real-world object. In the easiest scenario, a database
                 record is completely identical to the query. However,
                 in most cases, problems do arise, for instance, as a
                 result of data errors or data integrated from multiple
                 sources or received from restrictive form fields. These
                 problems are usually difficult, because they require a
                 variety of actions, including field segmentation,
                 decoding of values, and similarity comparisons, each
                 requiring some domain knowledge. In this article, we
                 study the problem of matching records that contain
                 address information, including attributes such as
                 Street-address and City. To facilitate this matching
                 process, we propose a domain-specific procedure to,
                 first, enrich each record with a more complete
                 representation of the address information through
                 geocoding and reverse-geocoding and, second, to select
                 the best similarity measure per each address attribute
                 that will finally help the classifier to achieve the
                 best f-measure. We report on our experience in
                 selecting geocoding services and discovering similarity
                 measures for a concrete but common industry use-case.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ferro:2018:ISIa,
  author =       "Nicola Ferro and Norbert Fuhr and Andreas Rauber",
  title =        "Introduction to the Special Issue on Reproducibility
                 in Information Retrieval: Evaluation Campaigns,
                 Collections, and Analyses",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "3",
  pages =        "9:1--9:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3268408",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Moffat:2018:EMU,
  author =       "Alistair Moffat and Falk Scholer and Ziying Yang",
  title =        "Estimating Measurement Uncertainty for Information
                 Retrieval Effectiveness Metrics",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3239572",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3239572",
  abstract =     "One typical way of building test collections for
                 offline measurement of information retrieval systems is
                 to pool the ranked outputs of different systems down to
                 some chosen depth $d$ and then form relevance judgments
                 for those documents only. Non-pooled documents---ones
                 that did not appear in the top-$d$ sets of any of the
                 contributing systems---are then deemed to be non-relevant
                 for the purposes of evaluating the relative behavior of
                 the systems. In this article, we use RBP-derived
                 residuals to re-examine the reliability of that
                 process. By fitting the RBP parameter $ \phi $ to
                 maximize similarity between AP- and NDCG-induced system
                 rankings, on the one hand, and RBP-induced rankings, on
                 the other, an estimate can be made as to the potential
                 score uncertainty associated with those two
                 recall-based metrics. We then consider the effect that
                 residual size---as an indicator of possible measurement
                 uncertainty in utility-based metrics---has in connection
                 with recall-based metrics by computing the effect of
                 increasing pool sizes and examining the trends that
                 arise in terms of both metric score and system
                 separability using standard statistical tests. The
                 experimental results show that the confidence levels
                 expressed via the $p$-values generated by statistical
                 tests are only weakly connected to the size of the
                 residual and to the degree of measurement uncertainty
                 caused by the presence of unjudged documents.
                 Statistical confidence estimates are, however, largely
                 consistent as pooling depths are altered. We therefore
                 recommend that all such experimental results should
                 report, in addition to the outcomes of statistical
                 significance tests, the residual measurements generated
                 by a suitably matched weighted-precision metric, to
                 give a clear indication of measurement uncertainty that
                 arises due to the presence of unjudged documents in
                 test collections with finite pooled judgments.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Roitero:2018:RGE,
  author =       "Kevin Roitero and Marco Passon and Giuseppe Serra and
                 Stefano Mizzaro",
  title =        "{Reproduce}. {Generalize}. {Extend}. {On} Information
                 Retrieval Evaluation without Relevance Judgments",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3241064",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3241064",
  abstract =     "The evaluation of retrieval effectiveness by means of
                 test collections is a commonly used methodology in the
                 information retrieval field. Some researchers have
                 addressed the quite fascinating research question of
                 whether it is possible to evaluate effectiveness
                 completely automatically, without human relevance
                 assessments. Since human relevance assessment is one of
                 the main costs of building a test collection, both in
                 human time and money resources, this rather ambitious
                 goal would have a practical impact. In this article, we
                 reproduce the main results on evaluating information
                 retrieval systems without relevance judgments;
                 furthermore, we generalize such previous work to
                 analyze the effect of test collections, evaluation
                 metrics, and pool depth. We also expand the idea to
                 semi-automatic evaluation and estimation of topic
                 difficulty. Our results show that (i) previous work is
                 overall reproducible, although some specific results
                 are not; (ii) collection, metric, and pool depth impact
                 the automatic evaluation of systems, which is anyway
                 accurate in several cases; (iii) semi-automatic
                 evaluation is an effective methodology; and (iv)
                 automatic evaluation can (to some extent) be used to
                 predict topic difficulty.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Roitero:2018:RIE,
  author =       "Kevin Roitero and Michael Soprano and Andrea Brunello
                 and Stefano Mizzaro",
  title =        "Reproduce and Improve: {An} Evolutionary Approach to
                 Select a Few Good Topics for Information Retrieval
                 Evaluation",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3239573",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3239573",
  abstract =     "Effectiveness evaluation of information retrieval
                 systems by means of a test collection is a widely used
                 methodology. However, it is rather expensive in terms
                 of resources, time, and money; therefore, many
                 researchers have proposed methods for a cheaper
                 evaluation. One particular approach, on which we focus
                 in this article, is to use fewer topics: in TREC-like
                 initiatives, usually system effectiveness is evaluated
                 as the average effectiveness on a set of $n$ topics
                 (usually, $n = 50$, but more than 1,000 have been also
                 adopted); instead of using the full set, it has been
                 proposed to find the best subsets of a few good topics
                 that evaluate the systems in the most similar way to
                 the full set. The computational complexity of the task
                 has so far limited the analysis that has been
                 performed. We develop a novel and efficient approach
                 based on a multi-objective evolutionary algorithm. The
                 higher efficiency of our new implementation allows us
                 to reproduce some notable results on topic set
                 reduction, as well as perform new experiments to
                 generalize and improve such results. We show that our
                 approach is able to both reproduce the main
                 state-of-the-art results and to allow us to analyze the
                 effect of the collection, metric, and pool depth used
                 for the evaluation. Finally, differently from previous
                 studies, which have been mainly theoretical, we are
                 also able to discuss some practical topic selection
                 strategies, integrating results of automatic evaluation
                 approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Jagerman:2018:OLL,
  author =       "Rolf Jagerman and Krisztian Balog and Maarten {de
                 Rijke}",
  title =        "{OpenSearch}: Lessons Learned from an Online
                 Evaluation Campaign",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3239575",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3239575",
  abstract =     "We report on our experience with TREC OpenSearch, an
                 online evaluation campaign that enabled researchers to
                 evaluate their experimental retrieval methods using
                 real users of a live website. Specifically, we focus on
                 the task of ad hoc document retrieval within the
                 academic search domain, and work with two search
                 engines, CiteSeerX and SSOAR, that provide us with
                 traffic. We describe our experimental platform, which
                 is based on the living labs methodology, and report on
                 the experimental results obtained. We also share our
                 experiences, challenges, and the lessons learned from
                 running this track in 2016 and 2017.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ferro:2018:ISIb,
  author =       "Nicola Ferro and Norbert Fuhr and Andreas Rauber",
  title =        "Introduction to the Special Issue on Reproducibility
                 in Information Retrieval: Tools and Infrastructures",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "4",
  pages =        "14:1--14:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3268410",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Hopfgartner:2018:ESC,
  author =       "Frank Hopfgartner and Allan Hanbury and Henning
                 M{\"u}ller and Ivan Eggel and Krisztian Balog and
                 Torben Brodt and Gordon V. Cormack and Jimmy Lin and
                 Jayashree Kalpathy-Cramer and Noriko Kando and Makoto
                 P. Kato and Anastasia Krithara and Tim Gollub and
                 Martin Potthast and Evelyne Viegas and Simon Mercer",
  title =        "Evaluation-as-a-Service for the Computational
                 Sciences: Overview and Outlook",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "4",
  pages =        "15:1--15:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3239570",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Evaluation in empirical computer science is essential
                 to show progress and assess technologies developed.
                 Several research domains such as information retrieval
                 have long relied on systematic evaluation to measure
                 progress: here, the Cranfield paradigm of creating
                 shared test collections, defining search tasks, and
                 collecting ground truth for these tasks has persisted
                 up until now. In recent years, however, several new
                 challenges have emerged that do not fit this paradigm
                 very well: extremely large data sets, confidential data
                 sets as found in the medical domain, and rapidly
                 changing data sets as often encountered in industry.
                 Crowdsourcing has also changed the way in which
                 industry approaches problem-solving with companies now
                 organizing challenges and handing out monetary awards
                 to incentivize people to work on their challenges,
                 particularly in the field of machine learning. This
                 article is based on discussions at a workshop on
                 Evaluation-as-a-Service (EaaS). EaaS is the paradigm of
                 not providing data sets to participants and have them
                 work on the data locally, but keeping the data central
                 and allowing access via Application Programming
                 Interfaces (API), Virtual Machines (VM), or other
                 possibilities to ship executables. The objectives of
                 this article are to summarize and compare the current
                 approaches and consolidate the experiences of these
                 approaches to outline the next steps of EaaS,
                 particularly toward sustainable research
                 infrastructures. The article summarizes several
                 existing approaches to EaaS and analyzes their usage
                 scenarios and also the advantages and disadvantages.
                 The many factors influencing EaaS are summarized, and
                 the environment in terms of motivations for the various
                 stakeholders, from funding agencies to challenge
                 organizers, researchers and participants, to industry
                 interested in supplying real-world problems for which
                 they require solutions. EaaS solves many problems of
                 the current research environment, where data sets are
                 often not accessible to many researchers. Executables
                 of published tools are equally often not available
                 making the reproducibility of results impossible. EaaS,
                 however, creates reusable/citable data sets as well as
                 available executables. Many challenges remain, but such
                 a framework for research can also foster more
                 collaboration between researchers, potentially
                 increasing the speed of obtaining research results.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Yang:2018:ARR,
  author =       "Peilin Yang and Hui Fang and Jimmy Lin",
  title =        "{Anserini}: Reproducible Ranking Baselines Using
                 {Lucene}",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "4",
  pages =        "16:1--16:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3239571",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This work tackles the perennial problem of
                 reproducible baselines in information retrieval
                 research, focusing on bag-of-words ranking models.
                 Although academic information retrieval researchers
                 have a long history of building and sharing systems,
                 they are primarily designed to facilitate the
                 publication of research papers. As such, these systems
                 are often incomplete, inflexible, poorly documented,
                 difficult to use, and slow, particularly in the context
                 of modern web-scale collections. Furthermore, the
                 growing complexity of modern software ecosystems and
                 the resource constraints most academic research groups
                 operate under make maintaining open-source systems a
                 constant struggle. However, except for a small number
                 of companies (mostly commercial web search engines)
                 that deploy custom infrastructure, Lucene has become
                 the de facto platform in industry for building search
                 applications. Lucene has an active developer base, a
                 large audience of users, and diverse capabilities to
                 work with heterogeneous collections at scale. However,
                 it lacks systematic support for ad hoc experimentation
                 using standard test collections. We describe Anserini,
                 an information retrieval toolkit built on Lucene that
                 fills this gap. Our goal is to simplify ad hoc
                 experimentation and allow researchers to easily
                 reproduce results with modern bag-of-words ranking
                 models on diverse test collections. With Anserini, we
                 demonstrate that Lucene provides a suitable framework
                 for supporting information retrieval research.
                 Experiments show that our system efficiently indexes
                 large web collections, provides modern ranking models
                 that are on par with research implementations in terms
                 of effectiveness, and supports low-latency query
                  evaluation to facilitate rapid experimentation.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Kiesel:2018:RWC,
  author =       "Johannes Kiesel and Florian Kneist and Milad Alshomary
                 and Benno Stein and Matthias Hagen and Martin
                 Potthast",
  title =        "Reproducible {Web} Corpora: Interactive Archiving with
                 Automatic Quality Assessment",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "4",
  pages =        "17:1--17:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3239574",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The evolution of web pages from static HTML pages
                 toward dynamic pieces of software has rendered
                 archiving them increasingly difficult. Nevertheless, an
                 accurate, reproducible web archive is a necessity to
                 ensure the reproducibility of web-based research.
                 Archiving web pages reproducibly, however, is currently
                 not part of best practices for web corpus construction.
                 As a result, and despite the ongoing efforts of other
                 stakeholders to archive the web, tools for the
                 construction of reproducible web corpora are
                 insufficient or ill-fitted. This article presents a new
                 tool tailored to this purpose. It relies on emulating
                 user interactions with a web page while recording all
                 network traffic. The customizable user interactions can
                 be replayed on demand, while requests sent by the
                 archived page are served with the recorded responses.
                 The tool facilitates reproducible user studies, user
                 simulations, and evaluations of algorithms that rely on
                 extracting data from web pages. To evaluate our tool,
                 we conduct the first systematic assessment of
                 reproduction quality for rendered web pages. Using our
                  tool, we create a corpus of 10,000 web pages
                 carefully sampled from the Common Crawl and manually
                 annotated with regard to reproduction quality via
                 crowdsourcing. Based on this data, we test three
                 approaches to automatic reproduction-quality
                 assessment. An off-the-shelf neural network, trained on
                 visual differences between the web page during
                 archiving and reproduction, matches the manual
                 assessments best. This automatic assessment of
                 reproduction quality allows for immediate bugfixing
                 during archiving and continuous development of our tool
                 as the web continues to evolve.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Roy:2018:CCD,
  author =       "Dwaipayan Roy and Mandar Mitra and Debasis Ganguly",
  title =        "To Clean or Not to Clean: Document Preprocessing and
                 Reproducibility",
  journal =      j-JDIQ,
  volume =       "10",
  number =       "4",
  pages =        "18:1--18:??",
  month =        nov,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242180",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Web document collections such as WT10G, GOV2, and
                 ClueWeb are widely used for text retrieval experiments.
                 Documents in these collections contain a fair amount of
                 non-content-related markup in the form of tags,
                 hyperlinks, and so on. Published articles that use
                 these corpora generally do not provide specific details
                 about how this markup information is handled during
                 indexing. However, this question turns out to be
                 important: Through experiments, we find that including
                 or excluding metadata in the index can produce
                 significantly different results with standard IR
                 models. More importantly, the effect varies across
                 models and collections. For example, metadata filtering
                 is found to be generally beneficial when using BM25, or
                 language modeling with Dirichlet smoothing, but can
                 significantly reduce retrieval effectiveness if
                 language modeling is used with Jelinek-Mercer
                 smoothing. We also observe that, in general, the
                 performance differences become more noticeable as the
                  amount of metadata in the test collections increases.
                 Given this variability, we believe that the details of
                 document preprocessing are significant from the point
                 of view of reproducibility. In a second set of
                 experiments, we also study the effect of preprocessing
                 on query expansion using RM3. In this case, once again,
                 we find that it is generally better to remove markup
                 before using documents for query expansion.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Srivastava:2019:EHQ,
  author =       "Divesh Srivastava and Monica Scannapieco and Thomas C.
                 Redman",
  title =        "Ensuring High-Quality Private Data for Responsible
                 Data Science: Vision and Challenges",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3287168",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3287168",
  abstract =     "High-quality data is critical for effective data
                 science. As the use of data science has grown, so too
                 have concerns that individuals' rights to privacy will
                 be violated. This has led to the development of data
                 protection regulations around the globe and the use of
                 sophisticated anonymization techniques to protect
                 privacy. Such measures make it more challenging for the
                 data scientist to understand the data, exacerbating
                 issues of data quality. Responsible data science aims
                 to develop useful insights from the data while fully
                 embracing these considerations. We pose the high-level
                 problem in this article, ``How can a data scientist
                 develop the needed trust that private data has high
                 quality?'' We then identify a series of challenges for
                 various data-centric communities and outline research
                 questions for data quality and privacy researchers,
                 which would need to be addressed to effectively answer
                 the problem posed in this article.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Rios:2019:CTF,
  author =       "Julio C{\'e}sar Cort{\'e}s R{\'\i}os and Norman W.
                 Paton and Alvaro A. A. Fernandes and Edward Abel and
                 John A. Keane",
  title =        "Crowdsourced Targeted Feedback Collection for
                 Multicriteria Data Source Selection",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3284934",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3284934",
  abstract =     "A multicriteria data source selection (MCSS) scenario
                 identifies, from a set of candidate data sources, the
                 subset that best meets users' needs. These needs are
                 expressed using several criteria, which are used to
                 evaluate the candidate data sources. An MCSS problem
                 can be solved using multidimensional optimization
                 techniques that trade off the different objectives.
                 Sometimes one may have uncertain knowledge regarding
                 how well the candidate data sources meet the criteria.
                 In order to overcome this uncertainty, one may rely on
                 end-users or crowds to annotate the data items produced
                 by the sources in relation to the selection criteria.
                 In this article, a proposed Targeted Feedback
                 Collection (TFC) approach is introduced that aims to
                 identify those data items on which feedback should be
                 collected, thereby providing evidence on how the
                 sources satisfy the required criteria. The proposed TFC
                 targets feedback by considering the confidence
                 intervals around the estimated criteria values, with a
                 view to increasing the confidence in the estimates that
                 are most relevant to the multidimensional optimization.
                 Variants of the proposed TFC approach have been
                 developed for use where feedback is expected to be
                 reliable (e.g., where it is provided by trusted
                 experts) and where feedback is expected to be
                 unreliable (e.g., from crowd workers). Both variants
                 have been evaluated, and positive results are reported
                 against other approaches to feedback collection,
                 including active learning, in experiments that involve
                 real-world datasets and crowdsourcing.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Dallachiesa:2019:ICQ,
  author =       "Michele Dallachiesa and Charu C. Aggarwal and Themis
                 Palpanas",
  title =        "Improving Classification Quality in Uncertain Graphs",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3242095",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3242095",
  abstract =     "In many real applications that use and analyze
                 networked data, the links in the network graph may be
                 erroneous or derived from probabilistic techniques. In
                 such cases, the node classification problem can be
                 challenging, since the unreliability of the links may
                 affect the final results of the classification process.
                 If the information about link reliability is not used
                 explicitly, then the classification accuracy in the
                 underlying network may be affected adversely. In this
                 article, we focus on situations that require the
                 analysis of the uncertainty that is present in the
                 graph structure. We study the novel problem of node
                 classification in uncertain graphs, by treating
                 uncertainty as a first-class citizen. We propose two
                 techniques based on a Bayes model and automatic
                 parameter selection and show that the incorporation of
                 uncertainty in the classification process as a
                 first-class citizen is beneficial. We experimentally
                 evaluate the proposed approach using different real
                 data sets and study the behavior of the algorithms
                 under different conditions. The results demonstrate the
                 effectiveness and efficiency of our approach.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Casey:2019:FRR,
  author =       "K. Michael Casey and Kevin {Casey Jr.}",
  title =        "Financial Regulatory and Risk Management Challenges
                 Stemming from Firm-Specific Digital Misinformation",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jan,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3274655",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3274655",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2019:DGC,
  author =       "Wenfei Fan",
  title =        "Dependencies for Graphs: Challenges and
                 Opportunities",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "2",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3310230",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3310230",
  abstract =     "What are graph dependencies? What do we need them for?
                 What new challenges do they introduce? This article
                 tackles these questions. It aims to incite curiosity
                 and interest in this emerging area of research.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sillaber:2019:EDI,
  author =       "Christian Sillaber and Andrea Mussmann and Ruth Breu",
  title =        "Experience: Data and Information Quality Challenges in
                 Governance, Risk, and Compliance Management",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "2",
  pages =        "6:1--6:??",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3297721",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3297721",
  abstract =     "Governance, risk, and compliance (GRC) managers often
                 struggle to document the current state of their
                 organizations. This is due to the complexity of their
                 IS landscape, the complex regulatory and organizational
                 environment, and the frequent changes to both. GRC
                 tools seek to support them by integrating existing
                 information sources. However, a comprehensive analysis
                 of how the data is managed in such tools, as well as
                 the impact of data quality, is still missing. To build
                 a basis of empirical data, we conducted a series of
                 interviews with information security managers
                 responsible for GRC management activities in their
                 organizations. The results of a qualitative content
                 analysis of these interviews suggest that decision
                 makers largely depend on high-quality documentation but
                 struggle to maintain their documentation at the
                 required level for long periods of time. This work
                 discusses factors affecting the quality of GRC data and
                 information and provides insights into approaches
                 implemented by organizations to analyze, improve, and
                 maintain the quality of their GRC data and
                 information.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lazar:2019:EEM,
  author =       "Alina Lazar and Ling Jin and C. Anna Spurlock and
                 Kesheng Wu and Alex Sim and Annika Todd",
  title =        "Evaluating the Effects of Missing Values and Mixed
                 Data Types on Social Sequence Clustering Using {t-SNE}
                 Visualization",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "2",
  pages =        "7:1--7:??",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3301294",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301294",
  abstract =     "The goal of this work is to investigate the impact of
                 missing values in clustering joint categorical social
                 sequences. Identifying patterns in sociodemographic
                 longitudinal data is important in a number of social
                 science settings. However, performing analytical
                 operations, such as clustering on life course
                 trajectories, is challenging due to the categorical and
                 multidimensional nature of the data, their mixed data
                 types, and corruption by missing and inconsistent
                 values. Data quality issues were investigated
                 previously on single variable sequences. To understand
                 their effects on multivariate sequence analysis, we
                 employ a dataset of mixed data types and missing
                 values, a dissimilarity measure designed for joint
                 categorical sequence data, together with dimensionality
                 reduction methodologies in a systematic design of
                 sequence clustering experiments. Given the categorical
                 nature of our data, we employ an ``edit'' distance
                 using optimal matching. Because each data record has
                 multiple variables of different types, we investigate
                 the impact of mixing these variables in a single
                 dissimilarity measure. Between variables with binary
                 values and those with multiple nominal values, we find
                 that the ability to overcome missing data problems is
                 more difficult in the nominal domain than in the binary
                 domain. Additionally, alignment of leading missing
                 values can result in systematic biases in dissimilarity
                 matrices and subsequently introduce both artificial
                 clusters and unrealistic interpretations of associated
                 data domains. We demonstrate the usage of t-distributed
                 stochastic neighborhood embedding to visually guide
                 mitigation of such biases by tuning the missing value
                 substitution cost parameter or determining an optimal
                 sequence span.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Muller:2019:ADQ,
  author =       "Daniel M{\"u}ller and Pratiksha Jain and Yieh-Funk
                 Te",
  title =        "Augmenting Data Quality through High-Precision Gender
                 Categorization",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "2",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3297720",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3297720",
  abstract =     "Mappings of first name to gender have been widely
                 recognized as a critical tool for the completion,
                 study, and validation of data records in a range of
                 areas. In this study, we investigate how organizations
                 with large databases of existing entities can create
                 their own mappings between first names and gender and
                 how these mappings can be improved and utilized.
                 Therefore, we first explore a dataset with demographic
                 information on more than 4 million people, which was
                 provided by a car insurance company. Then, we study how
                 naming conventions have changed over time and how they
                 differ by nationality. Next, we build a probabilistic
                 first-name-to-gender mapping and augment the mapping by
                 adding nationality and decade of birth to improve the
                 mapping's performance. We test our mapping in two-label
                 and three-label settings and further validate our
                 mapping by categorizing patent filings by gender of the
                 inventor. We compare the results with previous studies'
                 outcomes and find that our mapping produces
                 high-precision results. We validate that the additional
                 information of nationality and year of birth improve
                 the precision scores of name-to-gender mappings.
                 Therefore, the proposed approach constitutes an
                 efficient process for improving the data quality of
                 organizations' records, if the gender attribute is
                 missing or unreliable.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Hassan:2019:ISI,
  author =       "Naeemul Hassan and Chengkai Li and Jun Yang and Cong
                 Yu",
  title =        "Introduction to the Special Issue on Combating Digital
                 Misinformation and Disinformation",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "3",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3321484",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321484",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zannettou:2019:WFI,
  author =       "Savvas Zannettou and Michael Sirivianos and Jeremy
                 Blackburn and Nicolas Kourtellis",
  title =        "The {Web} of False Information: Rumors, Fake News,
                 Hoaxes, Clickbait, and Various Other Shenanigans",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "3",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309699",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309699",
  abstract =     "A new era of Information Warfare has arrived. Various
                 actors, including state-sponsored ones, are weaponizing
                 information on Online Social Networks to run
                 false-information campaigns with targeted manipulation
                 of public opinion on specific topics. These
                 false-information campaigns can have dire consequences
                 to the public: mutating their opinions and actions,
                 especially with respect to critical world events like
                 major elections. Evidently, the problem of false
                 information on the Web is a crucial one and needs
                 increased public awareness as well as immediate
                 attention from law enforcement agencies, public
                 institutions, and in particular, the research
                 community. In this article, we make a step in this
                 direction by providing a typology of the Web's
                 false-information ecosystem, composed of various types
                 of false-information, actors, and their motives. We
                 report a comprehensive overview of existing research on
                 the false-information ecosystem by identifying several
                 lines of work: (1) how the public perceives false
                 information; (2) understanding the propagation of false
                 information; (3) detecting and containing false
                 information on the Web; and (4) false information on
                 the political stage. In this work, we pay particular
                 attention to political false information as: (1) it can
                 have dire consequences to the community (e.g., when
                 election results are mutated) and (2) previous work
                 shows that this type of false information propagates
                 faster and further when compared to other types of
                 false information. Finally, for each of these lines of
                 work, we report several future research directions that
                 can help us better understand and mitigate the emerging
                 problem of false-information dissemination on the
                 Web.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Xue:2019:CAT,
  author =       "Hao Xue and Qiaozhi Wang and Bo Luo and Hyunjin Seo
                 and Fengjun Li",
  title =        "Content-Aware Trust Propagation Toward Online Review
                 Spam Detection",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3305258",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3305258",
  abstract =     "With the increasing popularity of online review
                 systems, a large volume of user-generated content
                 becomes available to help people make reasonable
                 judgments about the quality of services and products
                 from unknown providers. However, these platforms are
                 frequently abused since fraudulent information can be
                 freely inserted by potentially malicious users without
                 validation. Consequently, online review systems become
                 targets of individual and professional spammers, who
                 insert deceptive reviews by manipulating the rating
                 and/or the content of the reviews. In this work, we
                 propose a review spamming detection scheme based on the
                 deviation between the aspect-specific opinions
                 extracted from individual reviews and the aggregated
                 opinions on the corresponding aspects. In particular,
                 we model the influence on the trustworthiness of the
                 user due to his opinion deviations from the majority in
                 the form of a deviation-based penalty, and integrate
                 this penalty into a three-layer trust propagation
                 framework to iteratively compute the trust scores for
                 users, reviews, and review targets, respectively. The
                 trust scores are effective indicators of spammers,
                 since they reflect the overall deviation of a user from
                 the aggregated aspect-specific opinions across all
                 targets and all aspects. Experiments on the dataset
                 collected from Yelp.com show that the proposed
                 detection scheme based on aspect-specific content-aware
                 trust propagation is able to measure users'
                 trustworthiness based on opinions expressed in
                 reviews.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Atanasova:2019:AFC,
  author =       "Pepa Atanasova and Preslav Nakov and Llu{\'\i}s
                 M{\`a}rquez and Alberto Barr{\'o}n-Cede{\~n}o and
                 Georgi Karadzhov and Tsvetomila Mihaylova and Mitra
                 Mohtarami and James Glass",
  title =        "Automatic Fact-Checking Using Context and Discourse
                 Information",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3297722",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3297722",
  abstract =     "We study the problem of automatic fact-checking,
                 paying special attention to the impact of contextual
                 and discourse information. We address two related
                  tasks: (i) detecting check-worthy claims and (ii)
                  fact-checking claims. We develop supervised
                 systems based on neural networks, kernel-based support
                 vector machines, and combinations thereof, which make
                 use of rich input representations in terms of discourse
                 cues and contextual features. For the check-worthiness
                 estimation task, we focus on political debates, and we
                 model the target claim in the context of the full
                 intervention of a participant and the previous and
                 following turns in the debate, taking into account
                 contextual meta information. For the fact-checking
                 task, we focus on answer verification in a community
                 forum, and we model the veracity of the answer with
                 respect to the entire question-answer thread in which
                 it occurs as well as with respect to other related
                 posts from the entire forum. We develop annotated
                 datasets for both tasks and we run extensive
                 experimental evaluation, confirming that both types of
                  information---but especially contextual features---play an
                 important role.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lin:2019:DPF,
  author =       "Peng Lin and Qi Song and Yinghui Wu and Jiaxing Pi",
  title =        "Discovering Patterns for Fact Checking in Knowledge
                 Graphs",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3286488",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3286488",
  abstract =     "This article presents a new framework that
                 incorporates graph patterns to support fact checking in
                 knowledge graphs. Our method discovers discriminant
                 graph patterns to construct classifiers for fact
                 prediction. First, we propose a class of graph fact
                 checking rules (GFCs). A GFC incorporates graph
                 patterns that best distinguish true and false facts of
                 generalized fact statements. We provide statistical
                 measures to characterize useful patterns that are both
                 discriminant and diversified. Second, we show that it
                 is feasible to discover GFCs in large graphs with
                 optimality guarantees. We develop an algorithm that
                 performs localized search to generate a stream of graph
                 patterns, and dynamically assemble the best GFCs from
                 multiple GFC sets, where each set ensures quality
                 scores within certain ranges. The algorithm guarantees
                 a $ (1 / 2 - \epsilon) $ approximation when it (early)
                 terminates. We also develop a space-efficient
                 alternative that dynamically spawns prioritized
                 patterns with best marginal gains to the verified GFCs.
                 It guarantees a $ (1 - 1 / e) $ approximation. Both
                 strategies guarantee a bounded time cost independent of
                 the size of the underlying graph. Third, to support
                 fact checking, we develop two classifiers, which make
                 use of top-ranked GFCs as predictive rules or
                 instance-level features of the pattern matches induced
                 by GFCs, respectively. Using real-world data, we
                 experimentally verify the efficiency and the
                 effectiveness of GFC-based techniques for fact checking
                 in knowledge graphs and verify its application in
                 knowledge exploration and news prediction.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Borges:2019:CSF,
  author =       "Lu{\'\i}s Borges and Bruno Martins and P{\'a}vel
                 Calado",
  title =        "Combining Similarity Features and Deep Representation
                 Learning for Stance Detection in the Context of
                 Checking Fake News",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "3",
  pages =        "14:1--14:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3287763",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3287763",
  abstract =     "Fake news is nowadays an issue of pressing concern,
                 given its recent rise as a potential threat to
                 high-quality journalism and well-informed public
                 discourse. The Fake News Challenge (FNC-1) was
                 organized in early 2017 to encourage the development of
                 machine-learning-based classification systems for
                 stance detection (i.e., for identifying whether a
                 particular news article agrees, disagrees, discusses,
                 or is unrelated to a particular news headline), thus
                 helping in the detection and analysis of possible
                 instances of fake news. This article presents a novel
                 approach to tackle this stance detection problem, based
                 on the combination of string similarity features with a
                 deep neural network architecture that leverages ideas
                 previously advanced in the context of
                 learning-efficient text representations, document
                 classification, and natural language inference.
                 Specifically, we use bi-directional Recurrent Neural
                 Networks (RNNs), together with max-pooling over the
                 temporal/sequential dimension and neural attention, for
                 representing (i) the headline, (ii) the first two
                 sentences of the news article, and (iii) the entire
                 news article. These representations are then
                 combined/compared, complemented with similarity
                 features inspired on other FNC-1 approaches, and passed
                 to a final layer that predicts the stance of the
                 article toward the headline. We also explore the use of
                 external sources of information, specifically large
                 datasets of sentence pairs originally proposed for
                 training and evaluating natural language inference
                 methods to pre-train specific components of the neural
                 network architecture (e.g., the RNNs used for encoding
                 sentences). The obtained results attest to the
                 effectiveness of the proposed ideas and show that our
                 model, particularly when considering pre-training and
                 the combination of neural representations together with
                 similarity features, slightly outperforms the previous
                 state of the art.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Abiteboul:2019:TFD,
  author =       "Serge Abiteboul and Julia Stoyanovich",
  title =        "Transparency, Fairness, Data Protection, Neutrality:
                 Data Management Challenges in the Face of New
                 Regulation",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "3",
  pages =        "15:1--15:??",
  month =        jul,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3310231",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3310231",
  abstract =     "The data revolution continues to transform every
                 sector of science, industry, and government. Due to the
                 incredible impact of data-driven technology on society,
                 we are becoming increasingly aware of the imperative to
                  use data and algorithms responsibly---in accordance with
                 laws and ethical norms. In this article, we discuss
                 three recent regulatory frameworks: the European
                 Union's General Data Protection Regulation (GDPR), the
                 New York City Automated Decisions Systems (ADS) Law,
                 and the Net Neutrality principle, which aim to protect
                 the rights of individuals who are impacted by data
                 collection and analysis. These frameworks are prominent
                 examples of a global trend: Governments are starting to
                 recognize the need to regulate data-driven algorithmic
                 technology. Our goal in this article is to bring these
                 regulatory frameworks to the attention of the data
                 management community and to underscore the technical
                 challenges they raise and that we, as a community, are
                 well-equipped to address. The main takeaway of this
                 article is that legal and ethical norms cannot be
                 incorporated into data-driven systems as an
                 afterthought. Rather, we must think in terms of
                 responsibility by design, viewing it as a systems
                 requirement.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bertino:2019:DTB,
  author =       "Elisa Bertino and Ashish Kundu and Zehra Sura",
  title =        "Data Transparency with Blockchain and {AI} Ethics",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "4",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3312750",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3312750",
  abstract =     "Providing a 360${}^\circ $ view of a given data item
                 especially for sensitive data is essential toward not
                 only protecting the data and associated privacy but
                 also assuring trust, compliance, and ethics of the
                 systems that use or manage such data. With the advent
                 of General Data Protection Regulation, California Data
                 Privacy Law, and other such regulatory requirements, it
                 is essential to support data transparency in all such
                 dimensions. Moreover, data transparency should not
                 violate privacy and security requirements. In this
                 article, we put forward a vision for how data
                 transparency would be achieved in a de-centralized
                 fashion using blockchain technology.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fard:2019:ARA,
  author =       "Amir Ebrahimi Fard and Scott Cunningham",
  title =        "Assessing the Readiness of Academia in the Topic of
                 False and Unverified Information",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "4",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3313788",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3313788",
  abstract =     "The spread of false and unverified information has the
                 potential to inflict damage by harming the reputation
                 of individuals or organisations, shaking financial
                 markets, and influencing crowd decisions in important
                 events. This phenomenon needs to be properly curbed,
                 otherwise it can contaminate other aspects of our
                 social life. In this regard, academia as a key
                 institution against false and unverified information is
                 expected to play a pivotal role. Despite a great deal
                 of research in this arena, the amount of progress by
                 academia is not clear yet. This can lead to
                 misjudgements about the performance of the topic of
                 interest that can ultimately result in wrong science
                 policies regarding academic efforts for quelling false
                 and unverified information. In this research, we
                 address this issue by assessing the readiness of
                 academia in the topic of false and unverified
                 information. To this end, we adopt the emergence
                 framework and measure its dimensions (novelty, growth,
                 coherence, and impact) over more than 21,000 articles
                 published by academia about false and unverified
                 information. Our results show the current body of
                 research has had organic growth so far, which is not
                 promising enough for confronting the problem of false
                 and unverified information. To tackle this problem, we
                 suggest an external push strategy that, compared to the
                 early stages of the topic of interest, reinforces the
                 emergence dimensions and leads to a higher level in
                 every dimension.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Babcock:2019:DFF,
  author =       "Matthew Babcock and David M. Beskow and Kathleen M.
                 Carley",
  title =        "Different Faces of False: The Spread and Curtailment
                 of False Information in the {Black Panther Twitter}
                 Discussion",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "4",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3339468",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3339468",
  abstract =     "The task of combating false information online appears
                 daunting, in part due to a public focus on how quickly
                 it can spread and the clamor for automated
                 platform-based interventions. While such concerns can
                 be warranted, threat analysis and intervention design
                 both benefit from a fuller understanding of different
                 types of false information and of the community
                 responses to them. Here, we present a study of the most
                  tweeted about movie ever (Black Panther) in which the
                 spread of false information of four different types is
                 compared to the ad hoc Twitter community response. We
                 find that (1) false information tweets played a small
                 part in the overall conversation, (2) community-based
                 debunking and shaming responses to false posts about
                 attacks at theaters overwhelmed such posts by orders of
                 magnitude, (3) as another form of community response,
                 one type of false narrative (Satire) was used to attack
                 another (Fake Attacks), and (4) the four types of
                 false-information tweets differed in the use of
                 hashtags and in the role played by originating users
                 and responding users. Overall, this work helps to
                 illustrate the importance of investigating
                 ``on-the-ground'' community responses to fake news and
                 other types of digital false information and to inform
                 identification and intervention design and
                 implementation.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bosu:2019:EQB,
  author =       "Michael F. Bosu and Stephen G. MacDonell",
  title =        "Experience: Quality Benchmarking of Datasets Used in
                 Software Effort Estimation",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "4",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3328746",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3328746",
  abstract =     "Data is a cornerstone of empirical software
                 engineering (ESE) research and practice. Data underpin
                 numerous process and project management activities,
                 including the estimation of development effort and the
                 prediction of the likely location and severity of
                 defects in code. Serious questions have been raised,
                 however, over the quality of the data used in ESE. Data
                 quality problems caused by noise, outliers, and
                 incompleteness have been noted as being especially
                 prevalent. Other quality issues, although also
                 potentially important, have received less attention. In
                 this study, we assess the quality of 13 datasets that
                 have been used extensively in research on software
                 effort estimation. The quality issues considered in
                 this article draw on a taxonomy that we published
                 previously based on a systematic mapping of data
                 quality issues in ESE. Our contributions are as
                 follows: (1) an evaluation of the ``fitness for
                 purpose'' of these commonly used datasets and (2) an
                 assessment of the utility of the taxonomy in terms of
                 dataset benchmarking. We also propose a template that
                 could be used to both improve the ESE data
                 collection/submission process and to evaluate other
                 such datasets, contributing to enhanced awareness of
                 data quality issues in the ESE community and, in time,
                 the availability and use of higher-quality datasets.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ding:2019:CSA,
  author =       "Junhua Ding and Xinchuan Li and Xiaojun Kang and
                 Venkat N. Gudivada",
  title =        "A Case Study of the Augmentation and Evaluation of
                 Training Data for Deep Learning",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "4",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3317573",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3317573",
  abstract =     "Deep learning has been widely used for extracting
                 values from big data. As many other machine learning
                 algorithms, deep learning requires significant training
                 data. Experiments have shown both the volume and the
                 quality of training data can significantly impact the
                 effectiveness of the value extraction. In some cases,
                 the volume of training data is not sufficiently large
                 for effectively training a deep learning model. In
                 other cases, the quality of training data is not high
                 enough to achieve the optimal performance. Many
                 approaches have been proposed for augmenting training
                 data to mitigate the deficiency. However, whether the
                 augmented data are ``fit for purpose'' of deep learning
                 is still a question. A framework for comprehensively
                 evaluating the effectiveness of the augmented data for
                 deep learning is still not available. In this article,
                 we first discuss a data augmentation approach for deep
                 learning. The approach includes two components: the
                 first one is to remove noisy data in a dataset using a
                 machine learning based classification to improve its
                 quality, and the second one is to increase the volume
                 of the dataset for effectively training a deep learning
                 model. To evaluate the quality of the augmented data in
                 fidelity, variety, and veracity, a data quality
                 evaluation framework is proposed. We demonstrated the
                 effectiveness of the data augmentation approach and the
                 data quality evaluation framework through studying an
                 automated classification of biology cell images using
                 deep learning. The experimental results clearly
                 demonstrated the impact of the volume and quality of
                 training data to the performance of deep learning and
                 the importance of the data quality evaluation. The data
                 augmentation approach and the data quality evaluation
                 framework can be straightforwardly adapted for deep
                 learning study in other domains.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Akhtar:2019:IAV,
  author =       "Zahaib Akhtar and Anh Minh Le and Yun Seong Nam and
                 Jessica Chen and Ramesh Govindan and Ethan Katz-Bassett
                 and Sanjay Rao and Jibin Zhan",
  title =        "Improving Adaptive Video Streaming through Session
                 Classification",
  journal =      j-JDIQ,
  volume =       "11",
  number =       "4",
  pages =        "21:1--21:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309682",
  ISSN =         "1936-1955",
  bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309682",
  abstract =     "With internet video gaining increasing popularity and
                 soaring to dominate network traffic, extensive studies
                 are being carried out on how to achieve higher Quality
                 of Experience (QoE) with the delivery of video content.
                 Associated with the chunk-based streaming protocol,
                 Adaptive Bitrate (ABR) algorithms have recently emerged
                 to cope with the diverse and fluctuating network
                 conditions by dynamically adjusting bitrates for future
                 chunks. This inevitably involves predicting the future
                 throughput of a video session. Some of the session
                 features like Internet Service Provider (ISP),
                 geographical location, and so on, could affect network
                 conditions and contain helpful information for this
                 throughput prediction. In this article, we consider how
                 our knowledge about the session features can be
                 utilized to improve ABR quality via customized
                 parameter settings. We present our ABR-independent,
                 QoE-driven, feature-based partition method to classify
                 the logged video sessions so that different parameter
                 settings could be adopted in different situations to
                 reach better quality. A variation of Decision Tree is
                 developed for the classification and has been applied
                 to a sample ABR for evaluation. The experiment shows
                 that our approach can improve the average bitrate of
                 the sample ABR by 36.1\% without causing the increase
                 of the rebuffering ratio where 99\% of the sessions can
                 get improvement. It can also improve the rebuffering
                 ratio by 87.7\% without causing the decrease of the
                 average bitrate, where, among those sessions involved
                 in rebuffering, 82\% receives improvement and 18\%
                 remains the same.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Milo:2020:GRD,
  author =       "Tova Milo",
  title =        "Getting Rid of Data",
  journal =      j-JDIQ,
  volume =       "12",
  number =       "1",
  pages =        "1--7",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3326920",
  ISSN =         "1936-1955",
  bibdate =      "Thu Jan 23 07:39:46 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3326920",
  abstract =     "We are experiencing an amazing data-centered
                 revolution. Incredible amounts of data are collected,
                 integrated, and analyzed, leading to key breakthroughs
                 in science and society. This well of knowledge,
                 however, is at a great risk if we do not dispense
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Firmani:2020:EDD,
  author =       "Donatella Firmani and Letizia Tanca and Riccardo
                 Torlone",
  title =        "Ethical Dimensions for Data Quality",
  journal =      j-JDIQ,
  volume =       "12",
  number =       "1",
  pages =        "1--5",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3362121",
  ISSN =         "1936-1955",
  bibdate =      "Thu Jan 23 07:39:46 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3362121",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Draisbach:2020:TPD,
  author =       "Uwe Draisbach and Peter Christen and Felix Naumann",
  title =        "Transforming Pairwise Duplicates to Entity Clusters
                 for High-quality Duplicate Detection",
  journal =      j-JDIQ,
  volume =       "12",
  number =       "1",
  pages =        "1--30",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3352591",
  ISSN =         "1936-1955",
  bibdate =      "Thu Jan 23 07:39:46 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3352591",
  abstract =     "Duplicate detection algorithms produce clusters of
                 database records, each cluster representing a single
                 real-world entity. As most of these algorithms use
                 pairwise comparisons, the resulting (transitive)
                 clusters can be inconsistent: Not all records
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Shakeel:2020:ASQ,
  author =       "Yusra Shakeel and Jacob Kr{\"u}ger and Ivonne von
                 Nostitz-Wallwitz and Gunter Saake and Thomas Leich",
  title =        "Automated Selection and Quality Assessment of Primary
                  Studies: A Systematic Literature Review",
  journal =      j-JDIQ,
  volume =       "12",
  number =       "1",
  pages =        "1--26",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3356901",
  ISSN =         "1936-1955",
  bibdate =      "Thu Jan 23 07:39:46 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3356901",
  abstract =     "Researchers use systematic literature reviews
                 (SLRs) to synthesize existing evidence regarding a
                 research topic. While being an important means to
                 condense knowledge, conducting an SLR requires a large
                 amount of time and effort. Consequently, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "https://dl.acm.org/loi/jdiq",
}

%%% JDIQ 12(1), article 5 (January 2020), doi:10.1145/3349536.
%%% Entry checked: author names parse correctly (surnames Siagian,
%%% Aritsugi), {$N$}-gram math is properly braced against style
%%% recasing, and page/article-number fields follow this file's
%%% ACM-journal convention (pages = extent, articleno = number).
@Article{Siagian:2020:RWC,
  author =       "Al Hafiz Akbar Maulana Siagian and Masayoshi
                 Aritsugi",
  title =        "Robustness of Word and Character {$N$}-gram
                 Combinations in Detecting Deceptive and Truthful
                 Opinions",
  journal =      j-JDIQ,
  volume =       "12",
  number =       "1",
  pages =        "1--24",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3349536",
  ISSN =         "1936-1955",
  bibdate =      "Thu Jan 23 07:39:46 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3349536",
  abstract =     "Opinions in reviews about the quality of products or
                 services can be important information for readers.
                 Unfortunately, such opinions may include deceptive ones
                 posted for some business reasons. To keep the opinions
                 as a valuable and trusted source of \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Aswani:2020:EMM,
  author =       "Reema Aswani and Arpan Kumar Kar and P. Vigneswara
                 Ilavarasan",
  title =        "Experience: Managing Misinformation in Social
                 Media---Insights for Policymakers from {Twitter}
                 Analytics",
  journal =      j-JDIQ,
  volume =       "12",
  number =       "1",
  pages =        "1--18",
  month =        jan,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3341107",
  ISSN =         "1936-1955",
  bibdate =      "Thu Jan 23 07:39:46 MST 2020",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3341107",
  abstract =     "Governance of misinformation is a serious concern in
                 social media platforms. Based on experiences gathered
                 from different case studies, we offer insights for the
                 policymakers on managing misinformation in social
                 media. These platforms are widely used \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "https://dl.acm.org/loi/jdiq",
}