%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.21",
%%%     date            = "23 January 2020",
%%%     time            = "07:45:58 MST",
%%%     filename        = "jdiq.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "45241 7052 37648 356077",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "Journal of Data and Information Quality
%%%                       (JDIQ); bibliography",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        the ACM Journal of Data and Information
%%%                        Quality (JDIQ) (CODEN ????, ISSN 1936-1955),
%%%                        covering all journal issues from 2009 --
%%%                        date.
%%%
%%%                        At version 1.21, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2009 (  17)    2013 (   8)    2017 (  17)
%%%                             2010 (   6)    2014 (  11)    2018 (  34)
%%%                             2011 (   8)    2015 (  22)    2019 (  21)
%%%                             2012 (  15)    2016 (  14)    2020 (   6)
%%%
%%%                             Article:        179
%%%
%%%                             Total entries:  179
%%%
%%%
%%%                            http://www.acm.org/jdiq/
%%%                            http://portal.acm.org/browse_dl.cfm?idx=J1191
%%%                            https://dl.acm.org/loi/jdiq
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume''.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility."
%%%     }
%%% ====================================================================
@Preamble{
    "\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-JDIQ                  = "Journal of Data and Information
                                  Quality (JDIQ)"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Madnick:2009:EII,
author =       "Stuart E. Madnick and Yang W. Lee",
title =        "Editorial for the Inaugural Issue of the {ACM Journal
of Data and Information Quality (JDIQ)}",
journal =      j-JDIQ,
volume =       "1",
number =       "1",
pages =        "1:1--1:??",
month =        jun,
year =         "2009",
CODEN =        "????",
ISSN =         "1936-1955",
bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Madnick:2009:OFD,
author =       "Stuart E. Madnick and Richard Y. Wang and Yang W. Lee
and Hongwei Zhu",
title =        "Overview and Framework for Data and Information
Quality Research",
journal =      j-JDIQ,
volume =       "1",
number =       "1",
pages =        "2:1--2:??",
month =        jun,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1515693.1516680",
ISSN =         "1936-1955",
bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Awareness of data and information quality issues has
grown rapidly in light of the critical role played by
the quality of information in our data-intensive,
knowledge-based economy. Research in the past two
decades has produced a large body of data quality
knowledge and has expanded our ability to solve many
we present an overview of the evolution and current
landscape of data and information quality research. We
introduce a framework to characterize the research
along two dimensions: topics and methods.
Representative papers are cited for purposes of
illustrating the issues addressed and the methods used.
We also identify and discuss challenges to be addressed
in future research.",
remark =       "NOTE(review): abstract text appears truncated between
`solve many' and `we present'; verify against the
publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Li:2009:BAE,
  author =       "Xiao-Bai Li",
  title =        "A {Bayesian} Approach for Estimating and Replacing
                 Missing Categorical Data",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515695",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "We propose a new approach for estimating and replacing
                 missing categorical data. With this approach, the
                 posterior probabilities of a missing attribute value
                 belonging to a certain category are estimated using the
                 simple Bayes method. Two alternative methods for
                 replacing the missing value are proposed: The first
                 replaces the missing value with the value having the
                 estimated maximum probability; the second uses a value
                 that is selected with probability proportional to the
                 estimated posterior distribution. The effectiveness of
                 the proposed approach is evaluated based on some
                 important data quality measures for data warehousing
                 and data mining. The results of the experimental study
                 demonstrate the effectiveness of the proposed
                 approach.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Weber:2009:OSD,
  author =       "Kristin Weber and Boris Otto and Hubert {\"O}sterle",
  title =        "One Size Does Not Fit All---{A} Contingency Approach
                 to Data Governance",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1515693.1515696",
  ISSN =         "1936-1955",
  bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Enterprizes need Data Quality Management (DQM) to
                 respond to strategic and operational challenges
                 demanding high-quality corporate data. Hitherto,
                 companies have mostly assigned accountabilities for DQM
                 to Information Technology (IT) departments. They have
                 thereby neglected the organizational issues critical to
                 successful DQM. With data governance, however,
                 companies may implement corporate-wide accountabilities
                 for DQM that encompass professionals from business and
                 IT departments. This research aims at starting a
                 scientific discussion on data governance by
                 transferring concepts from IT governance and
                 organizational theory to the previously largely ignored
                 field of data governance. The article presents the
                 first results of a community action research project on
                 data governance comprising six international companies
                 from various industries. It outlines a data governance
                 model that consists of three components (data quality
                 roles, decision areas, and responsibilities), which
                 together form a responsibility assignment matrix. The
                 data governance model documents data quality roles and
                 their type of interaction with DQM activities. In
                 addition, the article describes a data governance
                 contingency model and demonstrates the influence of
                 organization structure, competitive strategy, degree of
                 process harmonization, degree of market regulation, and
                 decision-making style on data governance. Based on
                 these findings, companies can structure their specific
                 data governance model.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Heinrich:2009:PDM,
author =       "B. Heinrich and M. Klier and M. Kaiser",
title =        "A Procedure to Develop Metrics for Currency and its
Application in {CRM}",
journal =      j-JDIQ,
volume =       "1",
number =       "1",
pages =        "5:1--5:??",
month =        jun,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1515693.1515697",
ISSN =         "1936-1955",
bibdate =      "Fri Sep 18 15:11:35 MDT 2009",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Due to the importance of using up-to-date data in
data-quality dimension currency can be quantified.
Based on several requirements (e.g., normalization and
interpretability) and a literature review, we design a
procedure to develop probability-based metrics for
currency which can be adjusted to the specific
characteristics of data attribute values. We evaluate
the presented procedure with regard to the requirements
and illustrate the applicability as well as its
practical benefit. In cooperation with a major German
mobile services provider, the procedure was applied in
the field of campaign management in order to improve
both success rates and profits.",
remark =       "NOTE(review): abstract text appears truncated between
`up-to-date data in' and `data-quality dimension';
verify against the publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Madnick:2009:ELS,
author =       "Stuart E. Madnick and Yang W. Lee",
title =        "Editorial Letter for the Special Issue on Data Quality
in Databases and Information Systems",
journal =      j-JDIQ,
volume =       "1",
number =       "2",
pages =        "6:1--6:??",
month =        sep,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1577840.1577841",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Naumann:2009:GES,
  author =       "Felix Naumann and Louiqa Raschid",
  title =        "Guest Editorial for the Special Issue on Data Quality
                 in Databases",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "2",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1577840.1577842",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Dash:2009:MLN,
author =       "Manoranjan Dash and Ayush Singhania",
title =        "Mining in Large Noisy Domains",
journal =      j-JDIQ,
volume =       "1",
number =       "2",
pages =        "8:1--8:??",
month =        sep,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1577840.1577843",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "efficiently in large and noisy data. We propose an
efficient sampling algorithm ({\em Concise\/}) as a
solution for large and noisy data. Concise is far more
superior than the Simple Random Sampling ({\em SRS\/})
in selecting a representative sample. Particularly when
the data is very large and noisy, Concise achieves the
maximum gain over SRS. The comparison is in terms of
their impact on subsequent data mining tasks,
specifically, classification, clustering, and
association rule mining. We compared Concise with a few
existing noise removal algorithms followed by SRS.
Although the accuracy of mining results are similar,
Concise spends very little time compared to the
existing algorithms because Concise has linear time
complexity.",
remark =       "NOTE(review): the `abstract' field opener was missing in
the source and has been restored; the opening words of
the abstract itself appear to be lost --- verify
against the publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "association rule mining; classification; clustering;
data mining; Information filtering; sampling; selection
process",
}

@Article{Moustakides:2009:OSR,
author =       "George V. Moustakides and Vassilios S. Verykios",
title =        "Optimal Stopping: a Record-Linkage Approach",
journal =      j-JDIQ,
volume =       "1",
number =       "2",
pages =        "9:1--9:??",
month =        sep,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1577840.1577844",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Record-linkage is the process of identifying whether
two separate records refer to the same real-world
entity when some elements of the record's identifying
information (attributes) agree and others disagree.
Existing record-linkage decision methodologies use the
outcomes from the comparisons of the whole set of
attributes. Here, we propose an alternative scheme that
assesses the attributes sequentially, allowing for a
decision to made at any attribute's comparison stage,
and thus before exhausting all available attributes.
The scheme we develop is optimum in that it minimizes a
well-defined average cost criterion while the
corresponding optimum solution can be easily mapped
into a decision tree to facilitate the record-linkage
decision process. Experimental results performed in
real datasets indicate the superiority of our
methodology compared to existing approaches.",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "duplicate detection; optimal stopping",
remark =       "NOTE(review): the `keywords' string was unterminated in
the source (a runaway string that would corrupt parsing
of subsequent entries); it has been closed, and any
trailing keywords lost to truncation should be verified
against the publisher's record.",
}

@Article{Klein:2009:RDQ,
author =       "A. Klein and W. Lehner",
title =        "Representing Data Quality in Sensor Data Streaming
Environments",
journal =      j-JDIQ,
volume =       "1",
number =       "2",
pages =        "10:1--10:??",
month =        sep,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1577840.1577845",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Sensors in smart-item environments capture data about
product conditions and usage to support business
decisions as well as production automation processes. A
challenging issue in this application area is the
restricted quality of sensor data due to limited sensor
precision and sensor failures. Moreover, data stream
processing to meet resource constraints in streaming
environments introduces additional noise and decreases
the data quality. In order to avoid wrong business
decisions due to dirty data, quality characteristics
have to be captured, processed, and provided to the
efficiently provide applications with information about
data quality is still an open research problem.\par

a flexible model for the propagation and processing of
data quality. The comprehensive analysis of common data
stream processing operators and their impact on data
quality allows a fruitful data evaluation and
diminishes incorrect business decisions. Further, we
propose the data quality model control to adapt the
data quality granularity to the data stream
interestingness.",
remark =       "NOTE(review): abstract text appears truncated in two
places (after `provided to the' and after the paragraph
break); verify against the publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "data quality; Data stream processing; smart items",
}

@Article{Embury:2009:IDS,
author =       "Suzanne M. Embury and Paolo Missier and Sandra Sampaio
and R. Mark Greenwood and Alun D. Preece",
title =        "Incorporating Domain-Specific Information Quality
Constraints into Database Queries",
journal =      j-JDIQ,
volume =       "1",
number =       "2",
pages =        "11:1--11:??",
month =        sep,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1577840.1577846",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "The range of information now available in queryable
repositories opens up a host of possibilities for new
and valuable forms of data analysis. Database query
languages such as SQL and XQuery offer a concise and
high-level means by which such analyses can be
implemented, facilitating the extraction of relevant
data subsets into either generic or bespoke data
analysis environments. Unfortunately, the quality of
data in these repositories is often highly variable.
The data is still useful, but only if the consumer is
aware of the data quality problems and can work around
them. Standard query languages offer little support for
this aspect of data management. In principle, however,
it should be possible to embed constraints describing
the consumer's data quality requirements into the query
directly, so that the query evaluator can take over
responsibility for enforcing them during query
processing.\par

Most previous attempts to incorporate information
quality constraints into database queries have been
based around a small number of highly generic quality
measures, which are defined and computed by the
information provider. This is a useful approach in some
application areas but, in practice, quality criteria
are more commonly determined by the user of the
explore an approach to incorporating quality
constraints into database queries where the definition
of quality is set by the user and not the provider of
the information. Our approach is based around the
concept of a {\em quality view}, a configurable quality
assessment component into which domain-specific notions
of quality can be embedded. We examine how quality
views can be incorporated into XQuery, and draw from
this the language features that are required in general
to embed quality views into any query language. We also
propose some syntactic sugar on top of XQuery to
simplify the process of querying with quality
constraints.",
remark =       "NOTE(review): abstract text appears truncated between
`user of the' and `explore an approach'; verify against
the publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "database query languages; Information quality; views;
XQuery",
}

@Article{Madnick:2009:CPS,
author =       "Stuart E. Madnick and Yang W. Lee",
title =        "Call for Papers Special Issue on Healthcare
Information Quality: the Challenges and Opportunities
in Healthcare Systems and Services",
journal =      j-JDIQ,
volume =       "1",
number =       "2",
pages =        "12:1--12:??",
month =        sep,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1577840.1577847",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:40 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Madnick:2009:ECW,
author =       "Stuart E. Madnick and Yang W. Lee",
title =        "Editors' Comments: Where the {JDIQ} Articles Come
From: Incubating Research in an Emerging Field",
journal =      j-JDIQ,
volume =       "1",
number =       "3",
pages =        "13:1--13:??",
month =        dec,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1659225.1659226",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sessions:2009:TMD,
  author =       "V. Sessions and M. Valtorta",
  title =        "Towards a Method for Data Accuracy Assessment
                 Utilizing a {Bayesian} Network Learning Algorithm",
  journal =      j-JDIQ,
  volume =       "1",
  number =       "3",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/1659225.1659227",
  ISSN =         "1936-1955",
  bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "This research develops a data quality algorithm
                 entitled the Accuracy Assessment Algorithm (AAA). This
                 is an extension of research in developing an
                 enhancement to a Bayesian Network (BN) learning
                 algorithm called the Data Quality (DQ) algorithm. This
                 new algorithm is concerned with estimating the accuracy
                 levels of a dataset by assessing the quality of the
                 data with no prior knowledge of the dataset. The AAA
                 and associated metrics were tested using two canonical
                 BNs and one large-scale medical network. The article
                 presents the results regarding the efficacy of the
                 algorithm and the implications for future research and
                 practice.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
  keywords =     "accuracy levels; Bayesian networks; data quality
                 assessment; PC algorithm",
}

@Article{Even:2009:DAD,
author =       "Adir Even and G. Shankaranarayanan",
title =        "Dual Assessment of Data Quality in Customer
Databases",
journal =      j-JDIQ,
volume =       "1",
number =       "3",
pages =        "15:1--15:??",
month =        dec,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1659225.1659228",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Quantitative assessment of data quality is critical
for identifying the presence of data defects and the
extent of the damage due to these defects. Quantitative
assessment can help define realistic quality
improvement targets, track progress, evaluate the
impacts of different solutions, and prioritize
improvement efforts accordingly. This study describes a
methodology for quantitatively assessing both impartial
{\em and\/} contextual data quality in large datasets.
Impartial assessment measures the extent to which a
dataset is defective, independent of the context in
which that dataset is used. Contextual assessment, as
defined in this study, measures the extent to which the
presence of defects reduces a dataset's utility, the
benefits gained by using that dataset in a specific
context. The dual assessment methodology is
demonstrated in the context of Customer Relationship
Management (CRM), using large data samples from
real-world datasets. The results from comparing the two
assessments offer important insights for directing
quality maintenance efforts and prioritizing quality
improvement solutions for this dataset. The study
describes the steps and the computation involved in the
dual-assessment methodology and discusses the
implications for applying the methodology in other",
remark =       "NOTE(review): the `abstract' string was unterminated in
the source and has been closed; its final words appear
to be lost --- verify against the publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "15",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "CRM; customer relationship management; databases; Data
quality; information value; total data quality
management",
}

@Article{Fisher:2009:AMP,
author =       "Craig W. Fisher and Eitel J. M. Lauria and Carolyn C.
Matheus",
title =        "An Accuracy Metric: Percentages, Randomness, and
Probabilities",
journal =      j-JDIQ,
volume =       "1",
number =       "3",
pages =        "16:1--16:??",
month =        dec,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1659225.1659229",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Practitioners and researchers regularly refer to error
rates or accuracy percentages of databases. The former
is the number of cells in error divided by the total
number of cells; the latter is the number of correct
cells divided by the total number of cells. However,
databases may have similar error rates (or accuracy
percentages) but differ drastically in the complexity
of their accuracy problems. A simple percent does not
provide information as to whether the errors are
systematic or randomly distributed throughout the
database. We expand the accuracy metric to include a
randomness measure and include a probability
distribution value. The proposed randomness check is
based on the Lempel--Ziv (LZ) complexity measure.
Through two simulation studies we show that the LZ
complexity measure can clearly differentiate as to
whether the errors are random or systematic. This
determination is a significant first step and is a
major departure from the percentage-alone technique.
Once it is determined that the errors are random, a
probability distribution, Poisson, is used to help",
remark =       "NOTE(review): the `abstract' string was unterminated in
the source and has been closed; its final words appear
to be lost --- verify against the publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "16",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "complexity; Data and information quality; randomness",
}

@Article{Ababneh:2009:CSE,
author =       "Sufyan Ababneh and Rashid Ansari and Ashfaq Khokhar",
title =        "Compensated Signature Embedding for Multimedia Content
Authentication",
journal =      j-JDIQ,
volume =       "1",
number =       "3",
pages =        "17:1--17:??",
month =        dec,
year =         "2009",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1659225.1659230",
ISSN =         "1936-1955",
bibdate =      "Wed Mar 17 14:47:55 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "One of the main goals of digital content
authentication and preservation techniques is to
guarantee the originality and quality of the
used to embed content-based fragile signatures in
multimedia signals to achieve efficient authentication
without requiring any third-party reference or side
information. To overcome the signature alteration
caused by the embedding perturbation and other possible
encoding operations, a closed-form compensation
technique is proposed for ensuring signature
consistency by employing a Lagrangian-based approach. A
minimum distortion criterion is used to ensure signal
quality. The effectiveness of the proposed approach is
investigated with simulations of examples of image
authentication in which signatures are designed to
reveal tamper localization. Results using quantitative
performance criteria show successful authentication
over a range of robustness in embedding watermarks
using both QIM-DM and spread-spectrum techniques. A
comparison with two iterative compensation schemes is
also presented.",
remark =       "NOTE(review): abstract text appears truncated between
`quality of the' and `used to embed'; verify against
the publisher's record.",
acknowledgement = ack-nhfb,
articleno =    "17",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "compensated signature embedding; Content
authentication; watermarking",
}

@Article{Madnick:2010:ECA,
author =       "Stuart E. Madnick and Yang W. Lee",
title =        "{Editors}' Comments: {ACM Journal of Data and
Information Quality (JDIQ)} is alive and well!",
journal =      j-JDIQ,
volume =       "2",
number =       "1",
pages =        "1:1--1:??",
month =        jul,
year =         "2010",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1805286.1805287",
ISSN =         "1936-1955",
bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Tremblay:2010:UDM,
author =       "Monica Chiarini Tremblay and Kaushik Dutta and Debra
Vandermeer",
title =        "Using Data Mining Techniques to Discover Bias Patterns
in Missing Data",
journal =      j-JDIQ,
volume =       "2",
number =       "1",
pages =        "2:1--2:??",
month =        jul,
year =         "2010",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1805286.1805288",
ISSN =         "1936-1955",
bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In today's data-rich environment, decision makers draw
conclusions from data repositories that may contain
data quality problems. In this context, missing data is
an important and known problem, since it can seriously
affect the accuracy of conclusions drawn. Researchers
have described several approaches for dealing with
missing data, primarily attempting to infer values or
estimate the impact of missing data on conclusions.
However, few have considered approaches to characterize
patterns of bias in missing data, that is, to determine
the specific attributes that predict the missingness of
data values. Knowledge of the specific systematic bias
patterns in the incidence of missing data can help
analysts more accurately assess the quality of
conclusions drawn from data sets with missing data.
This research proposes a methodology to combine a
number of Knowledge Discovery and Data Mining
techniques, including association rule mining, to
discover patterns in related attribute values that help
characterize these bias patterns. We demonstrate the
efficacy of our proposed approach by applying it on a
demo census dataset seeded with biased missing data.
The experimental results show that our approach was
able to find seeded biases and filter out most seeded
noise.",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "Data quality; missing data; pattern discovery",
}

@Article{Jensen:2010:JCI,
author =       "Matthew L. Jensen and Judee K. Burgoon and Jay F.
{Nunamaker, Jr.}",
title =        "Judging the Credibility of Information Gathered from
Face-to-Face Interactions",
journal =      j-JDIQ,
volume =       "2",
number =       "1",
pages =        "3:1--3:??",
month =        jul,
year =         "2010",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1805286.1805289",
ISSN =         "1936-1955",
bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "One of the most pernicious threats to information
quality comes through perpetration of deception by
information suppliers. Deception undermines many
critical dimensions of information quality, such as
accuracy, completeness, and believability. Despite this
threat, information gatherers are ill equipped to
assess the credibility of information suppliers. This
work presents a prototype system that examines messages
gathered during direct, face-to-face information
gathering. The system unobtrusively identifies kinesic
and linguistic features that may indicate deception in
information suppliers' messages. System use was found
to significantly improve assessment ability in
between-subjects and within-subjects tests. The
improved ability to accurately assess credibility
during face-to-face interactions should yield higher
information quality.",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "Credibility assessment; deception detection;
decision-aids; human-computer interaction; information
veracity; kinesics; linguistics",
}

@Article{Meda:2010:DDF,
author =       "Hema S. Meda and Anup Kumar Sen and Amitava Bagchi",
title =        "On Detecting Data Flow Errors in Workflows",
journal =      j-JDIQ,
volume =       "2",
number =       "1",
pages =        "4:1--4:??",
month =        jul,
year =         "2010",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1805286.1805290",
ISSN =         "1936-1955",
bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "When designing a business workflow, it is customary
practice to create the control flow structure first and
to ensure its correctness. Information about the flow
of data is introduced subsequently into the workflow
and its correctness is independently verified. Improper
specification of data requirements of tasks and XOR
splits can cause problems such as wrong branching at
XOR splits and the failure of tasks to execute. Here we
present a graph traversal algorithm called GTforDF for
detecting data flow errors in both nested and
unstructured workflows, and illustrate its operation on
realistic examples. Two of these have interconnected
loops and are free of control flow errors, and the
third one is an unstructured loop-free workflow. Our
approach extends and generalizes data flow verification
methods that have been recently proposed. It also makes
use of the concept of corresponding pairs lately
introduced in control flow verification. It thus has
the potential for development into a unified
algorithmic procedure for the concurrent detection of
control flow and data flow errors. The correctness of
the algorithm has been proved theoretically. It has
also been tested experimentally on many examples.",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "Corresponding pair; Data flow errors; Workflow
management",
}

@Article{Magnani:2010:SUM,
author =       "Matteo Magnani and Danilo Montesi",
title =        "A Survey on Uncertainty Management in Data
Integration",
journal =      j-JDIQ,
volume =       "2",
number =       "1",
pages =        "5:1--5:??",
month =        jul,
year =         "2010",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1805286.1805291",
ISSN =         "1936-1955",
bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In the last few years, uncertainty management has come
to be recognized as a fundamental aspect of data
integration. It is now accepted that it may not be
possible to remove uncertainty generated during data
integration processes and that uncertainty in itself
may represent a source of relevant information. Several
issues, such as the aggregation of uncertain mappings
and the querying of uncertain mediated schemata, have
been addressed by applying well-known uncertainty
management theories. However, several problems lie
ahead. This article surveys this highly active
research area; it details existing
works in the light of a homogeneous framework, and
identifies and discusses the leading issues awaiting
solutions.",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
keywords =     "Data integration; uncertainty",
}

@Article{Talburt:2010:CPS,
author =       "John R. Talburt and Stuart E. Madnick and Yang W.
Lee",
title =        "Call for Papers: Special Issue on Entity Resolution",
journal =      j-JDIQ,
volume =       "2",
number =       "1",
pages =        "6:1--6:??",
month =        jul,
year =         "2010",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1805286.1805292",
ISSN =         "1936-1955",
bibdate =      "Tue Sep 7 08:41:54 MDT 2010",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Madnick:2011:ESN,
author =       "Stuart E. Madnick and Yang W. Lee",
title =        "Editorial: In Search of Novel Ideas and Solutions with
a Broader Context of Data Quality in Mind",
journal =      j-JDIQ,
volume =       "2",
number =       "2",
pages =        "7:1--7:??",
month =        feb,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1891879.1891880",
ISSN =         "1936-1955",
bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Blake:2011:EID,
author =       "Roger Blake and Paul Mangiameli",
title =        "The Effects and Interactions of Data Quality and
Problem Complexity on Classification",
journal =      j-JDIQ,
volume =       "2",
number =       "2",
pages =        "8:1--8:??",
month =        feb,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1891879.1891881",
ISSN =         "1936-1955",
bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gelman:2011:GGA,
author =       "Irit Askira Gelman",
title =        "{GIGO} or not {GIGO}: The Accuracy of Multi-Criteria
Satisficing Decisions",
journal =      j-JDIQ,
volume =       "2",
number =       "2",
pages =        "9:1--9:??",
month =        feb,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1891879.1891882",
ISSN =         "1936-1955",
bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2011:GBN,
author =       "Xiaoming Fan and Jianyong Wang and Xu Pu and Lizhu
Zhou and Bing Lv",
title =        "On Graph-Based Name Disambiguation",
journal =      j-JDIQ,
volume =       "2",
number =       "2",
pages =        "10:1--10:??",
month =        feb,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1891879.1891883",
ISSN =         "1936-1955",
bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ngugi:2011:TBI,
author =       "Benjamin Ngugi and Beverly K. Kahn and Marilyn
Tremaine",
title =        "Typing Biometrics: Impact of Human Learning on
Performance Quality",
journal =      j-JDIQ,
volume =       "2",
number =       "2",
pages =        "11:1--11:??",
month =        feb,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/1891879.1891884",
ISSN =         "1936-1955",
bibdate =      "Mon Mar 28 12:03:59 MDT 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Madnick:2011:ENC,
author =       "Stuart E. Madnick and Yang W. Lee",
title =        "Editorial Notes: Classification and Assessment of
Large Amounts of Data: Examples in the Healthcare
Industry and Collaborative Digital Libraries",
journal =      j-JDIQ,
volume =       "2",
number =       "3",
pages =        "12:1--12:??",
month =        dec,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2063504.2063505",
ISSN =         "1936-1955",
bibdate =      "Thu Dec 15 09:41:55 MST 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lauria:2011:CBT,
author =       "Eitel J. M. Laur{\'\i}a and Alan D. March",
title =        "Combining {Bayesian} Text Classification and Shrinkage
to Automate Healthcare Coding: a Data Quality
Analysis",
journal =      j-JDIQ,
volume =       "2",
number =       "3",
pages =        "13:1--13:??",
month =        dec,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2063504.2063506",
ISSN =         "1936-1955",
bibdate =      "Thu Dec 15 09:41:55 MST 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Dalip:2011:AAD,
author =       "Daniel Hasan Dalip and Marcos Andr{\'e}
Gon{\c{c}}alves and Marco Cristo and P{\'a}vel Calado",
title =        "Automatic Assessment of Document Quality in {Web}
Collaborative Digital Libraries",
journal =      j-JDIQ,
volume =       "2",
number =       "3",
pages =        "14:1--14:??",
month =        dec,
year =         "2011",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2063504.2063507",
ISSN =         "1936-1955",
bibdate =      "Thu Dec 15 09:41:55 MST 2011",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "14",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Muller:2012:IDQ,
author =       "Heiko M{\"u}ller and Johann-Christoph Freytag and Ulf
Leser",
title =        "Improving data quality by source analysis",
journal =      j-JDIQ,
volume =       "2",
number =       "4",
pages =        "15:1--15:??",
month =        feb,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2107536.2107538",
ISSN =         "1936-1955",
bibdate =      "Fri Mar 16 15:01:48 MDT 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In many domains, data cleaning is hampered by our
limited ability to specify a comprehensive set of
integrity constraints to assist in identification of
erroneous data. An alternative approach to improve data
quality is to exploit different data sources that
contain information about the same set of objects. Such
overlapping sources highlight hot-spots of poor data
quality through conflicting data values and immediately
provide alternative values for conflict resolution. In
order to derive a dataset of high quality, we can merge
the overlapping sources based on a quality assessment
of the conflicting values. The quality of the resulting
dataset, however, is highly dependent on our ability to
assess the quality of conflicting values effectively.
In this article, we present methods that aid the
developer of an integrated system in improving the
quality of data. Value conflicts
between contradicting sources are often systematic,
caused by some characteristic of the different sources.
Our goal is to identify such systematic differences and
outline data patterns that occur in conjunction with
them. Evaluated by an expert user, the regularities
discovered provide insights into possible conflict
reasons and help to assess the quality of inconsistent
patterns and minimal update sequences. Contradiction
patterns resemble a special form of association rules
that summarize characteristic data properties for
conflict occurrence. We adapt existing association rule
mining algorithms for mining contradiction patterns.
Contradiction patterns, however, view each class of
conflicts in isolation, sometimes leading to largely
overlapping patterns. Sequences of set-oriented update
operations that transform one data source into the
other are compact descriptions for all regular
differences among the sources. We consider minimal
update sequences as the most likely explanation for
observed differences between overlapping data sources.
Furthermore, the order of operations within the
sequences point out potential dependencies between
systematic differences. Finding minimal update
sequences, however, is beyond reach in practice. We
show that the problem already is NP-complete for a
restricted set of operations. In the light of this
intractability result, we present heuristics that lead
to convincing results for all examples we considered.",
acknowledgement = ack-nhfb,
articleno =    "15",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gelman:2012:BMC,
author =       "Irit Askira Gelman",
title =        "Biases in multi-criteria, satisficing decisions due to
data errors",
journal =      j-JDIQ,
volume =       "2",
number =       "4",
pages =        "16:1--16:??",
month =        feb,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2107536.2107539",
ISSN =         "1936-1955",
bibdate =      "Fri Mar 16 15:01:48 MDT 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "This inquiry centers on an asymmetry, or bias, in the
accuracy of multi-criteria, conjunctive, and
disjunctive decisions, which originates from
fundamental properties of the logical conjunction and
disjunction operations. A mathematical-statistical
analysis indicates that, as we keep adding criteria to
a multi-criteria conjunctive or disjunctive decision
rule, errors in the data produce decision errors
asymmetrically. As a result, in conjunctive decisions,
the probability of a false negative increases while the
probability of a false positive decreases. In contrast,
in disjunctive decisions, as we keep adding criteria,
the probability of a false positive increases while
that of a false negative decreases. For instance, in a
conjunctive business decision rule, the probability of
overlooking a bargain can be far greater than the
probability of misjudging an unattractive offer to be a
good one. A series of Monte Carlo simulations validates
the analytical findings and explores the contribution.",
acknowledgement = ack-nhfb,
articleno =    "16",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sachdeva:2012:SIS,
author =       "Shelly Sachdeva and Subhash Bhalla",
title =        "Semantic interoperability in standardized electronic
health record databases",
journal =      j-JDIQ,
volume =       "3",
number =       "1",
pages =        "1:1--1:??",
month =        apr,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2166788.2166789",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:12 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Different clinics and hospitals have their own
information systems to maintain patient data. This
hinders the exchange of data among systems (and
organizations). Hence there is a need to provide
standards for data exchange. In digitized form, the
individual patient's medical record can be stored,
retrieved, and shared over a network through
enhancement in information technology. Thus, electronic
health records (EHRs) should be standardized,
incorporating semantic interoperability. A subsequent
step requires that healthcare professionals and
patients get involved in using the EHRs, with the help
of technological developments. This study aims to
provide different approaches in understanding some
current and challenging concepts in health informatics.
Successful handling of these challenges will lead to
improved quality in healthcare by reducing medical
errors, decreasing costs, and enhancing patient care.
The study is focused on the following goals: (1)
understanding the role of EHRs; (2) understanding the
need for standardization to improve quality; (3)
establishing interoperability in maintaining EHRs; (4)
examining a framework for standardization and
interoperability (the openEHR architecture); (5)
identifying the role of archetypes for knowledge-based
systems; and (6) understanding the difficulties in
querying EHR data.",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Brown:2012:DQT,
author =       "Steven Brown and Trent S. Rosenbloom and Shawn P.
Hardenbrook and Terry Clark and Elliot Fielstein and
Peter Elkin and Ted Speroff",
title =        "Documentation quality and time costs: a randomized
controlled trial of structured entry versus dictation",
journal =      j-JDIQ,
volume =       "3",
number =       "1",
pages =        "2:1--2:??",
month =        apr,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2166788.2166790",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:12 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "The Department of Veterans Affairs (VA) performs over
800,000 disability exams and distributes over
{\$}37 billion in disability benefits per year.
VA developed and deployed a computer-based disability
exam documentation system in order to improve exam
report quality and timeliness. We conducted a
randomized controlled trial comparing joint disability
examinations supported by computerized templates to the
examinations documented via dictation, to determine if
the system met the intended goals or had unintended
consequences. Consenting veterans were randomized to
undergo exams documented using computerized templates
or via dictation. We compared exam report quality,
documentation time costs, encounter length, total time
to fulfill an exam request with a finalized exam
report, and veteran satisfaction. Computer-based
templates resulted in disability exam reports that had
higher quality scores (p = 0.042) and were returned to
the requesting office faster than exam reports created
via dictation (p = 0.02). Documentation time and veteran
satisfaction were similar for both the documentation
techniques. Encounter length was significantly longer
for the template group. Computer-based templates
impacted the VA disability evaluation system by
improving report quality scores and production time and
lengthening encounter times. Oversight bodies have
called for mandated use of computer-based templates
nationwide. We believe mandates regarding use of health
information technology should be guided by data
regarding its positive and negative impacts.",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sunyaev:2012:SCD,
author =       "Ali Sunyaev and Dmitry Chornyi",
title =        "Supporting chronic disease care quality: Design and
implementation of a health service and its integration
with electronic health records",
journal =      j-JDIQ,
volume =       "3",
number =       "2",
pages =        "3:1--3:??",
month =        may,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2184442.2184443",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:12 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Chronic medical conditions take a huge toll on lives
of a growing number of people and are a major
contributor to the rising costs in healthcare. As
patients are increasingly willing to take an active
part in managing their conditions, chronic disease
self-management programs and information systems that
support them are recognized for their potential to
improve the quality of healthcare delivery. These
programs often rely on recording longitudinal patient
data and analyzing it. Therefore, maintaining
appropriate data quality is important for
self-management programs to be efficient and safe. We
designed and implemented a prototype of a health
self-management service for chronically ill people. It
is a distributed application that supports patients
with diabetes at tracking their blood glucose levels.
The main design goals were usability, extensibility,
security, and interoperability. The system integrates
with the Microsoft HealthVault and Google Health
personal health record platforms. It utilizes
industry-strength storage and security mechanisms, is
scalable, and as a result, can be used to gather,
securely store, and analyze patient data over long
periods of time. In this article, we discuss how
software information technology can support chronic
disease self-management and its impact on the quality
of patient data. Furthermore, we describe the
requirements that drove the system's development, its
architecture, and design decisions.",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Elizabeth:2012:NSA,
author =       "D. Shiloah Elizabeth and H. Khanna Nehemiah and C.
Sunil Retmin Raj and A. Kannan",
title =        "A novel segmentation approach for improving diagnostic
accuracy of {CAD} systems for detecting lung cancer
from chest computed tomography images",
journal =      j-JDIQ,
volume =       "3",
number =       "2",
pages =        "4:1--4:??",
month =        may,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2184442.2184444",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:12 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Segmentation of lung tissue is an important and
challenging task in any computer aided diagnosis
system. The accuracy of the segmentation subsystem
determines the performance of the other subsystems in
any computer aided diagnosis system based on image
analysis. We propose a novel technique for segmentation
of lung tissue from computed tomography of the chest.
Manual segmentation of lung parenchyma becomes
difficult with an enormous volume of images. The goal
of this work is to present an automated approach to
segmentation of lung parenchyma from the rest of the
chest CT image. The approach involves the conventional
optimal thresholding technique and operations based on
convex edge and centroid properties of the lung region.
The proposed segmentation technique can
be used to preprocess lung images given to a computer
aided diagnosis system for diagnosis of lung disorders.
This improves the diagnostic performance of the system.
This has been tested by using it in a computer aided
diagnosis system that was used for detection of lung
cancer from chest computed tomography images. The
results obtained show that the lungs can be correctly
segmented even in the presence of peripheral pathology
bearing regions; pathology bearing regions that could
not be detected using a CAD system that applies optimal
thresholding could be detected using a CAD system using
our proposed approach for segmentation of lungs.",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Yakout:2012:EPA,
author =       "Mohamed Yakout and Mikhail J. Atallah and Ahmed
Elmagarmid",
title =        "Efficient and Practical Approach for Private Record
Linkage",
journal =      j-JDIQ,
volume =       "3",
number =       "3",
pages =        "5:1--5:??",
month =        aug,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2287714.2287715",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:13 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Record linkage is used to associate entities from
multiple data sources. For example, two organizations
contemplating a merger may want to know how common
their customer bases are so that they may better assess
the benefits of the merger. Another example is a
database of people who are forbidden from a certain
activity by regulators, may need to be compared to a
list of people engaged in that activity. The autonomous
entities who wish to carry out the record matching
computation are often reluctant to fully share their
data; they fear losing control over its subsequent
dissemination and usage, or they want to insure privacy
because the data is proprietary or confidential, and/or
they are cautious simply because privacy laws forbid
its disclosure or regulate the form of that disclosure.
In such cases, the problem of carrying out the linkage
computation without full data exchange has been called
private record linkage. Previous private record linkage
techniques have made use of a third party. We provide
efficient techniques for private record linkage that
improve on previous work in that (1) our techniques
make no use of a third party, and (2) they achieve much
better performance than previous schemes in terms of
their execution time while maintaining acceptable
quality of output compared to nonprivacy settings. Our
protocol consists of two phases. The first phase
primarily produces candidate record pairs for matching,
by carrying out a very fast (but not accurate) matching
between such pairs of records. The second phase is a
novel protocol for efficiently computing distances
between each candidate pair (without any expensive
cryptographic operations such as modular
exponentiations). Our experimental evaluation of our
approach validates these claims.",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Yang:2012:ECD,
author =       "Yanjuan Yang and Michael Mannino",
title =        "An Experimental Comparison of a Document Deception
Detection Policy using Real and Artificial Deception",
journal =      j-JDIQ,
volume =       "3",
number =       "3",
pages =        "6:1--6:??",
month =        aug,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2287714.2287716",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:13 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Developing policies to screen documents for deception
is often hampered by the cost of data collection and
the inability to evaluate policy alternatives due to
lack of data. To lower data collection costs and
increase the amount of data, artificially generated
deception data can be used, but the impact of using
artificially generated deception data is not well
understood. This article evaluates the impact of
artificially generated deception on document screening
policies. The deception and truth data were collected
from financial aid applications, a document-centric
area with limited resources for screening. Real
deception was augmented with artificial data generated
by noise and deception generation models. Using the
real data and artificially generated data, we designed
an innovative experiment with deception type and
deception rate as factors, and harmonic mean and cost
as outcome variables. We used two budget models (fixed
and variable) typically employed by financial aid
offices to measure the cost of noncompliance in
financial aid applications. The analysis included an
evaluation of a common policy for deception screening
using both fixed and varying screening rates. The
results of the experiment provided evidence of similar
performance of screening policy with real and
artificial deception, suggesting the possibility of
using artificially generated deception to reduce the
costs associated with obtaining training data.",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Robb:2012:INU,
author =       "David A. Robb and Paul L. Bowen and A. Faye Borthick
and Fiona H. Rohde",
title =        "Improving New Users' Query Performance: Deterring
Premature Stopping of Query Revision with Information
for Forming Ex Ante Expectations",
journal =      j-JDIQ,
volume =       "3",
number =       "4",
pages =        "7:1--7:??",
month =        sep,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2348828.2348829",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:14 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "As the volume of data in organizational databases
grows, organizations are seeking to use this data to
improve organizational success. To this end, users are
being asked to query these databases to provide
information to help answer questions posed by key
management personnel. Users who have had extensive
experience with an organization's data can often detect
the presence of errors in their queries when query
results do not correspond to their ex ante
expectations. New users, however, are less familiar
with the data they will be querying. Having no, or
limited, ex ante expectations for query results, new
users may be unaware that the result produced by their
query is incorrect. Unwarranted confidence in the
correctness of their queries predisposes these users to
stop looking for query errors even when their queries
still contain errors. This behavior, premature stopping
of query revision, prompts investigating whether new
users' query performance would improve if they were not
only provided with, but used, readily available
information to form ex ante expectations. Our results
demonstrated a threshold effect in new users heeding
information for forming ex ante expectations. That is,
the mere availability of information for forming ex
ante expectations made no difference in query
performance. When admonishing users to heed ex ante
information, however, there was an associated increase
in the accuracy of their queries. These results suggest
that users unfamiliar with a particular database might
make fewer query errors if they not only received
readily available information but were then prompted to
use the information to form ex ante expectations for
query results.",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Varol:2012:HMA,
author =       "Cihan Varol and Coskun Bayrak",
title =        "Hybrid Matching Algorithm for Personal Names",
journal =      j-JDIQ,
volume =       "3",
number =       "4",
pages =        "8:1--8:??",
month =        sep,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2348828.2348830",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:14 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib;
http://www.math.utah.edu/pub/tex/bib/spell.bib",
abstract =     "Companies acquire personal information from phone,
World Wide Web, or email in order to sell or send an
advertisement about their products. As this
information is acquired, moved, copied, or edited, the
data may lose its quality. Often, the use of data
administrators or a tool that has limited capabilities
to correct the mistyped information can cause many
problems. Moreover, most of the correction techniques
are particularly implemented for the words used in
daily conversations. Since personal names have
different characteristics compared to general text, a
hybrid matching algorithm (PNRS) which employs phonetic
encoding, string matching and statistical facts to
provide a possible candidate for misspelled names is
developed. At the end, the efficiency of the proposed
algorithm is compared with other well known spelling
correction techniques.",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{ODonoghue:2012:ISI,
author =       "John O'Donoghue and Jane Grimson and Katherine
Seelman",
title =        "Introduction to the Special Issue on Information
Quality: The Challenges and Opportunities in Healthcare
Systems and Services",
journal =      j-JDIQ,
volume =       "4",
number =       "1",
pages =        "1:1--1:??",
month =        oct,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2378016.2378017",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:14 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Collins:2012:CGF,
author =       "Claire Collins and Kelly Janssens",
title =        "Creating a General (Family) Practice Epidemiological
Database in {Ireland} --- Data Quality Issue
Management",
journal =      j-JDIQ,
volume =       "4",
number =       "1",
pages =        "2:1--2:??",
month =        oct,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2378016.2378018",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:14 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In Ireland, while detailed information is available
regarding hospital attendance, little is known
regarding general (family) practice attendance.
However, it is conservatively estimated that there are
almost nine times as many general practice encounters
than there are hospital encounters each year in
Ireland. This represents a very significant gap in
health information. Indeed, general practice has been
shown in other countries to be an important and rich
source of information about the health of the
population, their behaviors and their utilization of
health services. Funded by the Health Information and
Quality Authority (HIQA), the Irish College of General
Practitioners (ICGP) undertook a feasibility study of
diagnostic coding of routinely entered patient data and
the creation of a national general practice morbidity
and epidemiological database (GPMED project). This
article outlines the process of data quality issue
management undertaken. The study's findings suggest
that the quality of data collection and reporting
structures available in general practice throughout
Ireland at the outset of this project were not adequate
to permit the creation of a database of sufficient
quality for service planning and policy or
epidemiological research. Challenges include the dearth
of a minimum standard of data recorded in consultations
by GPs and the absence of the digital data recording
and exporting infrastructure within Irish patient
management software systems. In addition, there is at
present a lack of recognition regarding the value of
such data for patient management and service
planning---including importantly, data collectors who
do not fully accept the merit of maintaining data,
which has a direct consequence for data quality. The
work of this project has substantial implications for
the data available to the health sector in Ireland and
contributes to the knowledge base internationally
regarding general practice morbidity data.",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cure:2012:IDQ,
author =       "Olivier Cur{\'e}",
title =        "Improving the Data Quality of Drug Databases using
Conditional Dependencies and Ontologies",
journal =      j-JDIQ,
volume =       "4",
number =       "1",
pages =        "3:1--3:??",
month =        oct,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2378016.2378019",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:14 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Many health care systems and services exploit drug
related information stored in databases. The poor data
quality of these databases, e.g. inaccuracy of drug
related data, may have serious
consequences for the health condition of patients.
Hence it is important to ensure their quality in terms
of data completeness and soundness. In the database
domain, standard Functional Dependencies (FDs) and
INclusion Dependencies (INDs), have been proposed to
prevent the insertion of incorrect data. But they are
generally not expressive enough to represent a
domain-specific set of constraints. To this end,
conditional dependencies, i.e. standard dependencies
extended with tableau patterns containing constant
values, have been introduced and several methods have
been proposed for their discovery and representation.
The quality of drug databases can be considerably
improved by their usage. Moreover, pharmacology
information is inherently hierarchical and many
standards propose graph structures to represent them,
e.g. the Anatomical Therapeutic Chemical classification
({ATC}). In this article, we
emphasize that the technologies of the Semantic Web are
adapted to represent these hierarchical structures,
i.e. in RDFS and OWL. We also present a solution for
representing conditional dependencies using a query
language defined for these graph oriented structures,
namely SPARQL. The benefits of this approach are
interoperability with applications and ontologies of
the Semantic Web as well as a reasoning-based query
execution solution to clean underlying databases.",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{McNaull:2012:DIQ,
author =       "James McNaull and Juan Carlos Augusto and Maurice
Mulvenna and Paul McCullagh",
title =        "Data and Information Quality Issues in Ambient
Assisted Living Systems",
journal =      j-JDIQ,
volume =       "4",
number =       "1",
pages =        "4:1--4:??",
month =        oct,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2378016.2378020",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:14 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Demographic aging, as a result of people living for
longer, has put an increased burden on health and
social care provision across most of the economies of
the developed and developing world. In order to cope
with the greater numbers of older people, together with
increasing prevalence of chronic diseases, governments
are looking to new ways to provide care and support to
older people and their care providers. A growing trend
is where health and social care providers are moving
towards the use of assisted living technologies to
provide care and assistance in the home. In this
article, the research area of Ambient Assisted Living
(AAL) systems is examined and the data, information and
the higher-level contextual knowledge quality issues in
relation to these systems, is discussed. Lack of
quality control may result in an AAL system providing
assistance and support based upon incorrect data,
information and knowledge inputs, and this may have a
detrimental effect on the person making use of the
system. We propose a model whereby contextual knowledge
gained during the AAL system's reasoning cycle can be
fed back to aid in further quality checking at the
various architectural layers, and a realistic AAL
scenario is provided to support this. Future research
should be conducted in these areas, with the
requirement of building quality criteria into the
design and implementation of AAL systems.",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{ODonoghue:2012:DMW,
author =       "John O'Donoghue and John Herbert",
title =        "Data Management within {mHealth} Environments: Patient
Sensors, Mobile Devices, and Databases",
journal =      j-JDIQ,
volume =       "4",
number =       "1",
pages =        "5:1--5:??",
month =        oct,
year =         "2012",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2378016.2378021",
ISSN =         "1936-1955",
bibdate =      "Thu Nov 8 18:27:14 MST 2012",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Pervasive environments generate large quantities of
data, originating from backend servers, portable
devices, and wireless mobile sensors. Pervasive sensing
devices that monitor properties of the environment
(including human beings) can be a large data source.
Unprocessed datasets may include data that is faulty
and irrelevant, and data that is important and useful.
If not managed correctly the large amount of data from
a data-rich pervasive environment may result in
information overload or delivery of incorrect
information. Context-sensitive quality data management
aims to gather, verify, process, and manage the
multiple data sources in a pervasive environment in
order to deliver high quality, relevant information to
the end-user. Managing the quality of data from
different sources, correlating related data, and making
use of context, are all essential in providing end
users with accurate and meaningful data in real time.
This requirement is especially true for critical
applications such as in a medical environment. This
article presents the Data Management System (DMS)
architecture. It is designed to deliver quality data
service to its users. The DMS architecture employs an
agent-based middleware to intelligently and effectively
manage all pervasive data sources, and to make use of
context to deliver relevant information to the
end-user. Two of the DMS components are presented: (1)
data validation and (2) data consistency. The DMS
components have been rigorously evaluated using various
case studies. This article advocates a
careful, precise approach to data based on the quality
of the data and the context of its use. It emphasises
the DMS architecture and the role of software agents in
providing quality data management.",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Talburt:2013:SIE,
author =       "John R. Talburt",
title =        "Special Issue on Entity Resolution Overview: The
Criticality of Entity Resolution in Data and
Information Quality",
journal =      j-JDIQ,
volume =       "4",
number =       "2",
pages =        "6:1--6:??",
month =        mar,
year =         "2013",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2435221.2435222",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Song:2013:DIE,
author =       "Dezhao Song and Jeff Heflin",
title =        "Domain-Independent Entity Coreference for Linking
Ontology Instances",
journal =      j-JDIQ,
volume =       "4",
number =       "2",
pages =        "7:1--7:??",
month =        mar,
year =         "2013",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2435221.2435223",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "The objective of entity coreference is to determine if
different mentions (e.g., person names, place names,
database records, ontology instances, etc.) refer to
the same real word object. Entity coreference
algorithms can be used to detect duplicate database
records and to determine if two Semantic Web instances
represent the same underlying real word entity. The key
issues in developing an entity coreference algorithm
include how to locate context information and how to
utilize it. In this article, we
present a novel entity coreference algorithm for
ontology instances. For scalability reasons, we select
a neighborhood of each instance from an RDF graph. To
determine the similarity between two instances, our
algorithm computes the similarity between comparable
property values in the neighborhood graphs. The
similarity of distinct URIs and blank nodes is computed
by comparing their outgoing links. In an attempt to
reduce the impact of distant nodes on the final
similarity measure, we explore a distance-based
discounting approach. To provide the best possible
domain-independent matches, we propose an approach to
compute the discriminability of triples in order to
assign weights to the context information. We evaluated
our algorithm using different instance categories from
five datasets. Our experiments show that the best
results are achieved by including both our discounting
and triple discrimination approaches.",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Nuray-Turan:2013:ACS,
author =       "Rabia Nuray-Turan and Dmitri V. Kalashnikov and Sharad
Mehrotra",
title =        "Adaptive Connection Strength Models for
Relationship-Based Entity Resolution",
journal =      j-JDIQ,
volume =       "4",
number =       "2",
pages =        "8:1--8:??",
month =        mar,
year =         "2013",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2435221.2435224",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Entity Resolution (ER) is a data quality challenge
that deals with ambiguous references in data and whose
task is to identify all references that co-refer. Due
to practical significance of the ER problem, many
creative ER techniques have been proposed in the past,
including those that analyze relationships that exist
among entities in data. Such approaches view the
database as an entity-relationship graph, where direct
and indirect relationships correspond to paths in the
graph. These techniques rely on measuring the
connection strength among various nodes in the graph by
using a connection strength (CS) model. While such
approaches have demonstrated significant advantage over
traditional ER techniques, currently they also have a
significant limitation: the CS models that they use are
intuition-based fixed models that tend to behave well
in general, but are very generic and not tuned to a
specific domain, leading to suboptimal result quality.
In this article, we propose an adaptive approach that
employs supervised learning to adapt the connection
strength measure to the given domain using the
available past/training data. The adaptive approach has
several advantages: it increases both the quality and
efficiency of ER and it also minimizes the domain
analyst participation needed to tune the CS model to
the given domain. The extensive empirical evaluation
demonstrates that the proposed approach reaches up to
8\% higher accuracy than the graph-based ER methods
that use fixed and intuition-based CS models.",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Panse:2013:IHU,
author =       "Fabian Panse and Maurice van Keulen and Norbert
Ritter",
title =        "Indeterministic Handling of Uncertain Decisions in
Deduplication",
journal =      j-JDIQ,
volume =       "4",
number =       "2",
pages =        "9:1--9:??",
month =        mar,
year =         "2013",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2435221.2435225",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In current research and practice, deduplication is
usually considered as a deterministic approach in which
database tuples are either declared to be duplicates or
not. In ambiguous situations, however, it is often not
completely clear-cut, which tuples represent the same
real-world entity. In deterministic approaches, many
realistic possibilities may be ignored, which in turn
can lead to false decisions. In this article, we
present an indeterministic approach for deduplication
by using a probabilistic target model including
techniques for proper probabilistic interpretation of
similarity matching results. Thus, instead of deciding
for one of the most likely situations, all realistic
situations are modeled in the resultant data. This
approach minimizes the negative impact of false
decisions. Moreover, the deduplication process becomes
almost fully automatic and human effort can be largely
reduced. To increase applicability, we introduce
several semi-indeterministic methods that heuristically
reduce the set of indeterministically handled decisions
in several meaningful ways. We also describe a
full-indeterministic method for theoretical and
presentational reasons.",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zhou:2013:GLC,
author =       "Yinle Zhou and Eric Nelson and Fumiko Kobayashi and
John R. Talburt",
title =        "A Graduate-Level Course on Entity Resolution and
Information Quality: a Step toward {ER} Education",
journal =      j-JDIQ,
volume =       "4",
number =       "2",
pages =        "10:1--10:??",
month =        mar,
year =         "2013",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2435221.2435226",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:00 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "This article describes the experiences and
lessons learned in teaching a graduate-level course
covering entity resolution (ER) and its relationship to
information quality (IQ). The course surveys a broad
spectrum of ER topics and activities including entity
reference extraction, entity reference preparation,
entity reference resolution techniques, entity identity
management, and entity relationship analysis. The
course content also attempts to balance aspects of ER
theory with practical application through a series of
laboratory exercises coordinated with the lecture
topics. As an additional teaching aid, a configurable,
open-source entity resolution engine (OYSTER) was
developed that allows students to experiment with
different types of ER architectures including
merge-purge, record linking, identity resolution, and
identity capture.",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cao:2013:NAD,
author =       "Lan Cao and Hongwei Zhu",
title =        "Normal accidents: Data quality problems in
{ERP}-enabled manufacturing",
journal =      j-JDIQ,
volume =       "4",
number =       "3",
pages =        "11:1--11:??",
month =        may,
year =         "2013",
CODEN =        "????",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:05 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "The efficient operation of Enterprise Resource
Planning (ERP) systems largely depends on data quality.
ERP can improve data quality and information sharing
within an organization. It can also pose challenges to
data quality. While it is well known that data quality
is important in ERP systems, most existing research has
focused on identifying the factors affecting the
implementation and the business values of ERP. With
normal accident theory as a theoretical lens, we
examine data quality problems in ERP using a case study
of a large, fast-growing multinational manufacturer
headquartered in China. Our findings show that
organizations that have successfully implemented ERP
can still experience certain data quality problems. We
identify major data quality problems in data
production, storage and maintenance, and utilization
processes. We also analyze the causes of these data
quality problems by linking them to certain
characteristics of ERP systems within an organizational
context. Our analysis shows that problems resulting
from the tight coupling effects and the complexity of
ERP-enabled manufacturing systems can be inevitable.
This study will help researchers and practitioners
formulate data management strategies that are effective
in the presence of certain ``normal'' data quality
problems.",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Biran:2013:CII,
author =       "Dov Biran and Michael H. Zack and Richard J. Briotta",
title =        "Competitive intelligence and information quality: a
game-theoretic perspective",
journal =      j-JDIQ,
volume =       "4",
number =       "3",
pages =        "12:1--12:??",
month =        may,
year =         "2013",
CODEN =        "????",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:05 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "To better understand a competitor's tactical and
strategic plans, companies need to take a closer look
at competitive intelligence or they risk missing
lucrative opportunities. Because of this there is a
growing interest in competitive intelligence and
intelligence information gathering systems (IIS). This
article uses game-theoretic concepts to develop an
analytic framework to assess the value of deploying a
competitive intelligence gathering information system.
Modeling the competitive environment as a game provides
a useful approach to study and evaluate competitive
strategies given diverse assumptions about the quality
of the information known by the players. When
determining the value of deploying an IIS, decision
makers need to examine three components of the
competitive environment: the competitive rules of the
game, the state of player knowledge, and the
reliability of the information gathered. This framework
focuses on competitive environments where the players'
state of knowledge (i.e., common versus covert
knowledge) and the reliability of the information
generated are essential to the decision making process.
The article concludes with implications for research
and practice.",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Joglekar:2013:AAD,
author =       "Nitin R. Joglekar and Edward G. Anderson and G.
Shankaranarayanan",
title =        "Accuracy of aggregate data in distributed project
settings: Model, analysis and implications",
journal =      j-JDIQ,
volume =       "4",
number =       "3",
pages =        "13:1--13:??",
month =        may,
year =         "2013",
CODEN =        "????",
ISSN =         "1936-1955",
bibdate =      "Sat Jun 22 12:13:05 MDT 2013",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "We examine the management of data accuracy in
inter-organizational data exchanges using the context
of distributed software projects. Organizations
typically manage projects by outsourcing portions of
the project to partners. Managing a portfolio of such
projects requires sharing data regarding the status of
work-in-progress residing with the partners and
estimates of these projects' completion times.
Portfolio managers use these data to assign projects to
be outsourced to partners. These data are rarely
accurate. Unless these data are filtered, inaccuracies
can lead to myopic and expensive sourcing decisions. We
develop a model that uses project-status data to
identify an optimal assignment of projects to be
outsourced. This model permits corruption of
project-status data. We use this model to compute the
costs of using perfect versus inaccurate project-status
data and show that the costs of deviation from optimal
are sizable when the inaccuracy in the data is
significant. We further propose a filter to correct
inaccurate project-status data and generate an estimate
of true progress. With this filter, depending on the
relative magnitudes of errors, we show that accuracy of
project-status data can be improved and the associated
economic benefit is significant. We illustrate the
improvement in accuracy and associated economic benefit
by instantiating the model and the filter. We further
elaborate on how the model parameters may be estimated
and used in practice.",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Raschid:2014:E,
author =       "Louiqa Raschid",
title =        "Editorial",
journal =      j-JDIQ,
volume =       "4",
number =       "4",
pages =        "14:1--14:??",
month =        may,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2579167",
ISSN =         "1936-1955",
bibdate =      "Tue May 27 16:54:25 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "14",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Wijnhoven:2014:VBF,
author =       "Fons Wijnhoven and Chintan Amrit and Pim Dietz",
title =        "Value-Based File Retention: File Attributes as File
Value and Information Waste Indicators",
journal =      j-JDIQ,
volume =       "4",
number =       "4",
pages =        "15:1--15:??",
month =        may,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2567656",
ISSN =         "1936-1955",
bibdate =      "Tue May 27 16:54:25 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Several file retention policy methods propose that a
file retention policy should be based on file value.
Though such a retention policy might increase the value
of accessible files, the method to arrive at such a
policy is still lacking. In this article, we show how
one can arrive at a method for developing file
retention policies based on the use values of files.
The method's applicability is initially assessed
through a case study at Capgemini, Netherlands. In the
case study, we hypothesize that one can develop a file
retention policy by testing causal relations between
file attributes (as used by file retention methods) and
the use value of files. Unfortunately, most file
attributes used by file retention methods have a weak
correlation with file value, resulting in the
conclusion that these methods do not well select out
high- and low-value files. This would imply the
ineffectiveness of the used attributes in our study or
errors in our conceptualization of file value. We
continue with the last possibility and develop
indicators for file utility (with low utility being
waste). With this approach we were able to detect waste
files, in a sample of files, with an accuracy of 80\%.
We therefore not only suggest further research in
information waste detection as part of a file retention
policy, but also to further explore other file
attributes that could better predict file value and
file utility.",
acknowledgement = ack-nhfb,
articleno =    "15",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2014:IBR,
author =       "Wenfei Fan and Shuai Ma and Nan Tang and Wenyuan Yu",
title =        "Interaction between Record Matching and Data
Repairing",
journal =      j-JDIQ,
volume =       "4",
number =       "4",
pages =        "16:1--16:??",
month =        may,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2567657",
ISSN =         "1936-1955",
bibdate =      "Tue May 27 16:54:25 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Central to a data cleaning system are record matching
and data repairing. Matching aims to identify tuples
that refer to the same real-world object, and repairing
is to make a database consistent by fixing errors in
the data by using integrity constraints. These are
typically treated as separate processes in current data
cleaning systems, based on heuristic solutions. This
article studies a new problem in connection with data
cleaning, namely the interaction between record
matching and data repairing. We show that repairing can
effectively help us identify matches, and vice versa.
To capture the interaction, we provide a uniform
framework that seamlessly unifies repairing and
matching operations to clean a database based on
integrity constraints, matching rules, and master data.
We give a full treatment of fundamental problems
associated with data cleaning via matching and
repairing, including the static analyses of constraints
and rules taken together, and the complexity,
termination, and determinism analyses of data cleaning.
We show that these problems are hard, ranging from
NP-complete or coNP-complete, to PSPACE-complete.
Nevertheless, we propose efficient algorithms to clean
data via both matching and repairing. The algorithms
find deterministic fixes and reliable fixes based on
confidence and entropy analyses, respectively, which
are more accurate than fixes generated by heuristics.
Heuristic fixes are produced only when deterministic or
reliable fixes are unavailable. We experimentally
verify that our techniques can significantly improve
the accuracy of record matching and data repairing that
are taken as separate processes, using real-life and
synthetic data.",
acknowledgement = ack-nhfb,
articleno =    "16",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Martin:2014:MAE,
author =       "Nigel Martin and Alexandra Poulovassilis and Jianing
Wang",
title =        "A Methodology and Architecture Embedding Quality
Assessment in Data Integration",
journal =      j-JDIQ,
volume =       "4",
number =       "4",
pages =        "17:1--17:??",
month =        may,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2567663",
ISSN =         "1936-1955",
bibdate =      "Tue May 27 16:54:25 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Data integration aims to combine heterogeneous
information sources and to provide interfaces for
accessing the integrated resource. Data integration is
a collaborative task that may involve many people with
different degrees of experience, knowledge of the
application domain, and expectations relating to the
integrated resource. It may be difficult to determine
and control the quality of an integrated resource due
to this heterogeneity. In this article, we propose an
integration methodology that has embedded within it
iterative quality assessment and improvement of the
integrated resource. We also propose an architecture
for the realisation of this methodology. The quality
assessment is based on an ontology representation of
different users' quality requirements and of the main
elements of the integrated resource. We use description
logic as the formal basis for reasoning about users'
quality requirements and for validating that an
integrated resource satisfies these requirements. We
define quality factors and associated metrics which
enable the quality of alternative global schemas for an
integrated resource to be assessed quantitatively, and
hence the improvement which results from the refinement
of a global schema following our methodology to be
measured. We evaluate our approach through a
large-scale real-life case study in biological data
integration in which an integrated resource is
constructed from three autonomous proteomics data
sources.",
acknowledgement = ack-nhfb,
articleno =    "17",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Naumann:2014:E,
author =       "Felix Naumann",
title =        "Editorial",
journal =      j-JDIQ,
volume =       "5",
number =       "1--2",
pages =        "1:1--1:??",
month =        aug,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2648781",
ISSN =         "1936-1955",
bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Talburt:2014:IQR,
author =       "John Talburt and Therese L. Williams and Thomas C.
Redman and David Becker",
title =        "Information quality research challenge: Predicting and
quantifying the impact of social issues on information
quality programs",
journal =      j-JDIQ,
volume =       "5",
number =       "1--2",
pages =        "2:1--2:??",
month =        aug,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2629603",
ISSN =         "1936-1955",
bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Rahm:2014:DPC,
author =       "Erhard Rahm",
title =        "Discovering product counterfeits in online shops: a
big data integration challenge",
journal =      j-JDIQ,
volume =       "5",
number =       "1--2",
pages =        "3:1--3:??",
month =        aug,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2629605",
ISSN =         "1936-1955",
bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Christen:2014:CPP,
author =       "Peter Christen and Dinusha Vatsalan and Vassilios S.
Verykios",
title =        "Challenges for privacy preservation in data
integration",
journal =      j-JDIQ,
volume =       "5",
number =       "1--2",
pages =        "4:1--4:??",
month =        aug,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2629604",
ISSN =         "1936-1955",
bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Techniques for integrating data from diverse sources
have attracted significant interest in recent years.
Much of today's data collected by businesses and
governments are about people, and integrating such data
across organizations can raise privacy concerns.
Various techniques that preserve privacy during data
integration have been developed, but several challenges
persist that need to be solved before such techniques
become useful in practical applications. We elaborate
on these challenges and discuss research directions.",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Vogel:2014:RGA,
author =       "Tobias Vogel and Arvid Heise and Uwe Draisbach and
Dustin Lange and Felix Naumann",
title =        "Reach for gold: an annealing standard to evaluate
duplicate detection results",
journal =      j-JDIQ,
volume =       "5",
number =       "1--2",
pages =        "5:1--5:??",
month =        aug,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2629687",
ISSN =         "1936-1955",
bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Duplicates in a database are one of the prime causes
of poor data quality and are at the same time among the
most difficult data quality problems to alleviate. To
detect and remove such duplicates, many commercial and
academic products and methods have been developed. The
evaluation of such systems is usually in need of
pre-classified results. Such gold standards are often
expensive to come by (much manual classification is
necessary), not representative (too small or too
synthetic), and proprietary and thus preclude
repetition (company-internal data). This lament has
been uttered in many papers and even more paper
reviews. The proposed annealing standard is a
structured set of duplicate detection results, some of
which are manually verified and some of which are
merely validated by many classifiers. As more and more
classifiers are evaluated against the annealing
standard, more and more results are verified and
validation becomes more and more confident. We formally
define gold, silver, and the annealing standard and
their maintenance. Experiments show how quickly an
annealing standard converges to a gold standard.
Finally, we provide an annealing standard for 750,000
CDs to the duplicate detection community.",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2014:CRD,
author =       "Wenfei Fan and Floris Geerts and Nan Tang and Wenyuan
Yu",
title =        "Conflict resolution with data currency and
consistency",
journal =      j-JDIQ,
volume =       "5",
number =       "1--2",
pages =        "6:1--6:??",
month =        aug,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2631923",
ISSN =         "1936-1955",
bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "This article studies conflict
resolution: given a set of tuples pertaining to the
same entity, it identifies a single tuple in which each
attribute has the latest and consistent value in the
set. This problem is important in data integration,
data cleaning, and query answering. It is, however,
challenging since in practice, reliable time stamps are
often absent, among other things. We propose a model
for conflict resolution by specifying data currency in
terms of partial currency orders and currency
constraints and by enforcing data consistency with
constant conditional functional dependencies. We show
that identifying data currency orders helps us repair
inconsistent data, and vice versa. We investigate a
number of fundamental problems associated with conflict
resolution and establish their complexity. In addition,
we introduce a framework and develop algorithms for
conflict resolution by integrating data currency and
consistency inferences into a single process and by
interacting with users. We experimentally verify the
accuracy and efficiency of our methods using real-life
and synthetic data.",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Glowalla:2014:PDD,
author =       "Paul Glowalla and Ali Sunyaev",
title =        "Process-driven data quality management: a critical
review on the application of process modeling
languages",
journal =      j-JDIQ,
volume =       "5",
number =       "1--2",
pages =        "7:1--7:??",
month =        aug,
year =         "2014",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2629568",
ISSN =         "1936-1955",
bibdate =      "Mon Sep 8 08:45:58 MDT 2014",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Data quality is critical to organizational success. In
order to improve and sustain data quality in the long
term, process-driven data quality management (PDDQM)
seeks to redesign processes that create or modify data.
Consequently, process modeling is mandatory for PDDQM.
Current research examines process modeling languages
with respect to representational capabilities. However,
there is a gap, since process modeling languages for
PDDQM are not considered. We address this research gap
by providing a synthesis of the varying applications of
process modeling languages for PDDQM. We conducted a
keyword-based literature review in conferences as well
as 74 high-ranked information systems and computer
science journals, reviewing 1,555 articles from 1995
onwards. For practitioners, it is possible to integrate
the quality perspective within broadly applied process
models. For further research, we derive
representational requirements for PDDQM that should be
integrated within existing process modeling languages.
However, there is a need for further representational
analysis to examine the adequacy of upcoming process
modeling languages. New or enhanced process modeling
languages may substitute for PDDQM-specific process
modeling languages and facilitate development of a
broadly applicable and accepted process modeling
language for PDDQM.",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Belhajjame:2015:E,
author =       "Khalid Belhajjame and Domenico Beneventano and Laure
Berti-Equille and James Cheney and Victor Cuevas and
Tom {De Nies} and Helena Galhardas and Ashish Gehani
and Boris Glavic and Paul Groth and Olaf Hartig and
Scott Jensen and Andrea Maurino and Gianni Mecca and
Renee Miller and Luc Moreau and Mourad Ouzzani and
Jaehong Park",
title =        "Editorial",
journal =      j-JDIQ,
volume =       "5",
number =       "3",
pages =        "8:1--8:??",
month =        feb,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2692312",
ISSN =         "1936-1955",
bibdate =      "Tue Mar 3 14:42:39 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cheah:2015:PQA,
author =       "You-Wei Cheah and Beth Plale",
title =        "Provenance Quality Assessment Methodology and
Framework",
journal =      j-JDIQ,
volume =       "5",
number =       "3",
pages =        "9:1--9:??",
month =        feb,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2665069",
ISSN =         "1936-1955",
bibdate =      "Tue Mar 3 14:42:39 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Data provenance, a form of metadata describing the
life cycle of a data product, is crucial in the sharing
of research data. Research data, when shared over
decades, requires recipients to make a determination of
both use and trust. That is, can they use the data?
More importantly, can they trust it? Knowing the data
are of high quality is one factor to establishing
fitness for use and trust. Provenance can be used to
assert the quality of the data, but the quality of the
provenance must be known as well. We propose a
framework for assessing the quality of data provenance.
We identify quality issues in data provenance,
establish key quality dimensions, and define a
framework of analysis. We apply the analysis framework
to synthetic and real-world provenance.",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Herschel:2015:HAA,
author =       "Melanie Herschel",
title =        "A Hybrid Approach to Answering Why-Not Questions on
Relational Query Results",
journal =      j-JDIQ,
volume =       "5",
number =       "3",
pages =        "10:1--10:??",
month =        feb,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2665070",
ISSN =         "1936-1955",
bibdate =      "Tue Mar 3 14:42:39 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In analyzing and debugging data transformations, or
more specifically relational queries, a subproblem is
to understand why some data are not part of the query
result. This problem has recently been addressed from
different perspectives for various fragments of
relational queries. The different perspectives yield
different yet complementary explanations of such
missing answers. In this article, we first unify
the different approaches by defining a new type of
explanation, called hybrid explanation, that
encompasses the variety of previously defined types of
explanations. This solution goes beyond simply forming
the union of explanations produced by different
algorithms and is shown to be able to explain a larger
set of missing answers. Second, we present Conseil, an
algorithm to generate hybrid explanations. Conseil is
also the first algorithm to handle nonmonotonic
queries. Experiments on efficiency and explanation
quality show that Conseil is comparable and even
superior to previous algorithms. This article extends a
previous short conference paper by providing proofs,
additional theorems, and a detailed discussion of each
step of the Conseil algorithm. It also significantly
extends the experimental evaluation on efficiency and
explanation quality.",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chong:2015:SID,
author =       "Stephen Chong and Christian Skalka and Jeffrey A.
Vaughan",
title =        "Self-Identifying Data for Fair Use",
journal =      j-JDIQ,
volume =       "5",
number =       "3",
pages =        "11:1--11:??",
month =        feb,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2687422",
ISSN =         "1936-1955",
bibdate =      "Tue Mar 3 14:42:39 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Public-use earth science datasets are a useful
resource with the unfortunate feature that their
provenance is easily disconnected from their content.
``Fair-use policies'' typically associated with these
datasets require appropriate attribution of providers
by users, but sound and complete attribution is
difficult if provenance information is lost. To address
this, we introduce a technique to directly associate
provenance information with sensor datasets. Our
technique is similar to traditional watermarking but is
intended for application to unstructured time-series
datasets. Our approach is potentially imperceptible
given sufficient margins of error in datasets and is
robust to a number of benign but likely transformations
including truncation, rounding, bit-flipping, sampling,
and reordering. We provide algorithms for both one-bit
and blind mark checking and show how our system can be
adapted to various data representation types. Our
algorithms are probabilistic in nature and are
characterized by both combinatorial and empirical
analyses. Mark embedding can be applied at any point in
the data life cycle, allowing adaptation of our scheme
to social or scientific concerns.",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Baillie:2015:QPA,
author =       "Chris Baillie and Peter Edwards and Edoardo Pignotti",
title =        "{QUAL}: a Provenance-Aware Quality Model",
journal =      j-JDIQ,
volume =       "5",
number =       "3",
pages =        "12:1--12:??",
month =        feb,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2700413",
ISSN =         "1936-1955",
bibdate =      "Tue Mar 3 14:42:39 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "This article presents a model supporting quality
assessment over linked data. This model has been
designed to align with emerging standards for
provenance on the Web to enable agents to reason about
data provenance when performing quality assessment. The
model also enables quality assessment provenance to be
represented, thus allowing agents to make decisions
about reuse of existing assessments. We also discuss
the development of an OWL ontology as part of a
software framework to support reasoning about data
quality and assessment reuse. Finally, we evaluate this
framework using two real-world case studies derived
from transport and invasive-species monitoring
applications.",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Attenberg:2015:BMC,
author =       "Joshua Attenberg and Panos Ipeirotis and Foster
Provost",
title =        "Beat the Machine: Challenging Humans to Find a
Predictive Model's ``Unknown Unknowns''",
journal =      j-JDIQ,
volume =       "6",
number =       "1",
pages =        "1:1--1:??",
month =        mar,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2700832",
ISSN =         "1936-1955",
bibdate =      "Thu Mar 5 07:53:50 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "We present techniques for gathering data that expose
errors of automatic predictive models. In certain
common settings, traditional methods for evaluating
predictive models tend to miss rare but important
errors --- most importantly, cases for which the model
is confident of its prediction (but wrong). In this
article, we present a system that, in a game-like
setting, asks humans to identify cases that will cause
the predictive model-based system to fail. Such
techniques are valuable in discovering problematic
cases that may not reveal themselves during the normal
operation of the system and may include cases that are
rare but catastrophic. We describe the design of the
system, including design iterations that did not quite
work. In particular, the system incentivizes humans to
provide examples that are difficult for the model to
handle by providing a reward proportional to the
magnitude of the predictive model's error. The humans
are asked to ``Beat the Machine'' and find cases where
the automatic model (the ``Machine'') is wrong.
Experiments show that the humans using Beat the Machine
identify more errors than do traditional techniques for
discovering errors in predictive models, and, indeed,
they identify many more errors where the machine is
(wrongly) confident it is correct. Furthermore, those
cases the humans identify seem to be not simply
outliers, but coherent areas missed completely by the
model. Beat the Machine identifies the ``unknown
unknowns.'' Beat the Machine has been deployed at an
industrial scale by several companies. The main impact
has been that firms are changing their perspective on
and practice of evaluating predictive models. ``There
are known knowns. These are things we know that we
know. There are known unknowns. That is to say, there
are things that we know we don't know. But there are
also unknown unknowns. There are things we don't know
we don't know.'' --- Donald Rumsfeld",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Alonso:2015:CLQ,
author =       "Omar Alonso",
title =        "Challenges with Label Quality for Supervised
Learning",
journal =      j-JDIQ,
volume =       "6",
number =       "1",
pages =        "2:1--2:??",
month =        mar,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2724721",
ISSN =         "1936-1955",
bibdate =      "Thu Mar 5 07:53:50 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Organizations that develop and use technologies around
information retrieval, machine learning, recommender
systems, and natural language processing depend on
labels for engineering and experimentation. These
labels, usually gathered via human computation, are
used in machine-learned models for prediction and
evaluation purposes. In such scenarios, collecting
high-quality labels is a very important part of the
overall process. We elaborate on these challenges and
discuss research directions.",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lukyanenko:2015:IQR,
author =       "Roman Lukyanenko and Jeffrey Parsons",
title =        "Information Quality Research Challenge: Adapting
Information Quality Principles to User-Generated
Content",
journal =      j-JDIQ,
volume =       "6",
number =       "1",
pages =        "3:1--3:??",
month =        mar,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2723166",
ISSN =         "1936-1955",
bibdate =      "Thu Mar 5 07:53:50 MST 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Naumann:2015:E,
author =       "Felix Naumann",
title =        "Editorial",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "4:1--4:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2762716",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Varshney:2015:DCD,
author =       "Kush R. Varshney and Dennis Wei and Karthikeyan
Natesan Ramamurthy and Aleksandra Mojsilovi{\'c}",
title =        "Data Challenges in Disease Response: The 2014 {Ebola}
Outbreak and Beyond",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "5:1--5:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2742550",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Barnaghi:2015:CQD,
author =       "Payam Barnaghi and Maria Bermudez-Edo and Ralf
T{\"o}njes",
title =        "Challenges for Quality of Data in Smart Cities",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "6:1--6:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2747881",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Grant:2015:CLT,
author =       "Christan Earl Grant and Daisy Zhe Wang",
title =        "A Challenge for Long-Term Knowledge Base Maintenance",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "7:1--7:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2738044",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sha:2015:DQC,
author =       "Kewei Sha and Sherali Zeadally",
title =        "Data Quality Challenges in Cyber-Physical Systems",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "8:1--8:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2740965",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gennari:2015:CQT,
author =       "Rosella Gennari and Sara Tonelli and Pierpaolo
Vittorini",
title =        "Challenges in Quality of Temporal Data --- Starting
with Gold Standards",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "9:1--9:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2736699",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Basole:2015:DAC,
author =       "Rahul C. Basole and Mark L. Braunstein and Jimeng
Sun",
title =        "Data and Analytics Challenges for a Learning
Healthcare System",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "10:1--10:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2755489",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Todoran:2015:MEI,
author =       "Ion-George Todoran and Laurent Lecornu and Ali
Khenchaf and Jean-Marc {Le Caillec}",
title =        "A Methodology to Evaluate Important Dimensions of
Information Quality in Systems",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "11:1--11:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2744205",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Assessing the quality of the information proposed by
an information system has become one of the major
research topics in the last two decades. A quick
literature survey shows that a significant number of
information quality frameworks are proposed in
different domains of application: management
information systems, web information systems,
information fusion systems, and so forth.
Unfortunately, they do not provide a feasible
methodology that is both simple and intuitive to be
implemented in practice. In order to address this need,
methodology. Our methodology makes use of existing
frameworks and proposes a three-step process capable of
tracking the quality changes through the system. In the
first step and as a novelty compared to existing
studies, we propose decomposing the information system
module allows us to locally define the information
quality. Then, in the second step, we model each
processing module by a quality transfer function,
capturing the module's influence over the information
quality. In the third step, we make use of the previous
two steps in order to estimate the quality of the
entire information system. Thus, our methodology allows
informing the end-user on both output quality and local
quality. The proof of concept of our methodology has
been carried out considering two applications: an
automatic target recognition system and a diagnosis
coding support system.",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zarraga-Rodriguez:2015:EID,
author =       "Marta Zarraga-Rodriguez and M. Jesus Alvarez",
title =        "Experience: Information Dimensions Affecting
Employees' Perceptions Towards Being Well Informed",
journal =      j-JDIQ,
volume =       "6",
number =       "2--3",
pages =        "12:1--12:??",
month =        jul,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2774223",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Information is a strategic company resource, but there
is no consensus in the literature regarding the set of
dimensions to be considered when measuring the quality
of the information. Most measures of information
quality depend on user perception. Using multiple
correlation analysis, we obtain a model that allows us
to explain how information quality dimensions influence
information consumers' overall feeling of being well
informed. A set of dimensions that any measure of
information quality should at least include is
proposed. This exploratory study reports the results of
a research survey among managers of companies committed
to quality management within the framework of a Total
Quality Management (TQM) model, which is an
information-intensive management model.",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bartoli:2015:DQC,
author =       "Alberto Bartoli and Andrea {De Lorenzo} and Eric
Medvet and Fabiano Tarlao",
title =        "Data Quality Challenge: Toward a Tool for String
Processing by Examples",
journal =      j-JDIQ,
volume =       "6",
number =       "4",
pages =        "13:1--13:??",
month =        oct,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2786983",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ahlers:2015:DCQ,
author =       "Dirk Ahlers and John Krogstie",
title =        "Document and Corpus Quality Challenges for Knowledge
Management in Engineering Enterprises",
journal =      j-JDIQ,
volume =       "6",
number =       "4",
pages =        "14:1--14:??",
month =        oct,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2818379",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "14",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ramadan:2015:DSN,
author =       "Banda Ramadan and Peter Christen and Huizhi Liang and
Ross W. Gayler",
title =        "Dynamic Sorted Neighborhood Indexing for Real-Time
Entity Resolution",
journal =      j-JDIQ,
volume =       "6",
number =       "4",
pages =        "15:1--15:??",
month =        oct,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2816821",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Real-time Entity Resolution (ER) is the process of
matching query records in subsecond time with records
in a database that represent the same real-world
entity. Indexing techniques are generally used to
efficiently extract a set of candidate records from the
database that are similar to a query record, and that
are to be compared with the query record in more
detail. The sorted neighborhood indexing method, which
sorts a database and compares records within a sliding
window, has been successfully used for ER of large
static databases. However, because it is based on
static sorted arrays and is designed for batch ER that
resolves all records in a database rather than
resolving those relating to a single query record, this
technique is not suitable for real-time ER on dynamic
databases that are constantly updated. We propose a
tree-based technique that facilitates dynamic indexing
based on the sorted neighborhood method, which can be
used for real-time ER, and investigate both static and
adaptive window approaches. We propose an approach to
reduce query matching times by precalculating the
similarities between attribute values stored in
neighboring tree nodes. We also propose a multitree
solution where different sorting keys are used to
reduce the effects of errors and variations in
attribute values on matching quality by building
several distinct index trees. We experimentally
evaluate our proposed techniques on large real
datasets, as well as on synthetic data with different
data quality characteristics. Our results show that as
the index grows, no appreciable increase occurs in both
record insertion and query times, and that using
multiple trees gives noticeable improvements on
matching quality with only a small increase in query
time. Compared to earlier indexing techniques for
real-time ER, our approach achieves significantly
reduced indexing and query matching times while
maintaining high matching accuracy.",
acknowledgement = ack-nhfb,
articleno =    "15",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Coletti:2015:DCH,
author =       "Paolo Coletti and Maurizio Murgia",
title =        "Design and Construction of a Historical Financial
Database of the {Italian} Stock Market 1973--2011",
journal =      j-JDIQ,
volume =       "6",
number =       "4",
pages =        "16:1--16:??",
month =        oct,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2822898",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "designing and building a historical database of the
Italian Stock Market. The database contains daily
market data from 1973 to 2011 and is constructed by
merging two main digital sources and several other
hand-collected data sources. We analyzed and developed
semiautomatic tools to deal with problems related to
time-series matchings, quality of data, and numerical
errors. We also developed a concatenation structure to
allow the handling of company name changes, mergers,
and spin-offs without artificially altering numerical
series. At the same time, we maintained the
transparency of the historical information on each
individual company listed. Thanks to the overlapping of
digital and hand-collected data, the completed database
has a very high level of detail and accuracy. The
dataset is particularly suited for any empirical
research in financial economics and for more
practically oriented numerical applications and
forecasting simulations.",
acknowledgement = ack-nhfb,
articleno =    "16",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Missier:2015:CSI,
author =       "Paolo Missier",
title =        "Corrigendum to the Special Issue Editorial in {JDIQ}
Volume 5, Issue 3",
journal =      j-JDIQ,
volume =       "6",
number =       "4",
pages =        "17:1--17:??",
month =        oct,
year =         "2015",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2821019",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 27 22:10:29 MDT 2015",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "17",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chapman:2016:CQD,
author =       "Adriane P. Chapman and Arnon Rosenthal and Len
Seligman",
title =        "The Challenge of ``Quick and Dirty'' Information
Quality",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "1:1--1:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2834123",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Millar:2016:DQC,
author =       "Jeremy R. Millar and Douglas D. Hodson and Gilbert L.
Peterson and Darryl K. Ahner",
title =        "Data Quality Challenges in Distributed
Live-Virtual-Constructive Test Environments",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "2:1--2:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2850420",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lukyanenko:2016:IQR,
author =       "Roman Lukyanenko",
title =        "Information Quality Research Challenge: Information
Quality in the Age of Ubiquitous Digital
Intermediation",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "3:1--3:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2856038",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "As information technology becomes an integral part of
daily life, increasingly, people understand the world
around them by turning to digital sources as opposed to
directly interacting with objects in the physical
world. This has ushered in the age of Ubiquitous
Digital Intermediation (UDI). With the explosion of
UDI, the scope of Information Quality (IQ) research is
due to expand dramatically as the challenge becomes to
capture the wealth and nuances of human experience.
landscape brought about by UDI, including expansion of
the scope of traditional IQ dimensions, digital to
physical mapping challenge, and the increased need to
manage content authenticity. UDI generates many novel
questions and opportunities for the IQ research
community.",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zhu:2016:DSC,
author =       "Hongwei Zhu and Yang W. Lee and Arnon S. Rosenthal",
title =        "Data Standards Challenges for Interoperable and
Quality Data",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "4:1--4:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2903723",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ulbricht:2016:CCD,
author =       "Robert Ulbricht and Hilko Donker and Claudio Hartmann
and Martin Hahmann and Wolfgang Lehner",
title =        "Challenges for Context-Driven Time Series
Forecasting",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "5:1--5:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2896822",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Predicting time series is a crucial task for
organizations, since decisions are often based on
uncertain information. Many forecasting models are
designed from a generic statistical point of view.
However, each real-world application requires
results. All such specifics are summarized by the term
of context. In contrast to current approaches, we want
to integrate context as the primary driver in the
forecasting process. We introduce context-driven time
series forecasting focusing on two exemplary domains:
renewable energy and sparse sales data. In view of
this, we discuss the challenge of context integration
in the individual process steps.",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ceolin:2016:CUR,
author =       "Davide Ceolin and Paul Groth and Valentina Maccatrozzo
and Wan Fokkink and Willem Robert {Van Hage} and
Archana Nottamkandath",
title =        "Combining User Reputation and Provenance Analysis for
Trust Assessment",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "6:1--6:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2818382",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Trust is a broad concept that in many systems is often
reduced to user reputation alone. However, user
reputation is just one way to determine trust. The
estimation of trust can be tackled from other
perspectives as well, including by looking at
provenance. Here, we present a complete pipeline for
estimating the trustworthiness of artifacts given their
provenance and a set of sample evaluations. The
pipeline is composed of a series of algorithms for (1)
extracting relevant provenance features, (2) generating
stereotypes of user behavior from provenance features,
(3) estimating the reputation of both stereotypes and
users, (4) using a combination of user and stereotype
reputations to estimate the trustworthiness of
artifacts, and (5) selecting sets of artifacts to
trust. These algorithms rely on the W3C PROV
recommendations for provenance and on evidential
reasoning by means of subjective logic. We evaluate the
pipeline over two tagging datasets: tags and
evaluations from the Netherlands Institute for Sound
and Vision's Waisda? video tagging platform, as well as
crowdsourced annotations from the Steve.Museum project.
The approach achieves up to 85\% precision when
predicting tag trustworthiness. Perhaps more
importantly, the pipeline provides satisfactory results
using relatively little evidence through the use of
provenance.",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Christen:2016:ADA,
author =       "Peter Christen and Ross W. Gayler and Khoi-Nguyen Tran
and Jeffrey Fisher and Dinusha Vatsalan",
title =        "Automatic Discovery of Abnormal Values in Large
Textual Databases",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "7:1--7:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2889311",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Textual databases are ubiquitous in many application
domains. Examples of textual data range from names and
addresses of customers to social media posts and
bibliographic records. With online services,
individuals are increasingly required to enter their
personal details for example when purchasing products
online or registering for government services, while
many social network and e-commerce sites allow users to
post short comments. Many online sites leave open the
possibility for people to enter unintended or malicious
abnormal values, such as names with errors, bogus
values, profane comments, or random character
sequences. In other applications, such as online
bibliographic databases or comparative online shopping
sites, databases are increasingly populated in (semi-)
automatic ways through Web crawls. This practice can
result in low quality data being added automatically
techniques to automatically discover abnormal
(unexpected or unusual) values in large textual
databases. Following recent work in categorical outlier
detection, our assumption is that ``normal'' values are
those that occur frequently in a database, while an
individual abnormal value is rare. Our techniques are
unsupervised and address the challenge of discovering
abnormal values as an outlier detection problem. Our
first technique is a basic but efficient q-gram set
based technique, the second is based on a probabilistic
language model, and the third employs morphological
word features to train a one-class support vector
machine classifier. Our aim is to investigate and
develop techniques that are fast, efficient, and
automatic. The output of our techniques can help in the
development of rule-based data cleaning and information
extraction systems, or be used as training data for
further supervised data cleaning procedures. We
evaluate our techniques on four large real-world
datasets from different domains: two US voter
registration databases containing personal details, the
2013 KDD Cup dataset of bibliographic records, and the
SNAP Memetracker dataset of phrases from social
networking sites. Our results show that our techniques
can efficiently and automatically discover abnormal
textual values, allowing an organization to conduct
efficient data exploration, and improve the quality of
their textual databases without the need of requiring
explicit training data.",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Aiken:2016:ESD,
author =       "Peter Aiken",
title =        "{EXPERIENCE}: Succeeding at Data Management---{BigCo}
Attempts to Leverage Data",
journal =      j-JDIQ,
volume =       "7",
number =       "1--2",
pages =        "8:1--8:??",
month =        jun,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2893482",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In a manner similar to most organizations, BigCompany
(BigCo) was determined to benefit strategically from
its widely recognized and vast quantities of data.
(U.S. government agencies make regular visits to BigCo
to learn from its experiences in this area.) When faced
with an explosion in data volume, increases in
complexity, and a need to respond to changing
conditions, BigCo struggled to respond using a
approach to address these challenges. As BigCo was not
data knowledgeable, it did not realize that traditional
approaches could not work. Two full years into the
initiative, BigCo was far from achieving its initial
goals. How much more time, money, and effort would be
required before results were achieved? Moreover, could
the results be achieved in time to support a larger,
critical, technology-driven challenge that also
depended on solving the data challenges? While these
increase our collective understanding of data assets as
separate from IT projects. Only by reconceiving data as
a strategic asset can organizations begin to address
these new challenges. Transformation to a data-driven
culture requires far more than technology, which
remains just one of three required ``stool legs''
(people and process being the other two). Seven
prerequisites to effectively leveraging data are
necessary, but insufficient awareness exists in most
organizations-hence, the widespread misfires in these
areas, especially when attempting to implement the
so-called big data initiatives. Refocusing on
foundational data management practices is required for
all organizations, regardless of their organizational
or data strategies.",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chiang:2016:UDC,
author =       "Fei Chiang and Siddharth Sitaramachandran",
title =        "Unifying Data and Constraint Repairs",
journal =      j-JDIQ,
volume =       "7",
number =       "3",
pages =        "9:1--9:??",
month =        sep,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2883616",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Integrity constraints play an important role in data
design. However, in an operational database, they may
not be enforced for many reasons. Hence, over time,
data may become inconsistent with respect to the
constraints. To manage this, several approaches have
proposed techniques to repair the data by finding
minimal or lowest cost changes to the data that make it
consistent with the constraints. Such techniques are
appropriate for applications where only the data
changes, but schemas and their constraints remain
fixed. In many modern applications, however,
constraints may evolve over time as application or
business rules change, as data are integrated with new
data sources or as the underlying semantics of the data
evolves. In such settings, when an inconsistency
occurs, it is no longer clear if there is an error in
the data (and the data should be repaired) or if the
constraints have evolved (and the constraints should be
repaired). In this work, we present a novel unified
cost model that allows data and constraint repairs to
be compared on an equal footing. We consider repairs
over a database that is inconsistent with respect to a
set of rules, modeled as functional dependencies (FDs).
FDs are the most common type of constraint and are
known to play an important role in maintaining data
quality. We propose modifications to the data and to
the FDs such that the data and the constraints are
better aligned. We evaluate the quality and scalability
of our repair algorithms over synthetic and real
datasets. The results show that our repair algorithms
not only scale well for large datasets but also are
able to accurately capture and correct inconsistencies
and accurately decide when a data repair versus a
constraint repair is best.",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Maltese:2016:SAC,
author =       "Vincenzo Maltese and Fausto Giunchiglia",
title =        "Search and Analytics Challenges in Digital Libraries
and Archives",
journal =      j-JDIQ,
volume =       "7",
number =       "3",
pages =        "10:1--10:??",
month =        sep,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2939377",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gelernter:2016:COE,
author =       "J. Gelernter and J. Jha",
title =        "Challenges in Ontology Evaluation",
journal =      j-JDIQ,
volume =       "7",
number =       "3",
pages =        "11:1--11:??",
month =        sep,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2935751",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Berti-Equille:2016:VBD,
author =       "Laure Berti-Equille and Mouhamadou Lamine Ba",
title =        "Veracity of Big Data: Challenges of Cross-Modal Truth
Discovery",
journal =      j-JDIQ,
volume =       "7",
number =       "3",
pages =        "12:1--12:??",
month =        sep,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2935753",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Haralabopoulos:2016:CIC,
author =       "Giannis Haralabopoulos and Ioannis Anagnostopoulos and
Sherali Zeadally",
title =        "The Challenge of Improving Credibility of
User-Generated Content in Online Social Networks",
journal =      j-JDIQ,
volume =       "7",
number =       "3",
pages =        "13:1--13:??",
month =        sep,
year =         "2016",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/2899003",
ISSN =         "1936-1955",
bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In every environment of information exchange,
Information Quality (IQ) is considered one of the most
important issues. Studies in Online Social Networks
(OSNs) analyze a number of related subjects that span
both theoretical and practical aspects, from data
quality identification and simple attribute
classification to quality assessment models for various
social environments. Among several factors that affect
information quality in online social networks is the
credibility of user-generated content. To address this
challenge, some proposed solutions include
community-based evaluation and labeling of
user-generated content in terms of accuracy, clarity,
and timeliness, along with well-established real-time
data mining techniques.",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{DUrso:2016:EGD,
  author =       "Ciro D'Urso",
  title =        "{EXPERIENCE}: Glitches in Databases, How to Ensure
                 Data Quality by Outlier Detection Techniques",
  journal =      j-JDIQ,
  volume =       "7",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2950109",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:26 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Enterprise's archives are inevitably affected by the
                 presence of data quality problems (also called
                 glitches). This article presents a
                 new method to analyze the quality of datasets stored in
                 the tables of a database, with no knowledge of the
                 semantics of the data and without the need to define
                 repositories of rules. The proposed method is based on
                 proper revisions of different approaches for outlier
                 detection that are combined to boost overall
                 performance and accuracy. A novel transformation
                 algorithm is conceived that treats the items in
                 database tables as data points in real coordinate space
                 of n dimensions, so that fields containing dates and
                 fields containing text are processed to calculate
                 distances between those data points. The implementation
                 of an iterative approach ensures that global and local
                 outliers are discovered even if they are subject,
                 primarily in datasets with multiple outliers or
                 clusters of outliers, to masking and swamping effects.
                 The application of the method to a set of archives,
                 some of which have been studied extensively in the
                 literature, provides very promising experimental
                 results and outperforms the application of a single
                 other technique. Finally, a list of future research
                 directions is highlighted.",
  remark =       "A dropped line in the source abstract (between ``also
                 called'' and ``new method'') was reconstructed ---
                 verify against the publisher record for DOI
                 10.1145/2950109.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Labouseur:2017:IDD,
  author = {Alan G. Labouseur and Carolyn C. Matheus},
  title = {An Introduction to Dynamic Data Quality Challenges},
  journal = j-JDIQ,
  volume = {8},
  number = {2},
  pages = {6:1--6:??},
  month = feb,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/2998575},
  ISSN = {1936-1955},
  bibdate = {Sat Apr 8 09:38:27 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {6},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Becker:2017:CTD,
  author = {Christoph Becker and Kresimir Duretec and Andreas Rauber},
  title = {The Challenge of Test Data Quality in Data Processing},
  journal = j-JDIQ,
  volume = {8},
  number = {2},
  pages = {7:1--7:??},
  month = feb,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3012004},
  ISSN = {1936-1955},
  bibdate = {Sat Apr 8 09:38:27 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {7},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Ferro:2017:RCI,
  author = {Nicola Ferro},
  title = {Reproducibility Challenges in Information Retrieval
    Evaluation},
  journal = j-JDIQ,
  volume = {8},
  number = {2},
  pages = {8:1--8:??},
  month = feb,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3020206},
  ISSN = {1936-1955},
  bibdate = {Sat Apr 8 09:38:27 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {8},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Shankaranarayanan:2017:CCE,
  author =       "G. Shankaranarayanan and Roger Blake",
  title =        "From Content to Context: The Evolution and Growth of
                 Data Quality Research",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:??",
  month =        feb,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2996198",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:27 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Research in data and information quality has made
                 significant strides over the last 20 years. It has
                 become a unified body of knowledge incorporating
                 techniques, methods, and applications from a variety of
                 disciplines including information systems, computer
                 science, operations management, organizational
                 behavior, psychology, and statistics. With
                 organizations viewing ``Big Data'', social media data,
                 data-driven decision-making, and analytics as critical,
                 data quality has never been more important. We believe
                 that data quality research is reaching the threshold of
                 significant growth and a metamorphosis from focusing on
                 measuring and assessing data quality-content-toward a
                 focus on usage and context. At this stage, it is vital
                 to understand the identity of this research area in
                 order to recognize its current state and to effectively
                 identify an increasing number of research opportunities
                 within. Using Latent Semantic Analysis (LSA) to analyze
                 the abstracts of 972 peer-reviewed journal and
                 conference articles published over the past 20 years,
                 this study identifies the topics
                 and themes that define the identity of data quality
                 research. It further explores their trends over time,
                 pointing to the data quality dimensions that have-and
                 have not-been well-studied, and offering insights into
                 topics that may provide significant opportunities in
                 this area.",
  remark =       "Restored the missing opening quotes before ``Big
                 Data'' and a dropped line after ``past 20 years,'' in
                 the source abstract --- verify against the publisher
                 record for DOI 10.1145/2996198.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Goldberg:2017:PIS,
  author =       "Sean Goldberg and Daisy Zhe Wang and Christan Grant",
  title =        "A Probabilistically Integrated System for
                 Crowd-Assisted Text Labeling and Extraction",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:??",
  month =        feb,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3012003",
  ISSN =         "1936-1955",
  bibdate =      "Sat Apr 8 09:38:27 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "The amount of text data has been growing exponentially
                 in recent years, giving rise to automatic information
                 extraction methods that store text annotations in a
                 database. The current state-of-the-art structured
                 prediction methods, however, are likely to contain
                 errors and it is important to be able to manage the
                 overall uncertainty of the database. On the other hand,
                 the advent of crowdsourcing has enabled humans to aid
                 machine algorithms at scale. In this article, we
                 introduce pi-CASTLE, a system that optimizes and
                 integrates human and machine computing as applied to a
                 complex structured prediction problem involving
                 Conditional Random Fields (CRFs). We propose strategies
                 grounded in information theory to select a token
                 subset, formulate questions for the crowd to label, and
                 integrate these labelings back into the database using
                 a method of constrained inference. On both a text
                 segmentation task over citations and a named
                 entity recognition task over tweets we show an order of
                 magnitude improvement in accuracy gain over baseline
                 methods.",
  remark =       "Two dropped lines in the source abstract (after
                 ``humans to aid'' and inside ``On both a text \ldots{}
                 entity recognition task'') were reconstructed ---
                 verify against the publisher record for DOI
                 10.1145/3012003.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Woodall:2017:DRC,
  author = {Philip Woodall},
  title = {The Data Repurposing Challenge: New Pressures from
    Data Analytics},
  journal = j-JDIQ,
  volume = {8},
  number = {3--4},
  pages = {11:1--11:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3022698},
  ISSN = {1936-1955},
  bibdate = {Mon Oct 2 09:44:30 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {11},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Markovic:2017:CQS,
  author = {Milan Markovic and Peter Edwards},
  title = {The Challenge of Quality in Social Computation},
  journal = j-JDIQ,
  volume = {8},
  number = {3--4},
  pages = {12:1--12:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3041762},
  ISSN = {1936-1955},
  bibdate = {Mon Oct 2 09:44:30 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {12},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Al-Hussaini:2017:EIB,
  author =       "Leena Al-Hussaini",
  title =        "Experience: Insights into the Benchmarking Data of
                 {Hunspell} and {Aspell} Spell Checkers",
  journal =      j-JDIQ,
  volume =       "8",
  number =       "3--4",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3092700",
  ISSN =         "1936-1955",
  bibdate =      "Mon Oct 2 09:44:30 MDT 2017",
  bibsource =    "http://www.acm.org/pubs/contents/journals/jdqi/;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib;
                 http://www.math.utah.edu/pub/tex/bib/spell.bib",
  abstract =     "Hunspell is a morphological spell checker and
                 automatic corrector for Macintosh 10.6 and later
                 versions. Aspell is a general spell checker and
                 automatic corrector for the GNU operating system. In
                 this experience article, we present a benchmarking
                 study of the performance of Hunspell and Aspell. Ginger
                 is a general grammatical spell checker that is used as
                 a baseline to compare the performance of Hunspell and
                 Aspell. A benchmark dataset was carefully selected to
                 be a mixture of different error types at different word
                 length levels. Further, the benchmarking data are from
                 very bad spellers and will challenge any spell checker.
                 The extensive study described in this work will
                 characterize the respective softwares and benchmarking
                 data from multiple perspectives and will consider many
                 error statistics. Overall, Hunspell can correct 415/469
                 words and Aspell can correct 414/469 words. The
                 baseline Ginger can correct 279/469 words. We recommend
                 this dataset as the preferred benchmark dataset for
                 evaluating newly developed ``isolated word'' spell
                 checkers.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Abdellaoui:2017:QSD,
  author = {Sabrina Abdellaoui and Fahima Nader and Rachid Chalal},
  title = {{QDflows}: a System Driven by Knowledge Bases for
    Designing Quality-Aware Data flows},
  journal = j-JDIQ,
  volume = {8},
  number = {3--4},
  pages = {14:1--14:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3064173},
  ISSN = {1936-1955},
  bibdate = {Mon Oct 2 09:44:30 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  abstract = {In the big data era, data integration is becoming
    increasingly important. It is usually handled by data
    flows processes that extract, transform, and clean data
    from several sources, and populate the data integration
    system (DIS). Designing data flows is facing several
    issues such as (1) specifying a set of quality rules,
    (2) enforcing them on the data flow pipeline to detect
    violations, and (3) producing accurate repairs for the
    detected violations. We propose QDflows, a system for
    designing quality-aware data flows that considers the
    following as input: (1) a high-quality knowledge base
    (KB) as the global schema of integration, (2) a set of
    data sources and a set of validated users'
    requirements, (3) a set of defined mappings between
    data sources and the KB, and (4) a set of quality rules
    specified by users. QDflows uses an ontology to design
    the DIS schema. It offers the ability to define the DIS
    ontology as a module of the knowledge base, based on
    validated users' requirements. The DIS ontology model
    is then extended with multiple types of quality rules
    specified by users. QDflows extracts and transforms
    data from sources to populate the DIS. It detects
    violations of quality rules enforced on the data flows,
    constructs repair patterns, searches for horizontal and
    vertical matches in the knowledge base, and performs an
    automatic repair when possible or generates possible
    repairs. It interactively involves users to validate
    the DIS. Using real-life and synthetic datasets, the
    DBpedia and Yago knowledge bases, we experimentally
    evaluate the generality, effectiveness, and efficiency
    of QDflows. We also showcase an interactive tool
    implementing our system.},
  acknowledgement = ack-nhfb,
  articleno = {14},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{St-Maurice:2017:ECS,
  author = {Justin St-Maurice and Catherine Burns},
  title = {An Exploratory Case Study to Understand Primary Care
    Users and Their Data Quality Tradeoffs},
  journal = j-JDIQ,
  volume = {8},
  number = {3--4},
  pages = {15:1--15:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3058750},
  ISSN = {1936-1955},
  bibdate = {Mon Oct 2 09:44:30 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  abstract = {Primary care data is an important part of the evolving
    healthcare ecosystem. Generally, users in primary care
    are expected to provide excellent patient care and
    record high-quality data. In practice, users must
    balance sets of priorities regarding care and data. The
    goal of this study was to understand data quality
    and use among primary care users. As a case study, data
    quality measures and metrics are developed through a
    focus group session with managers. After calculating
    and extracting measurements of data quality from six
    years of historic data, each measure was modeled with
    logit binomial regression to show correlations,
    characterize tradeoffs, and investigate data quality
    interactions. Measures and correlations for
    completeness, use, and timeliness were calculated for
    196,967 patient encounters. Based on the analysis,
    there was a positive relationship between validity and
    completeness, and a negative relationship between
    timeliness and use. Use of data and reductions in entry
    delay were positively associated with completeness and
    validity. Our results suggest that if users are not
    provided with sufficient time to record data as part of
    their regular workflow, they will prioritize spending
    available time with patients. As a measurement of a
    primary care system's effectiveness, the negative
    correlation between use and timeliness points to a
    self-reinforcing relationship that provides users with
    little external value. In the future, additional data
    can be generated from comparable organizations to test
    several new hypotheses about primary care users.},
  acknowledgement = ack-nhfb,
  articleno = {15},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Wang:2017:DDR,
  author = {Jiannan Wang and Nan Tang},
  title = {Dependable Data Repairing with Fixing Rules},
  journal = j-JDIQ,
  volume = {8},
  number = {3--4},
  pages = {16:1--16:??},
  month = jul,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3041761},
  ISSN = {1936-1955},
  bibdate = {Mon Oct 2 09:44:30 MDT 2017},
  bibsource = {http://www.acm.org/pubs/contents/journals/jdqi/;
    http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  abstract = {One of the main challenges that data-cleaning systems
    face is to automatically identify and repair data
    errors in a dependable manner. Though data dependencies
    (also known as integrity constraints) have been widely
    studied to capture errors in data, automated and
    dependable data repairing on these errors has remained
    a notoriously difficult problem. In this work, we
    introduce an automated approach for dependably
    repairing data errors, based on a novel class of fixing
    rules. A fixing rule contains an evidence pattern, a
    set of negative patterns, and a fact value. The heart
    of fixing rules is deterministic: given a tuple, the
    evidence pattern and the negative patterns of a fixing
    rule are combined to precisely capture which attribute
    is wrong, and the fact indicates how to correct this
    error. We study several fundamental problems associated
    with fixing rules and establish their complexity. We
    develop efficient algorithms to check whether a set of
    fixing rules are consistent and discuss approaches to
    resolve inconsistent fixing rules. We also devise
    efficient algorithms for repairing data errors using
    fixing rules. Moreover, we discuss approaches on how to
    generate a large number of fixing rules from examples
    or available knowledge bases. We experimentally
    demonstrate that our techniques outperform other
    automated algorithms in terms of the accuracy of
    repairing data errors, using both real-life and
    synthetic data.},
  acknowledgement = ack-nhfb,
  articleno = {16},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Marcheggiani:2017:ELQ,
  author =       "Diego Marcheggiani and Fabrizio Sebastiani",
  title =        "On the Effects of Low-Quality Training Data on
                 Information Extraction from Clinical Reports",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3106235",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "In the last five years there has been a flurry of work
                 on information extraction from clinical documents, that
                 is, on algorithms capable of extracting, from the
                 informal and unstructured texts that are generated
                 during everyday clinical practice, mentions of concepts
                 relevant to such practice. Many of these research works
                 are about methods based on supervised learning, that
                 is, methods for training an information extraction
                 system from manually annotated examples. While a lot of
                 work has been devoted to devising learning methods that
                 generate more and more accurate information extractors,
                 no work has been devoted to investigating the effect of
                 the quality of training data on the learning process
                 for the clinical domain. Low quality in training data
                 often derives from the fact that the person who has
                 annotated the data is different from the one against
                 whose judgment the automatically annotated data must be
                 checked. In this work, we study the impact of such
                 data quality issues on the accuracy of information
                 extraction systems as applied to the clinical domain.
                 We do this by comparing the accuracy deriving from
                 training data annotated by the authoritative coder
                 (i.e., the one who has also annotated the test data and
                 by whose judgment we must abide) with the accuracy
                 deriving from training data annotated by a different
                 coder, equally expert in the subject matter. The
                 results indicate that, although the disagreement
                 between the two coders (as measured on the training
                 set) is substantial, the difference is (surprisingly
                 enough) not always statistically significant. While the
                 dataset used in the present work originated in a
                 clinical context, the issues we study in this work are
                 of more general interest.",
  remark =       "A dropped line in the source abstract (after ``must
                 be'') was reconstructed --- verify against the
                 publisher record for DOI 10.1145/3106235.",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Basheer:2017:CBQ,
  author =       "Aseel Basheer and Kewei Sha",
  title =        "Cluster-Based Quality-Aware Adaptive Data Compression
                 for Streaming Data",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3122863",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Wireless sensor networks (WSNs) are widely applied in
                 data collection applications. Energy efficiency is one
                 of the most important design goals of WSNs. In this
                 article, we examine the tradeoffs between the energy
                 efficiency and the data quality. First, four attributes
                 used to evaluate data quality are formally defined.
                 Then, we propose a novel data compression algorithm,
                 Quality-Aware Adaptive data Compression (QAAC), to
                 reduce the amount of data communication to save energy.
                 QAAC utilizes an adaptive clustering algorithm to build
                 clusters from dataset; then a code for each cluster is
                 generated and stored in a Huffman encoding tree. The
                 encoding algorithm encodes the original dataset based
                 on the Huffman encoding tree. An improvement algorithm
                 is also designed to reduce the information loss when
                 data are compressed. After the encoded data, the
                 Huffman encoding tree and parameters used in the
                 improvement algorithm have been received at the sink, a
                 decompression algorithm is used to retrieve the
                 approximation of the original dataset. The performance
                 evaluation shows that QAAC is efficient and achieves a
                 much higher compression ratio than lossy and lossless
                 compression algorithms, while it has much smaller
                 information loss than lossy compression algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Corsar:2017:COD,
  author =       "David Corsar and Peter Edwards",
  title =        "Challenges of Open Data Quality: More Than Just
                 License, Format, and Customer Support",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3110291",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:56 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  remark =       "The source entry had a truncated, unterminated title
                 field; the tail of the title was restored from the
                 publisher record for DOI 10.1145/3110291 --- verify.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{El-Mawass:2017:DQC,
  author = {Nour El-Mawass and Saad Alaboodi},
  title = {Data Quality Challenges in Social Spam Research},
  journal = j-JDIQ,
  volume = {9},
  number = {1},
  pages = {4:1--4:??},
  month = oct,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3090057},
  ISSN = {1936-1955},
  bibdate = {Mon Jan 22 16:07:56 MST 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {4},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Chen:2017:IQC,
  author = {Min Chen and Roman Lukyanenko and Monica Chiarini
    Tremblay},
  title = {Information Quality Challenges in Shared Healthcare
    Decision Making},
  journal = j-JDIQ,
  volume = {9},
  number = {1},
  pages = {5:1--5:??},
  month = oct,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3090056},
  ISSN = {1936-1955},
  bibdate = {Mon Jan 22 16:07:56 MST 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {5},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Arbuckle:2017:CPC,
  author = {Peter Arbuckle and Ezra Kahn and Adam Kriesberg},
  title = {Challenge Paper: Challenges to Sharing Data and Models
    for Life Cycle Assessment},
  journal = j-JDIQ,
  volume = {9},
  number = {1},
  pages = {6:1--6:??},
  month = oct,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3106236},
  ISSN = {1936-1955},
  bibdate = {Mon Jan 22 16:07:56 MST 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {6},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Raschid:2018:ECJ,
  author = {Louiqa Raschid},
  title = {{Editor-in-Chief (January 2014--May 2017)} Farewell
    Report},
  journal = j-JDIQ,
  volume = {9},
  number = {2},
  pages = {7:1--7:??},
  month = jan,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3143313},
  ISSN = {1936-1955},
  bibdate = {Mon Jan 22 16:07:57 MST 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {7},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Catarci:2018:FNJ,
  author = {Tiziana Catarci},
  title = {Foreword from the New {JDIQ Editor-in-Chief}},
  journal = j-JDIQ,
  volume = {9},
  number = {2},
  pages = {8:1--8:??},
  month = jan,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3143316},
  ISSN = {1936-1955},
  bibdate = {Mon Jan 22 16:07:57 MST 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {8},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Truong:2018:CEQ,
  author = {Hong-Linh Truong and Aitor Murguzur and Erica Yang},
  title = {Challenges in Enabling Quality of Analytics in the
    Cloud},
  journal = j-JDIQ,
  volume = {9},
  number = {2},
  pages = {9:1--9:??},
  month = jan,
  year = {2018},
  CODEN = {????},
  DOI = {https://doi.org/10.1145/3138806},
  ISSN = {1936-1955},
  bibdate = {Mon Jan 22 16:07:57 MST 2018},
  bibsource = {http://www.math.utah.edu/pub/tex/bib/jdiq.bib},
  acknowledgement = ack-nhfb,
  articleno = {9},
  fjournal = {Journal of Data and Information Quality (JDIQ)},
  journal-URL = {http://portal.acm.org/browse_dl.cfm?idx=J1191},
}

@Article{Koh:2018:ELA,
  author =       "Kyu Han Koh and Eric Fouh and Mohammed F. Farghally
                 and Hossameldin Shahin and Clifford A. Shaffer",
  title =        "Experience: Learner Analytics Data Quality for an
                 {eTextbook} System",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3148240",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "We present lessons learned related to data collection
                 and analysis from 5 years of experience with the
                 eTextbook system OpenDSA. The use of such cyberlearning
                 systems is expanding rapidly in both formal and
                 informal educational settings. Although the precise
                 issues related to any such project are idiosyncratic
                 based on the data collection technology and goals of
                 the project, certain types of data collection problems
                 will be common. We begin by describing the nature of
                 the data transmitted between the student's client
                 machine and the database server, and our initial
                 database schema for storing interaction log data. We
                 describe many problems that we encountered, with the
                 nature of the problems categorized as syntactic-level
                 data collection issues, issues with relating events to
                 users, or issues with tracking users over time.
                 Relating events to users and tracking the time spent on
                 tasks are both prerequisites to converting
                 syntactic-level interaction streams to semantic-level
                 behavior needed for higher-order analysis of the data.
                 Finally, we describe changes made to our database
                 schema that helped to resolve many of the issues that
                 we encountered, with the
                 ultimate goal of encouraging a change from ineffective
                 learning behavior by students to more productive
                 behavior.",
  remark =       "A dropped line in the source abstract (after ``issues
                 that'') was reconstructed --- verify against the
                 publisher record for DOI 10.1145/3148240.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Cappiello:2018:VDQ,
  author =       "C. Cappiello and C. Cerletti and C. Fratto and B.
                 Pernici",
  title =        "Validating Data Quality Actions in Scoring Processes",
  journal =      j-JDIQ,
  volume =       "9",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3141248",
  ISSN =         "1936-1955",
  bibdate =      "Mon Jan 22 16:07:57 MST 2018",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
  abstract =     "Data quality has gained momentum among organizations
                 upon the realization that poor data quality might cause
                 failures and/or inefficiencies, thus compromising
                 business processes and application results. However,
                 enterprises often adopt data quality assessment and
                 improvement methods based on practical and empirical
                 approaches without conducting a rigorous analysis of
                 the data quality issues and outcome of the enacted data
                 quality improvement practices. In particular, data
                 quality management, especially the identification of
                 the data quality dimensions to be monitored and
                 improved, is performed by knowledge workers on the
                 basis of their skills and experience. Control methods
                 are therefore designed on the basis of expected and
                 evident quality problems; thus, these methods may not
                 be effective in dealing with unknown and/or unexpected
                 problems. In this article, we propose an approach,
                 based on fault injection, for validating the data
                 quality actions used by organizations. We show how it
                 is possible to check whether the adopted techniques
                 properly monitor the real issues that may damage
                 business processes. At this stage, we focus on scoring
                 processes, i.e., those in which the output represents
                 the evaluation or ranking of a specific object. We show
                 the effectiveness of our proposal by means of a case
                 study in the financial risk management area.",
  remark =       "A dropped line in the source abstract (between
                 ``unexpected'' and ``based on fault injection'') was
                 reconstructed --- verify against the publisher record
                 for DOI 10.1145/3141248.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "Journal of Data and Information Quality (JDIQ)",
  journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Heinrich:2018:RDQ,
author =       "Bernd Heinrich and Diana Hristova and Mathias Klier
and Alexander Schiller and Michael Szubartowicz",
title =        "Requirements for Data Quality Metrics",
journal =      j-JDIQ,
volume =       "9",
number =       "2",
pages =        "12:1--12:??",
month =        jan,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3148238",
ISSN =         "1936-1955",
bibdate =      "Mon Jan 22 16:07:57 MST 2018",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Data quality and especially the assessment of data
quality have been intensively discussed in research and
practice alike. To support an economically oriented
management of data quality and decision making under
uncertainty, it is essential to assess the data quality
level by means of well-founded metrics. However, if not
adequately defined, such metrics can lead to wrong
decisions and economic losses. Therefore, based on a
decision-oriented framework, we present a set of five
requirements for data quality metrics. These
requirements are relevant for a metric that aims to
support an economically oriented management of data
quality and decision making under uncertainty. We
further demonstrate the applicability and efficacy of
these requirements by evaluating five data quality
metrics for different data quality dimensions.
Moreover, we discuss practical implications when
applying the presented requirements.",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Geerts:2018:ESI,
author =       "Floris Geerts and Paolo Missier and Norman Paton",
title =        "Editorial: Special Issue on Improving the Veracity and
Value of Big Data",
journal =      j-JDIQ,
volume =       "9",
number =       "3",
pages =        "13:1--13:??",
month =        mar,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3174791",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bertossi:2018:OMD,
author =       "Leopoldo Bertossi and Mostafa Milani",
title =        "Ontological Multidimensional Data Models and
Contextual Data Quality",
journal =      j-JDIQ,
volume =       "9",
number =       "3",
pages =        "14:1--14:??",
month =        mar,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3148239",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Data quality assessment and data cleaning are
context-dependent activities. Motivated by this
observation, we propose the Ontological
Multidimensional Data Model (OMD model), which can be
used to model and represent contexts as logic-based
ontologies. The data under assessment are mapped into
the context for additional analysis, processing, and
quality data extraction. The resulting contexts allow
for the representation of dimensions, and
multidimensional data quality assessment becomes
possible. At the core of a multidimensional context, we
include a generalized multidimensional data model and a
Datalog$^\pm$ ontology with provably good properties
in terms of query answering. These main components are
used to represent dimension hierarchies, dimensional
constraints, and dimensional rules and define
predicates for quality data specification. Query
dimension hierarchies and becomes the basic tool for
the extraction of quality data. The OMD model is
interesting per se beyond applications to data quality.
It allows for a logic-based and computationally
tractable representation of multidimensional data,
extending previous multidimensional data models with
additional expressive power and functionalities.",
acknowledgement = ack-nhfb,
articleno =    "14",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Mountantonakis:2018:SMM,
author =       "Michalis Mountantonakis and Yannis Tzitzikas",
title =        "Scalable Methods for Measuring the Connectivity and
Quality of Large Numbers of Linked Datasets",
journal =      j-JDIQ,
volume =       "9",
number =       "3",
pages =        "15:1--15:??",
month =        mar,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3165713",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Although the ultimate objective of Linked Data is
linking and integration, it is not currently evident
how connected the current Linked Open Data (LOD) cloud
special indexes and algorithms, for performing
measurements related to the connectivity of more than
two datasets that are useful in various tasks including
(a) Dataset Discovery and Selection; (b) Object
Coreference, i.e., for obtaining complete information
about a set of entities, including provenance
information; (c) Data Quality Assessment and
Improvement, i.e., for assessing the connectivity
between any set of datasets and monitoring their
evolution over time, as well as for estimating data
veracity; (d) Dataset Visualizations; and various other
tasks. Since it would be prohibitively expensive to
perform all these measurements in a na{\"\i}ve way, in
construction algorithms) that can speed up such tasks.
In brief, we introduce (i) a namespace-based prefix
index, (ii) a sameAs catalog for computing the
symmetric and transitive closure of the owl:sameAs
relationships encountered in the datasets, (iii) a
semantics-aware element index (that exploits the
aforementioned indexes), and, finally, (iv) two
lattice-based incremental algorithms for speeding up
the computation of the intersection of URIs of any set
of datasets. For enhancing scalability, we propose
parallel index construction algorithms and parallel
lattice-based incremental algorithms, we evaluate the
achieved speedup using either a single machine or a
cluster of machines, and we provide insights regarding
the factors that affect efficiency. Finally, we report
measurements about the connectivity of the (billion
triples-sized) LOD cloud that have never been carried
out so far.",
acknowledgement = ack-nhfb,
articleno =    "15",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Esteves:2018:TVA,
author =       "Diego Esteves and Anisa Rula and Aniketh Janardhan
Reddy and Jens Lehmann",
title =        "Toward Veracity Assessment in {RDF} Knowledge Bases:
an Exploratory Analysis",
journal =      j-JDIQ,
volume =       "9",
number =       "3",
pages =        "16:1--16:??",
month =        mar,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3177873",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Among different characteristics of knowledge bases,
data quality is one of the most relevant to maximize
the benefits of the provided information. Knowledge
base quality assessment poses a number of big data
challenges such as high volume, variety, velocity, and
veracity. In this article, we address the research
questions related to the assessment of the veracity of
facts through Deep Fact Validation (DeFacto), a triple
validation framework designed to assess facts in RDF
knowledge bases. Despite current developments in the
research area, the underlying framework faces many
issues and conducts a thorough analysis of its
pipeline, aiming at reducing the error propagation
through its components. Furthermore, we discuss recent
developments related to this fact validation as well as
describing advantages and drawbacks of state-of-the-art
models. As a result of this exploratory analysis, we
give insights and directions toward a better
architecture to tackle the complex task of
fact-checking in knowledge bases.",
acknowledgement = ack-nhfb,
articleno =    "16",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chen:2018:CAS,
author =       "Qingyu Chen and Yu Wan and Xiuzhen Zhang and Yang Lei
and Justin Zobel and Karin Verspoor",
title =        "Comparative Analysis of Sequence Clustering Methods
for Deduplication of Biological Databases",
journal =      j-JDIQ,
volume =       "9",
number =       "3",
pages =        "17:1--17:??",
month =        mar,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3131611",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "The massive volumes of data in biological sequence
databases provide a remarkable resource for large-scale
biological studies. However, the underlying data
quality of these resources is a critical concern. A
particular challenge is duplication, in which multiple
records have similar sequences, creating a high level
of redundancy that impacts database storage, curation,
and search. Biological database deduplication has two
direct applications: for database curation, where
detected duplicates are removed to improve curation
efficiency, and for database search, where detected
duplicate sequences may be flagged but remain available
to support analysis. Clustering methods have been
widely applied to biological sequences for database
deduplication. Since an exhaustive all-by-all pairwise
comparison of sequences cannot scale for a high volume
of data, heuristic approaches have been recruited, such
as the use of simple similarity thresholds. In this
article, we present a comparison between CD-HIT and
UCLUST, the two best-known clustering tools for
sequence database deduplication. Our contributions
include a detailed assessment of the redundancy
remaining after deduplication, application of standard
clustering evaluation metrics to quantify the cohesion
and separation of the clusters generated by each
method, and a biological case study that assesses
intracluster function annotation consistency to
demonstrate the impact of these factors on a practical
application of the sequence clustering methods. Our
results show that the trade-off between efficiency and
accuracy becomes acute when low threshold values are
used and when cluster sizes are large. This evaluation
leads to practical recommendations for users for more
effective uses of the sequence clustering tools for
deduplication.",
acknowledgement = ack-nhfb,
articleno =    "17",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Gal:2018:CPD,
author =       "Avigdor Gal and Arik Senderovich and Matthias
Weidlich",
title =        "Challenge Paper: Data Quality Issues in Queue Mining",
journal =      j-JDIQ,
volume =       "9",
number =       "4",
pages =        "18:1--18:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3165712",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "18",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Musyaffa:2018:EOF,
author =       "Fathoni A. Musyaffa and Christiane Engels and
Maria-Esther Vidal and Fabrizio Orlandi and S{\"o}ren
Auer",
title =        "Experience: Open Fiscal Datasets, Common Issues, and
Recommendations",
journal =      j-JDIQ,
volume =       "9",
number =       "4",
pages =        "19:1--19:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3190576",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Public administrations are continuously publishing
open data, increasing the amount of government open
data over time. The published data includes budgets and
spending as part of fiscal data; publishing these data
is an important part of transparent and accountable
governance. However, open fiscal data should also meet
open data publication guidelines. When requirements in
data guidelines are not met, effective data analysis
over published datasets cannot be performed
effectively. In this article, we present Open Fiscal
Data Publication (OFDP), a framework to assess the
quality of open fiscal datasets. We also present an
extensive open fiscal data assessment and common data
quality issues found; additionally, open fiscal data
publishing guidelines are presented. We studied and
surveyed main quality factors for open fiscal datasets.
Moreover, the collected quality factors have been
scored according to the results of a questionnaire to
score quality factors within the OFDP assessment
framework. We gather and comprehensively analyze a
representative set of 77 fiscal datasets from several
public administrations across different regions at
different levels (e.g., supranational, national,
municipality). We characterize quality issues commonly
arising in these datasets. Our assessment shows that
there are many quality factors in fiscal data
publication that still need to be taken care of so that
the data can be analyzed effectively. Our proposed
guidelines allow for publishing open fiscal data where
these quality issues are avoided.",
acknowledgement = ack-nhfb,
articleno =    "19",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Alshayeb:2018:SSP,
author =       "Mohammad Alshayeb and Yasser Shaaban and Jarallah
Al-Ghamdi",
title =        "{SPMDL}: Software Product Metrics Definition
Language",
journal =      j-JDIQ,
volume =       "9",
number =       "4",
pages =        "20:1--20:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3185049",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Software metrics are becoming more acceptable measures
for software quality assessment. However, there is no
standard form to represent metric definitions, which
would be useful for metrics exchange and customization.
In this article, we propose the Software Product
Metrics Definition Language (SPMDL). We develop an
XML-based description language to define software
metrics in a precise and reusable form. Metric
definitions in SPMDL are based on meta-models extracted
from either source code or design artifacts, such as
the Dagstuhl Middle Meta-model, with support for
various abstraction levels. The language defines
several flexible computation mechanisms, such as
extended Object Constraint Language queries and
predefined graph operations on the meta-model. SPMDL
provides an unambiguous description of the metric
definition; it is also easy to use and is extensible.",
acknowledgement = ack-nhfb,
articleno =    "20",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ashish:2018:MRB,
author =       "Naveen Ashish and Arihant Patawari",
title =        "Machine Reading of Biomedical Data Dictionaries",
journal =      j-JDIQ,
volume =       "9",
number =       "4",
pages =        "21:1--21:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3177874",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "In this article, we address the problem of machine
reading of biomedical data dictionaries. Automated
reading is the process of extracting element details
for each of the data elements from a data dictionary in
a document format (such as PDF) to a completely
structured representation. A structured representation
is essential if the data dictionary metadata are to be
used in applications such as data integration and also
in evaluating the quality of the associated data. We
present an approach and implemented solution for the
problem, considering different formats of data
dictionaries. We have a particular focus on the most
challenging format with a machine-learning
classification solution to the problem using
conditional random field classifiers. We present an
evaluation using several actual data dictionaries,
demonstrating the effectiveness of our approach.",
acknowledgement = ack-nhfb,
articleno =    "21",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Chiang:2018:IPS,
author =       "Fei Chiang and Dhruv Gairola",
title =        "{InfoClean}: Protecting Sensitive Information in Data
Cleaning",
journal =      j-JDIQ,
volume =       "9",
number =       "4",
pages =        "22:1--22:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3190577",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:58 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Data quality has become a pervasive challenge for
organizations as they wrangle with large, heterogeneous
datasets to extract value. Given the proliferation of
sensitive and confidential information, it is crucial
to consider data privacy concerns during the data
cleaning process. For example, in medical database
applications, varying levels of privacy are enforced
across the attribute values. Attributes such as a
patient's country or city of residence may be less
sensitive than the patient's prescribed medication.
Traditional data cleaning techniques assume the data is
openly accessible, without considering the differing
levels of information sensitivity. In this work, we
take the first steps toward a data cleaning model that
integrates privacy as part of the data cleaning
process. We present a privacy-aware data cleaning
framework that differentiates the information content
among the attribute values during the data cleaning
process to resolve data inconsistencies while
minimizing the amount of information disclosed. Our
data repair algorithm includes a set of data disclosure
operations that considers the information content of
the underlying attribute values, while maximizing data
utility. Our evaluation using real datasets shows that
our algorithm scales well, and achieves improved
performance and comparable repair accuracy against
existing data cleaning solutions.",
acknowledgement = ack-nhfb,
articleno =    "22",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bertino:2018:ACE,
author =       "Elisa Bertino and Mohammad R. Jahanshahi",
title =        "Adaptive and Cost-Effective Collection of High-Quality
Data for Critical Infrastructure and Emergency
Management in Smart Cities-Framework and Challenges",
journal =      j-JDIQ,
volume =       "10",
number =       "1",
pages =        "1:1--1:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3190579",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Flores:2018:IQA,
author =       "Javier Flores and Jun Sun",
title =        "Information Quality Awareness and Information Quality
Practice",
journal =      j-JDIQ,
volume =       "10",
number =       "1",
pages =        "2:1--2:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3182182",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Healthcare organizations increasingly rely on
electronic information to optimize their operations.
Information of high diversity from various sources
accentuate the relevance and importance of information
quality (IQ). The quality of information needs to be
improved to support a more efficient and reliable
utilization of healthcare information systems (IS).
This can only be achieved through the implementation of
initiatives followed by most users across an
organization. The purpose of this study is to examine
how awareness of IS users about IQ issues would affect
their IQ behavior. Based on multiple theoretical
frameworks, it is hypothesized that different aspects
of user motivation mediate the relationship between the
awareness on both beneficial and problematic situations
and IQ practice inclination. In addition, social
influence and facilitating condition moderate the
relationship between IQ practice inclination and overt
IQ practice. The theoretical and practical implications
of findings are discussed, especially how to enhance IQ
compliance in the healthcare settings.",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bors:2018:VIC,
author =       "Christian Bors and Theresia Gschwandtner and Simone
Kriglstein and Silvia Miksch and Margit Pohl",
title =        "Visual Interactive Creation, Customization, and
Analysis of Data Quality Metrics",
journal =      j-JDIQ,
volume =       "10",
number =       "1",
pages =        "3:1--3:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3190578",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "During data preprocessing, analysts spend a
significant part of their time and effort profiling the
quality of the data along with cleansing and
transforming the data for further analysis. While
quality metrics-ranging from general to domain-specific
measures-support assessment of the quality of a
dataset, there are hardly any approaches to visually
support the analyst in customizing and applying such
metrics. Yet, visual approaches could facilitate users'
involvement in data quality assessment. We present
MetricDoc, an interactive environment for assessing
data quality that provides customizable, reusable
quality metrics in combination with immediate visual
feedback. Moreover, we provide an overview
visualization of these quality metrics along with error
of the data to determine the causes of quality issues
architecture, design, and evaluation of MetricDoc,
which underwent several design cycles, including
heuristic evaluation and expert reviews as well as a
focus group with data quality, human-computer
interaction, and visual analytics experts.",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zhang:2018:ASB,
author =       "Han Zhang and Shawndra Hill and David Rothschild",
title =        "Addressing Selection Bias in Event Studies with
General-Purpose Social Media Panels",
journal =      j-JDIQ,
volume =       "10",
number =       "1",
pages =        "4:1--4:??",
month =        may,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3185048",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Data from Twitter have been employed in prior research
to study the impacts of events. Conventionally,
researchers use keyword-based samples of tweets to
create a panel of Twitter users who mention
event-related keywords during and after an event.
However, the keyword-based sampling is limited in its
objectivity dimension of data and information quality.
First, the technique suffers from selection bias since
users who discuss an event are already more likely to
discuss event-related topics beforehand. Second, there
are no viable control groups for comparison to a
keyword-based sample of Twitter users. We propose an
alternative sampling approach to construct panels of
users defined by their geolocation. Geolocated panels
are exogenous to the keywords in users' tweets,
resulting in less selection bias than the keyword panel
method. Geolocated panels allow us to follow
within-person changes over time and enable the creation
of comparison groups. We compare different panels in
two real-world settings: response to mass shootings and
TV advertising. We first show the strength of the
selection biases of keyword panels. Then, we
empirically illustrate how geolocated panels reduce
selection biases and allow meaningful comparison groups
regarding the impact of the studied events. We are the
first to provide a clear, empirical example of how a
better panel selection design, based on an exogenous
variable such as geography, both reduces selection bias
compared to the current state of the art and increases
the value of Twitter research for studying events.
While we advocate for the use of a geolocated panel, we
also discuss its weaknesses and application scenario
importance of selection bias in impacting the
objectivity of social media data.",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Puentes:2018:CQE,
author =       "John Puentes and Pedro Merino Laso and David Brosset",
title =        "The Challenge of Quality Evaluation in Fraud
Detection",
journal =      j-JDIQ,
volume =       "10",
number =       "2",
pages =        "5:1--5:??",
month =        sep,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3228341",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3228341",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bertino:2018:CAC,
author =       "Elisa Bertino and Amani Abu Jabal and Seraphin Calo
and Dinesh Verma and Christopher Williams",
title =        "The Challenge of Access Control Policies Quality",
journal =      j-JDIQ,
volume =       "10",
number =       "2",
pages =        "6:1--6:??",
month =        sep,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3209668",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3209668",
abstract =     "Access Control policies allow one to control data
sharing among multiple subjects. For high assurance
data security, it is critical that such policies be fit
for their purpose. In this paper we introduce the
notion of ``policy quality'' and elaborate on its many
dimensions, such as consistency, completeness, and
minimality. We introduce a framework supporting the
analysis of policies with respect to the introduced
quality dimensions and elaborate on research
challenges, including policy analysis for large-scale
distributed systems, assessment of policy correctness,
and analysis of policies expressed in richer policy
models.",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Karanja:2018:CPT,
author =       "Evanson Mwangi Karanja and Shedden Masupe and Mandu
Gasennelwe-Jeffrey",
title =        "Challenge Paper: Towards Open Datasets for {Internet
of Things} Malware",
journal =      j-JDIQ,
volume =       "10",
number =       "2",
pages =        "7:1--7:??",
month =        sep,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3230669",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Koumarelas:2018:EEA,
author =       "Ioannis Koumarelas and Axel Kroschk and Clifford
Mosley and Felix Naumann",
title =        "Experience: Enhancing Address Matching with Geocoding
and Similarity Measure Selection",
journal =      j-JDIQ,
volume =       "10",
number =       "2",
pages =        "8:1--8:??",
month =        sep,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3232852",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Given a query record, record matching is the problem
of finding database records that represent the same
real-world object. In the easiest scenario, a database
record is completely identical to the query. However,
in most cases, problems do arise, for instance, as a
result of data errors or data integrated from multiple
sources or received from restrictive form fields. These
problems are usually difficult, because they require a
variety of actions, including field segmentation,
decoding of values, and similarity comparisons, each
study the problem of matching records that contain
address information, including attributes such as
Street-address and City. To facilitate this matching
process, we propose a domain-specific procedure to,
first, enrich each record with a more complete
representation of the address information through
geocoding and reverse-geocoding and, second, to select
the best similarity measure per each address attribute
that will finally help the classifier to achieve the
best f-measure. We report on our experience in
selecting geocoding services and discovering similarity
measures for a concrete but common industry use-case.",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ferro:2018:ISIa,
author =       "Nicola Ferro and Norbert Fuhr and Andreas Rauber",
title =        "Introduction to the Special Issue on Reproducibility
in Information Retrieval: Evaluation Campaigns,
Collections, and Analyses",
journal =      j-JDIQ,
volume =       "10",
number =       "3",
pages =        "9:1--9:??",
month =        oct,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3268408",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Moffat:2018:EMU,
author =       "Alistair Moffat and Falk Scholer and Ziying Yang",
title =        "Estimating Measurement Uncertainty for Information
Retrieval Effectiveness Metrics",
journal =      j-JDIQ,
volume =       "10",
number =       "3",
pages =        "10:1--10:??",
month =        oct,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3239572",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3239572",
abstract =     "One typical way of building test collections for
offline measurement of information retrieval systems is
to pool the ranked outputs of different systems down to
some chosen depth d and then form relevance judgments
for those documents only. Non-pooled documents-ones
that did not appear in the top- d sets of any of the
contributing systems-are then deemed to be non-relevant
for the purposes of evaluating the relative behavior of
residuals to re-examine the reliability of that
process. By fitting the RBP parameter $\phi$ to
maximize similarity between AP- and NDCG-induced system
rankings, on the one hand, and RBP-induced rankings, on
the other, an estimate can be made as to the potential
score uncertainty associated with those two
recall-based metrics. We then consider the effect that
residual size-as an indicator of possible measurement
uncertainty in utility-based metrics-has in connection
with recall-based metrics by computing the effect of
increasing pool sizes and examining the trends that
arise in terms of both metric score and system
separability using standard statistical tests. The
experimental results show that the confidence levels
expressed via the p -values generated by statistical
tests are only weakly connected to the size of the
residual and to the degree of measurement uncertainty
caused by the presence of unjudged documents.
Statistical confidence estimates are, however, largely
consistent as pooling depths are altered. We therefore
recommend that all such experimental results should
report, in addition to the outcomes of statistical
significance tests, the residual measurements generated
by a suitably matched weighted-precision metric, to
give a clear indication of measurement uncertainty that
arises due to the presence of unjudged documents in
test collections with finite pooled judgments.",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Roitero:2018:RGE,
author =       "Kevin Roitero and Marco Passon and Giuseppe Serra and
Stefano Mizzaro",
title =        "{Reproduce}. {Generalize}. {Extend}. {On} Information
Retrieval Evaluation without Relevance Judgments",
journal =      j-JDIQ,
volume =       "10",
number =       "3",
pages =        "11:1--11:??",
month =        oct,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3241064",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3241064",
abstract =     "The evaluation of retrieval effectiveness by means of
test collections is a commonly used methodology in the
information retrieval field. Some researchers have
addressed the quite fascinating research question of
whether it is possible to evaluate effectiveness
completely automatically, without human relevance
assessments. Since human relevance assessment is one of
the main costs of building a test collection, both in
human time and money resources, this rather ambitious
reproduce the main results on evaluating information
retrieval systems without relevance judgments;
furthermore, we generalize such previous work to
analyze the effect of test collections, evaluation
metrics, and pool depth. We also expand the idea to
semi-automatic evaluation and estimation of topic
difficulty. Our results show that (i) previous work is
overall reproducible, although some specific results
are not; (ii) collection, metric, and pool depth impact
the automatic evaluation of systems, which is anyway
accurate in several cases; (iii) semi-automatic
evaluation is an effective methodology; and (iv)
automatic evaluation can (to some extent) be used to
predict topic difficulty.",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Roitero:2018:RIE,
author =       "Kevin Roitero and Michael Soprano and Andrea Brunello
and Stefano Mizzaro",
title =        "Reproduce and Improve: an Evolutionary Approach to
Select a Few Good Topics for Information Retrieval
Evaluation",
journal =      j-JDIQ,
volume =       "10",
number =       "3",
pages =        "12:1--12:??",
month =        oct,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3239573",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3239573",
abstract =     "Effectiveness evaluation of information retrieval
systems by means of a test collection is a widely used
methodology. However, it is rather expensive in terms
of resources, time, and money; therefore, many
researchers have proposed methods for a cheaper
evaluation. One particular approach, on which we focus
initiatives, usually system effectiveness is evaluated
as the average effectiveness on a set of n topics
(usually, n =50, but more than 1,000 have been also
proposed to find the best subsets of a few good topics
that evaluate the systems in the most similar way to
the full set. The computational complexity of the task
has so far limited the analysis that has been
performed. We develop a novel and efficient approach
based on a multi-objective evolutionary algorithm. The
higher efficiency of our new implementation allows us
to reproduce some notable results on topic set
reduction, as well as perform new experiments to
generalize and improve such results. We show that our
approach is able to both reproduce the main
state-of-the-art results and to allow us to analyze the
effect of the collection, metric, and pool depth used
for the evaluation. Finally, differently from previous
studies, which have been mainly theoretical, we are
also able to discuss some practical topic selection
strategies, integrating results of automatic evaluation
approaches.",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Jagerman:2018:OLL,
author =       "Rolf Jagerman and Krisztian Balog and Maarten {De
Rijke}",
title =        "{OpenSearch}: Lessons Learned from an Online
Evaluation Campaign",
journal =      j-JDIQ,
volume =       "10",
number =       "3",
pages =        "13:1--13:??",
month =        oct,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3239575",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:16:59 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3239575",
abstract =     "We report on our experience with TREC OpenSearch, an
online evaluation campaign that enabled researchers to
evaluate their experimental retrieval methods using
real users of a live website. Specifically, we focus on
academic search domain, and work with two search
engines, CiteSeerX and SSOAR, that provide us with
traffic. We describe our experimental platform, which
is based on the living labs methodology, and report on
the experimental results obtained. We also share our
experiences, challenges, and the lessons learned from
running this track in 2016 and 2017.",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ferro:2018:ISIb,
author =       "Nicola Ferro and Norbert Fuhr and Andreas Rauber",
title =        "Introduction to the Special Issue on Reproducibility
in Information Retrieval: Tools and Infrastructures",
journal =      j-JDIQ,
volume =       "10",
number =       "4",
pages =        "14:1--14:??",
month =        nov,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3268410",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
acknowledgement = ack-nhfb,
articleno =    "14",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Hopfgartner:2018:ESC,
author =       "Frank Hopfgartner and Allan Hanbury and Henning
M{\"u}ller and Ivan Eggel and Krisztian Balog and
Torben Brodt and Gordon V. Cormack and Jimmy Lin and
Jayashree Kalpathy-Cramer and Noriko Kando and Makoto
P. Kato and Anastasia Krithara and Tim Gollub and
Martin Potthast and Evelyne Viegas and Simon Mercer",
title =        "Evaluation-as-a-Service for the Computational
Sciences: Overview and Outlook",
journal =      j-JDIQ,
volume =       "10",
number =       "4",
pages =        "15:1--15:??",
month =        nov,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3239570",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Evaluation in empirical computer science is essential
to show progress and assess technologies developed.
Several research domains such as information retrieval
have long relied on systematic evaluation to measure
progress: here, the Cranfield paradigm of creating
shared test collections, defining search tasks, and
collecting ground truth for these tasks has persisted
up until now. In recent years, however, several new
challenges have emerged that do not fit this paradigm
very well: extremely large data sets, confidential data
sets as found in the medical domain, and rapidly
changing data sets as often encountered in industry.
Crowdsourcing has also changed the way in which
industry approaches problem-solving with companies now
organizing challenges and handing out monetary awards
to incentivize people to work on their challenges,
particularly in the field of machine learning. This
article is based on discussions at a workshop on
Evaluation-as-a-Service (EaaS). EaaS is the paradigm of
not providing data sets to participants and have them
work on the data locally, but keeping the data central
and allowing access via Application Programming
Interfaces (API), Virtual Machines (VM), or other
possibilities to ship executables. The objectives of
approaches and consolidate the experiences of these
approaches to outline the next steps of EaaS,
particularly toward sustainable research
infrastructures. The article summarizes several
existing approaches to EaaS and analyzes their usage
The many factors influencing EaaS are summarized, and
the environment in terms of motivations for the various
stakeholders, from funding agencies to challenge
organizers, researchers and participants, to industry
interested in supplying real-world problems for which
they require solutions. EaaS solves many problems of
the current research environment, where data sets are
often not accessible to many researchers. Executables
of published tools are equally often not available
making the reproducibility of results impossible. EaaS,
however, creates reusable/citable data sets as well as
available executables. Many challenges remain, but such
a framework for research can also foster more
collaboration between researchers, potentially
increasing the speed of obtaining research results.",
acknowledgement = ack-nhfb,
articleno =    "15",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Yang:2018:ARR,
author =       "Peilin Yang and Hui Fang and Jimmy Lin",
title =        "{Anserini}: Reproducible Ranking Baselines Using
{Lucene}",
journal =      j-JDIQ,
volume =       "10",
number =       "4",
pages =        "16:1--16:??",
month =        nov,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3239571",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "This work tackles the perennial problem of
reproducible baselines in information retrieval
research, focusing on bag-of-words ranking models.
have a long history of building and sharing systems,
they are primarily designed to facilitate the
publication of research papers. As such, these systems
are often incomplete, inflexible, poorly documented,
difficult to use, and slow, particularly in the context
of modern web-scale collections. Furthermore, the
growing complexity of modern software ecosystems and
the resource constraints most academic research groups
operate under make maintaining open-source systems a
constant struggle. However, except for a small number
of companies (mostly commercial web search engines)
that deploy custom infrastructure, Lucene has become
the de facto platform in industry for building search
applications. Lucene has an active developer base, a
large audience of users, and diverse capabilities to
work with heterogeneous collections at scale. However,
it lacks systematic support for ad hoc experimentation
using standard test collections. We describe Anserini,
an information retrieval toolkit built on Lucene that
fills this gap. Our goal is to simplify ad hoc
experimentation and allow researchers to easily
reproduce results with modern bag-of-words ranking
models on diverse test collections. With Anserini, we
demonstrate that Lucene provides a suitable framework
for supporting information retrieval research.
Experiments show that our system efficiently indexes
large web collections, provides modern ranking models
that are on par with research implementations in terms
of effectiveness, and supports low-latency query
evaluation to facilitate rapid experimentation.",
acknowledgement = ack-nhfb,
articleno =    "16",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Kiesel:2018:RWC,
author =       "Johannes Kiesel and Florian Kneist and Milad Alshomary
and Benno Stein and Matthias Hagen and Martin
Potthast",
title =        "Reproducible {Web} Corpora: Interactive Archiving with
Automatic Quality Assessment",
journal =      j-JDIQ,
volume =       "10",
number =       "4",
pages =        "17:1--17:??",
month =        nov,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3239574",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "The evolution of web pages from static HTML pages
toward dynamic pieces of software has rendered
archiving them increasingly difficult. Nevertheless, an
accurate, reproducible web archive is a necessity to
ensure the reproducibility of web-based research.
Archiving web pages reproducibly, however, is currently
not part of best practices for web corpus construction.
As a result, and despite the ongoing efforts of other
stakeholders to archive the web, tools for the
construction of reproducible web corpora are
tool tailored to this purpose. It relies on emulating
user interactions with a web page while recording all
network traffic. The customizable user interactions can
be replayed on demand, while requests sent by the
archived page are served with the recorded responses.
The tool facilitates reproducible user studies, user
simulations, and evaluations of algorithms that rely on
extracting data from web pages. To evaluate our tool,
we conduct the first systematic assessment of
reproduction quality for rendered web pages. Using our
tool, we create a corpus of 10,000 web pages
carefully sampled from the Common Crawl and manually
annotated with regard to reproduction quality via
crowdsourcing. Based on this data, we test three
approaches to automatic reproduction-quality
assessment. An off-the-shelf neural network, trained on
visual differences between the web page during
archiving and reproduction, matches the manual
assessments best. This automatic assessment of
reproduction quality allows for immediate bugfixing
during archiving and continuous development of our tool
as the web continues to evolve.",
acknowledgement = ack-nhfb,
articleno =    "17",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Roy:2018:CCD,
author =       "Dwaipayan Roy and Mandar Mitra and Debasis Ganguly",
title =        "To Clean or Not to Clean: Document Preprocessing and
Reproducibility",
journal =      j-JDIQ,
volume =       "10",
number =       "4",
pages =        "18:1--18:??",
month =        nov,
year =         "2018",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3242180",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
abstract =     "Web document collections such as WT10G, GOV2, and
ClueWeb are widely used for text retrieval experiments.
Documents in these collections contain a fair amount of
non-content-related markup in the form of tags,
hyperlinks, and so on. Published articles that use
these corpora generally do not provide specific details
about how this markup information is handled during
indexing. However, this question turns out to be
important: Through experiments, we find that including
or excluding metadata in the index can produce
significantly different results with standard IR
models. More importantly, the effect varies across
models and collections. For example, metadata filtering
is found to be generally beneficial when using BM25, or
language modeling with Dirichlet smoothing, but can
significantly reduce retrieval effectiveness if
language modeling is used with Jelinek-Mercer
smoothing. We also observe that, in general, the
performance differences become more noticeable as the
amount of metadata in the test collections increase.
Given this variability, we believe that the details of
document preprocessing are significant from the point
of view of reproducibility. In a second set of
experiments, we also study the effect of preprocessing
on query expansion using RM3. In this case, once again,
we find that it is generally better to remove markup
before using documents for query expansion.",
acknowledgement = ack-nhfb,
articleno =    "18",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Srivastava:2019:EHQ,
author =       "Divesh Srivastava and Monica Scannapieco and Thomas C.
Redman",
title =        "Ensuring High-Quality Private Data for Responsible
Data Science: Vision and Challenges",
journal =      j-JDIQ,
volume =       "11",
number =       "1",
pages =        "1:1--1:??",
month =        jan,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3287168",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3287168",
abstract =     "High-quality data is critical for effective data
science. As the use of data science has grown, so too
have concerns that individuals' rights to privacy will
be violated. This has led to the development of data
protection regulations around the globe and the use of
sophisticated anonymization techniques to protect
privacy. Such measures make it more challenging for the
data scientist to understand the data, exacerbating
issues of data quality. Responsible data science aims
to develop useful insights from the data while fully
embracing these considerations. We pose the high-level
problem in this article, ``How can a data scientist
develop the needed trust that private data has high
quality?'' We then identify a series of challenges for
various data-centric communities and outline research
questions for data quality and privacy researchers.",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Rios:2019:CTF,
author =       "Julio C{\'e}sar Cort{\'e}s R{\'\i}os and Norman W.
Paton and Alvaro A. A. Fernandes and Edward Abel and
John A. Keane",
title =        "Crowdsourced Targeted Feedback Collection for
Multicriteria Data Source Selection",
journal =      j-JDIQ,
volume =       "11",
number =       "1",
pages =        "2:1--2:??",
month =        jan,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3284934",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3284934",
abstract =     "A multicriteria data source selection (MCSS) scenario
identifies, from a set of candidate data sources, the
subset that best meets users' needs. These needs are
expressed using several criteria, which are used to
evaluate the candidate data sources. An MCSS problem
can be solved using multidimensional optimization
techniques that trade off the different objectives.
Sometimes one may have uncertain knowledge regarding
how well the candidate data sources meet the criteria.
In order to overcome this uncertainty, one may rely on
end-users or crowds to annotate the data items produced
by the sources in relation to the selection criteria.
In this article, a Targeted Feedback Collection (TFC) approach is introduced that aims to
identify those data items on which feedback should be
collected, thereby providing evidence on how the
sources satisfy the required criteria. The proposed TFC
targets feedback by considering the confidence
intervals around the estimated criteria values, with a
view to increasing the confidence in the estimates that
are most relevant to the multidimensional optimization.
Variants of the proposed TFC approach have been
developed for use where feedback is expected to be
reliable (e.g., where it is provided by trusted
experts) and where feedback is expected to be
unreliable (e.g., from crowd workers). Both variants
have been evaluated, and positive results are reported
against other approaches to feedback collection,
including active learning, in experiments that involve
real-world datasets and crowdsourcing.",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Dallachiesa:2019:ICQ,
author =       "Michele Dallachiesa and Charu C. Aggarwal and Themis
Palpanas",
title =        "Improving Classification Quality in Uncertain Graphs",
journal =      j-JDIQ,
volume =       "11",
number =       "1",
pages =        "3:1--3:??",
month =        jan,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3242095",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3242095",
abstract =     "In many real applications that use and analyze
networked data, the links in the network graph may be
erroneous or derived from probabilistic techniques. In
such cases, the node classification problem can be
challenging, since the unreliability of the links may
affect the final results of the classification process.
If the uncertainty of the links is not handled explicitly, then the classification accuracy in the
underlying network may be affected adversely. In this
article, we focus on situations that require the
analysis of the uncertainty that is present in the
graph structure. We study the novel problem of node
classification in uncertain graphs, by treating
uncertainty as a first-class citizen. We propose two
techniques based on a Bayes model and automatic
parameter selection and show that the incorporation of
uncertainty in the classification process as a
first-class citizen is beneficial. We experimentally
evaluate the proposed approach using different real
data sets and study the behavior of the algorithms
under different conditions. The results demonstrate the
effectiveness and efficiency of our approach.",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Casey:2019:FRR,
author =       "K. Michael Casey and Kevin {Casey Jr.}",
title =        "Financial Regulatory and Risk Management Challenges
Stemming from Firm-Specific Digital Misinformation",
journal =      j-JDIQ,
volume =       "11",
number =       "1",
pages =        "4:1--4:??",
month =        jan,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3274655",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:00 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3274655",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fan:2019:DGC,
author =       "Wenfei Fan",
title =        "Dependencies for Graphs: Challenges and
Opportunities",
journal =      j-JDIQ,
volume =       "11",
number =       "2",
pages =        "5:1--5:??",
month =        may,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3310230",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3310230",
abstract =     "What are graph dependencies? What do we need them for?
This article tackles these questions. It aims to incite curiosity
and interest in this emerging area of research.",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Sillaber:2019:EDI,
author =       "Christian Sillaber and Andrea Mussmann and Ruth Breu",
title =        "Experience: Data and Information Quality Challenges in
Governance, Risk, and Compliance Management",
journal =      j-JDIQ,
volume =       "11",
number =       "2",
pages =        "6:1--6:??",
month =        may,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3297721",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3297721",
abstract =     "Governance, risk, and compliance (GRC) managers often
struggle to document the current state of their
organizations. This is due to the complexity of their
IS landscape, the complex regulatory and organizational
environment, and the frequent changes to both. GRC
tools seek to support them by integrating existing
information sources. However, a comprehensive analysis
of how the data is managed in such tools, as well as
the impact of data quality, is still missing. To build
a basis of empirical data, we conducted a series of
interviews with information security managers
responsible for GRC management activities in their
organizations. The results of a qualitative content
analysis of these interviews suggest that decision
makers largely depend on high-quality documentation but
struggle to maintain their documentation at the
required level for long periods of time. This work
discusses factors affecting the quality of GRC data and
information and provides insights into approaches
implemented by organizations to analyze, improve, and
maintain the quality of their GRC data and
information.",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lazar:2019:EEM,
author =       "Alina Lazar and Ling Jin and C. Anna Spurlock and
Kesheng Wu and Alex Sim and Annika Todd",
title =        "Evaluating the Effects of Missing Values and Mixed
Data Types on Social Sequence Clustering Using {t-SNE}
Visualization",
journal =      j-JDIQ,
volume =       "11",
number =       "2",
pages =        "7:1--7:??",
month =        may,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3301294",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3301294",
abstract =     "The goal of this work is to investigate the impact of
missing values in clustering joint categorical social
sequences. Identifying patterns in sociodemographic
longitudinal data is important in a number of social
science settings. However, performing analytical
operations, such as clustering on life course
trajectories, is challenging due to the categorical and
multidimensional nature of the data, their mixed data
types, and corruption by missing and inconsistent
values. Data quality issues were investigated
previously on single variable sequences. To understand
their effects on multivariate sequence analysis, we
employ a dataset of mixed data types and missing
values, a dissimilarity measure designed for joint
categorical sequence data, together with dimensionality
reduction methodologies in a systematic design of
sequence clustering experiments. Given the categorical
nature of our data, we employ an ``edit'' distance
using optimal matching. Because each data record has
multiple variables of different types, we investigate
the impact of mixing these variables in a single
dissimilarity measure. Between variables with binary
values and those with multiple nominal values, we find
that the ability to overcome missing data problems is
more difficult in the nominal domain than in the binary
values can result in systematic biases in dissimilarity
matrices and subsequently introduce both artificial
clusters and unrealistic interpretations of associated
data domains. We demonstrate the usage of t-distributed
stochastic neighborhood embedding to visually guide
mitigation of such biases by tuning the missing value
substitution cost parameter or determining an optimal
sequence span.",
acknowledgement = ack-nhfb,
articleno =    "7",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Muller:2019:ADQ,
author =       "Daniel M{\"u}ller and Pratiksha Jain and Yieh-Funk
Te",
title =        "Augmenting Data Quality through High-Precision Gender
Categorization",
journal =      j-JDIQ,
volume =       "11",
number =       "2",
pages =        "8:1--8:??",
month =        may,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3297720",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3297720",
abstract =     "Mappings of first name to gender have been widely
recognized as a critical tool for the completion,
study, and validation of data records in a range of
areas. In this study, we investigate how organizations
with large databases of existing entities can create
their own mappings between first names and gender and
how these mappings can be improved and utilized.
Therefore, we first explore a dataset with demographic
information on more than 4 million people, which was
provided by a car insurance company. Then, we study how
naming conventions have changed over time and how they
differ by nationality. Next, we build a probabilistic
first-name-to-gender mapping and augment the mapping by
mapping's performance. We test our mapping in two-label
and three-label settings and further validate our
mapping by categorizing patent filings by gender of the
inventor. We compare the results with previous studies'
outcomes and find that our mapping produces
high-precision results. We validate that the additional
information of nationality and year of birth improve
the precision scores of name-to-gender mappings.
Therefore, the proposed approach constitutes an
efficient process for improving the data quality of
organizations' records, if the gender attribute is
missing or unreliable.",
acknowledgement = ack-nhfb,
articleno =    "8",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Hassan:2019:ISI,
author =       "Naeemul Hassan and Chengkai Li and Jun Yang and Cong
Yu",
title =        "Introduction to the Special Issue on Combating Digital
Misinformation and Disinformation",
journal =      j-JDIQ,
volume =       "11",
number =       "3",
pages =        "9:1--9:??",
month =        jul,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3321484",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3321484",
acknowledgement = ack-nhfb,
articleno =    "9",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Zannettou:2019:WFI,
author =       "Savvas Zannettou and Michael Sirivianos and Jeremy
Blackburn and Nicolas Kourtellis",
title =        "The {Web} of False Information: Rumors, Fake News,
Hoaxes, Clickbait, and Various Other Shenanigans",
journal =      j-JDIQ,
volume =       "11",
number =       "3",
pages =        "10:1--10:??",
month =        jul,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3309699",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309699",
abstract =     "A new era of Information Warfare has arrived. Various
actors, including state-sponsored ones, are weaponizing
information on Online Social Networks to run
false-information campaigns with targeted manipulation
of public opinion on specific topics. These
false-information campaigns can have dire consequences
to the public: mutating their opinions and actions,
especially with respect to critical world events like
major elections. Evidently, the problem of false
information on the Web is a crucial one and needs
increased public awareness as well as immediate
attention from law enforcement agencies, public
institutions, and in particular, the research
community. In this work, we make a step in this
direction by providing a typology of the Web's
false-information ecosystem, composed of various types
of false-information, actors, and their motives. We
report a comprehensive overview of existing research on
the false-information ecosystem by identifying several
lines of work: (1) how the public perceives false
information; (2) understanding the propagation of false
information; (3) detecting and containing false
information on the Web; and (4) false information on
the political stage. In this work, we pay particular
attention to political false information as: (1) it can
have dire consequences to the community (e.g., when
election results are mutated) and (2) previous work
shows that this type of false information propagates
faster and further when compared to other types of
false information. Finally, for each of these lines of
work, we report several future research directions that
can help us better understand and mitigate the emerging
problem of false-information dissemination on the
Web.",
acknowledgement = ack-nhfb,
articleno =    "10",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Xue:2019:CAT,
author =       "Hao Xue and Qiaozhi Wang and Bo Luo and Hyunjin Seo
and Fengjun Li",
title =        "Content-Aware Trust Propagation Toward Online Review
Spam Detection",
journal =      j-JDIQ,
volume =       "11",
number =       "3",
pages =        "11:1--11:??",
month =        jul,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3305258",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3305258",
abstract =     "With the increasing popularity of online review
systems, a large volume of user-generated content
becomes available to help people make reasonable
judgments about the quality of services and products
from unknown providers. However, these platforms are
frequently abused since fraudulent information can be
freely inserted by potentially malicious users without
validation. Consequently, online review systems become
targets of individual and professional spammers, who
insert deceptive reviews by manipulating the rating
and/or the content of the reviews. In this work, we
propose a review spamming detection scheme based on the
deviation between the aspect-specific opinions
extracted from individual reviews and the aggregated
opinions on the corresponding aspects. In particular,
we model the influence on the trustworthiness of the
user due to his opinion deviations from the majority in
the form of a deviation-based penalty, and integrate
this penalty into a three-layer trust propagation
framework to iteratively compute the trust scores for
users, reviews, and review targets, respectively. The
trust scores are effective indicators of spammers,
since they reflect the overall deviation of a user from
the aggregated aspect-specific opinions across all
targets and all aspects. Experiments on the dataset
collected from Yelp.com show that the proposed
detection scheme based on aspect-specific content-aware
trust propagation is able to measure users'
trustworthiness based on opinions expressed in
reviews.",
acknowledgement = ack-nhfb,
articleno =    "11",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Atanasova:2019:AFC,
author =       "Pepa Atanasova and Preslav Nakov and Llu{\'\i}s
M{\`a}rquez and Alberto Barr{\'o}n-Cede{\~n}o and
Georgi Karadzhov and Tsvetomila Mihaylova and Mitra
Mohtarami and James Glass",
title =        "Automatic Fact-Checking Using Context and Discourse
Information",
journal =      j-JDIQ,
volume =       "11",
number =       "3",
pages =        "12:1--12:??",
month =        jul,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3297722",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3297722",
abstract =     "We study the problem of automatic fact-checking,
paying special attention to the impact of contextual
and discourse information. We address two related
tasks: ( i )\&nbsp;detecting check-worthy claims and (
ii )\&nbsp;fact-checking claims. We develop supervised
systems based on neural networks, kernel-based support
vector machines, and combinations thereof, which make
use of rich input representations in terms of discourse
cues and contextual features. For the check-worthiness
estimation task, we focus on political debates, and we
model the target claim in the context of the full
intervention of a participant and the previous and
following turns in the debate, taking into account
contextual meta information. For the fact-checking
task, we focus on answers in a community question
answering forum, and we model the veracity of the
answer with respect to the thread in which it occurs
as well as with respect to other related
posts from the entire forum. We develop annotated
datasets for both tasks and we run extensive
experimental evaluation, confirming that both types of
information-but especially contextual features-play an
important role.",
acknowledgement = ack-nhfb,
articleno =    "12",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Lin:2019:DPF,
author =       "Peng Lin and Qi Song and Yinghui Wu and Jiaxing Pi",
title =        "Discovering Patterns for Fact Checking in Knowledge
Graphs",
journal =      j-JDIQ,
volume =       "11",
number =       "3",
pages =        "13:1--13:??",
month =        jul,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3286488",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3286488",
abstract =     "This paper presents a fact-checking method that
incorporates graph patterns to support fact checking in
knowledge graphs. Our method discovers discriminant
graph patterns to construct classifiers for fact
prediction. First, we propose a class of graph fact
checking rules (GFCs). A GFC incorporates graph
patterns that best distinguish true and false facts of
generalized fact statements. We provide statistical
measures to characterize useful patterns that are both
discriminant and diversified. Second, we show that it
is feasible to discover GFCs in large graphs with
optimality guarantees. We develop an algorithm that
performs localized search to generate a stream of graph
patterns, and dynamically assemble the best GFCs from
multiple GFC sets, where each set ensures quality
scores within certain ranges. The algorithm guarantees
a $(1 / 2 - \epsilon)$ approximation when it (early)
terminates. We also develop a space-efficient
alternative that dynamically spawns prioritized
patterns with best marginal gains to the verified GFCs.
It guarantees a $(1 - 1 / e)$ approximation. Both
strategies guarantee a bounded time cost independent of
the size of the underlying graph. Third, to support
fact checking, we develop two classifiers, which make
use of top-ranked GFCs as predictive rules or
instance-level features of the pattern matches induced
by GFCs, respectively. Using real-world data, we
experimentally verify the efficiency and the
effectiveness of GFC-based techniques for fact checking
in knowledge graphs and verify its application in
knowledge exploration and news prediction.",
acknowledgement = ack-nhfb,
articleno =    "13",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Borges:2019:CSF,
author =       "Lu{\'\i}s Borges and Bruno Martins and P{\'a}vel
Calado",
title =        "Combining Similarity Features and Deep Representation
Learning for Stance Detection in the Context of
Checking Fake News",
journal =      j-JDIQ,
volume =       "11",
number =       "3",
pages =        "14:1--14:??",
month =        jul,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3287763",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3287763",
abstract =     "Fake news is nowadays an issue of pressing concern,
given its recent rise as a potential threat to
high-quality journalism and well-informed public
discourse. The Fake News Challenge (FNC-1) was
organized in early 2017 to encourage the development of
machine-learning-based classification systems for
stance detection (i.e., for identifying whether a
particular news article agrees, disagrees, discusses,
or is unrelated to a particular news headline), thus
helping in the detection and analysis of possible
instances of fake news. In this article, we propose an
approach to tackle this stance detection problem, based
on the combination of string similarity features with a
deep neural network architecture that leverages ideas
previously advanced in the context of
learning-efficient text representations, document
classification, and natural language inference.
Specifically, we use bi-directional Recurrent Neural
Networks (RNNs), together with max-pooling over the
temporal/sequential dimension and neural attention, for
representing (i) the headline, (ii) the first two
sentences of the news article, and (iii) the entire
news article. These representations are then
combined/compared, complemented with similarity
features inspired on other FNC-1 approaches, and passed
to a final layer that predicts the stance of the
article toward the headline. We also explore the use of
external sources of information, specifically large
datasets of sentence pairs originally proposed for
training and evaluating natural language inference
methods to pre-train specific components of the neural
network architecture (e.g., the RNNs used for encoding
sentences). The obtained results attest to the
effectiveness of the proposed ideas and show that our
model, particularly when considering pre-training and
the combination of neural representations together with
similarity features, slightly outperforms the previous
state of the art.",
acknowledgement = ack-nhfb,
articleno =    "14",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Abiteboul:2019:TFD,
author =       "Serge Abiteboul and Julia Stoyanovich",
title =        "Transparency, Fairness, Data Protection, Neutrality:
Data Management Challenges in the Face of New
Regulation",
journal =      j-JDIQ,
volume =       "11",
number =       "3",
pages =        "15:1--15:??",
month =        jul,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3310231",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3310231",
abstract =     "The data revolution continues to transform every
sector of science, industry, and government. Due to the
incredible impact of data-driven technology on society,
we are becoming increasingly aware of the imperative to
use data and algorithms responsibly-in accordance with
three recent regulatory frameworks: the European
Union's General Data Protection Regulation (GDPR), the
New York City Automated Decisions Systems (ADS) Law,
and the Net Neutrality principle, which aim to protect
the rights of individuals who are impacted by data
collection and analysis. These frameworks are prominent
examples of a global trend: Governments are starting to
recognize the need to regulate data-driven algorithmic
technology. Our goal in this article is to bring these
regulatory frameworks to the attention of the data
management community and to underscore the technical
challenges they raise and that we, as a community, are
well-equipped to address. The main takeaway of this
article is that legal and ethical norms cannot be
incorporated into data-driven systems as an
afterthought. Rather, we must think in terms of
responsibility by design, viewing it as a systems
requirement.",
acknowledgement = ack-nhfb,
articleno =    "15",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bertino:2019:DTB,
author =       "Elisa Bertino and Ahish Kundu and Zehra Sura",
title =        "Data Transparency with Blockchain and {AI} Ethics",
journal =      j-JDIQ,
volume =       "11",
number =       "4",
pages =        "16:1--16:??",
month =        sep,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3312750",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3312750",
abstract =     "Providing a 360${}^\circ$ view of a given data item
especially for sensitive data is essential toward not
only protecting the data and associated privacy but
also assuring trust, compliance, and ethics of the
systems that use or manage such data. With the advent
of General Data Protection Regulation, California Data
Privacy Law, and other such regulatory requirements, it
is essential to support data transparency in all such
dimensions. Moreover, data transparency should not
violate privacy and security requirements. In this
article, we put forward a vision for how data
transparency would be achieved in a de-centralized
fashion using blockchain technology.",
acknowledgement = ack-nhfb,
articleno =    "16",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Fard:2019:ARA,
author =       "Amir Ebrahimi Fard and Scott Cunningham",
title =        "Assessing the Readiness of Academia in the Topic of
False and Unverified Information",
journal =      j-JDIQ,
volume =       "11",
number =       "4",
pages =        "17:1--17:??",
month =        sep,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3313788",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3313788",
abstract =     "The spread of false and unverified information has the
potential to inflict damage by harming the reputation
of individuals or organisations, shaking financial
markets, and influencing crowd decisions in important
events. This phenomenon needs to be properly curbed,
otherwise it can contaminate other aspects of our
social life. In this regard, academia as a key
institution against false and unverified information is
expected to play a pivotal role. Despite a great deal
of research in this arena, the amount of progress made
by academia is not clear. This lack of knowledge can
lead to misjudgements about the performance of the topic of
interest that can ultimately result in wrong science
policies regarding academic efforts for quelling false
and unverified information. In this research, we
assess the readiness of academia in the topic of
false and unverified
information. To this end, we adopt the emergence
framework and measure its dimensions (novelty, growth,
coherence, and impact) over more than 21,000 articles
related to false and unverified
information. Our results show the current body of
research has had organic growth so far, which is not
promising enough for confronting the problem of false
and unverified information. To tackle this problem, we
suggest an external push strategy that, compared to the
early stages of the topic of interest, reinforces the
emergence dimensions and leads to a higher level in
every dimension.",
acknowledgement = ack-nhfb,
articleno =    "17",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Babcock:2019:DFF,
author =       "Matthew Babcock and David M. Beskow and Kathleen M.
Carley",
title =        "Different Faces of False: The Spread and Curtailment
of False Information in the {Black Panther Twitter}
Discussion",
journal =      j-JDIQ,
volume =       "11",
number =       "4",
pages =        "18:1--18:??",
month =        sep,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3339468",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3339468",
abstract =     "The task of combating false information online appears
daunting, in part due to a public focus on how quickly
it can spread and the clamor for automated
platform-based interventions. While such concerns can
be warranted, threat analysis and intervention design
both benefit from a fuller understanding of different
types of false information and of the community
responses to them. Here, we present a study of the most
tweeted about movie ever ( Black Panther ) in which the
spread of false information of four different types is
examined and compared. We
find that (1) false information tweets played a small
part in the overall conversation, (2) community-based
debunking and shaming responses to false posts about
attacks at theaters overwhelmed such posts by orders of
magnitude, (3) as another form of community response,
one type of false narrative (Satire) was used to attack
another (Fake Attacks), and (4) the four types of
false-information tweets differed in the use of
hashtags and in the role played by originating users
and responding users. Overall, this work helps to
illustrate the importance of investigating
``on-the-ground'' community responses to fake news and
other types of digital false information and to inform
identification and intervention design and
implementation.",
acknowledgement = ack-nhfb,
articleno =    "18",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Bosu:2019:EQB,
author =       "Michael F. Bosu and Stephen G. MacDonell",
title =        "Experience: Quality Benchmarking of Datasets Used in
Software Effort Estimation",
journal =      j-JDIQ,
volume =       "11",
number =       "4",
pages =        "19:1--19:??",
month =        sep,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3328746",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3328746",
abstract =     "Data is a cornerstone of empirical software
engineering (ESE) research and practice. Data underpin
numerous process and project management activities,
including the estimation of development effort and the
prediction of the likely location and severity of
defects in code. Serious questions have been raised,
however, over the quality of the data used in ESE. Data
quality problems caused by noise, outliers, and
incompleteness have been noted as being especially
prevalent. Other quality issues, although also
potentially important, have received less attention. In
this study, we assess the quality of 13 datasets that
have been used extensively in research on software
effort estimation. The quality issues considered in
this analysis were drawn from a taxonomy developed
previously based on a systematic mapping of data
quality issues in ESE. Our contributions are as
follows: (1) an evaluation of the ``fitness for
purpose'' of these commonly used datasets and (2) an
assessment of the utility of the taxonomy in terms of
dataset benchmarking. We also propose a template that
could be used to both improve the ESE data
collection/submission process and to evaluate other
such datasets, contributing to enhanced awareness of
data quality issues in the ESE community and, in time,
the availability and use of higher-quality datasets.",
acknowledgement = ack-nhfb,
articleno =    "19",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Ding:2019:CSA,
author =       "Junhua Ding and Xinchuan Li and Xiaojun Kang and
Venkat N. Gudivada",
title =        "A Case Study of the Augmentation and Evaluation of
Training Data for Deep Learning",
journal =      j-JDIQ,
volume =       "11",
number =       "4",
pages =        "20:1--20:??",
month =        sep,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3317573",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3317573",
abstract =     "Deep learning has been widely used for extracting
values from big data. As many other machine learning
algorithms, deep learning requires significant training
data. Experiments have shown both the volume and the
quality of training data can significantly impact the
effectiveness of the value extraction. In some cases,
the volume of training data is not sufficiently large
for effectively training a deep learning model. In
other cases, the quality of training data is not high
enough to achieve the optimal performance. Many
approaches have been proposed for augmenting training
data to mitigate the deficiency. However, whether the
augmented data are ``fit for purpose'' of deep learning
is still a question. A framework for comprehensively
evaluating the effectiveness of the augmented data for
deep learning is needed. In this study,
we first discuss a data augmentation approach for deep
learning. The approach includes two components: the
first one is to remove noisy data in a dataset using a
machine learning based classification to improve its
quality, and the second one is to increase the volume
of the dataset for effectively training a deep learning
model. To evaluate the quality of the augmented data in
fidelity, variety, and veracity, a data quality
evaluation framework is proposed. We demonstrated the
effectiveness of the data augmentation approach and the
data quality evaluation framework through studying an
automated classification of biology cell images using
deep learning. The experimental results clearly
demonstrated the impact of the volume and quality of
training data to the performance of deep learning and
the importance of the data quality evaluation. The data
augmentation approach and the data quality evaluation
framework can be straightforwardly adapted for deep
learning study in other domains.",
acknowledgement = ack-nhfb,
articleno =    "20",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Akhtar:2019:IAV,
author =       "Zahaib Akhtar and Anh Minh Le and Yun Seong Nam and
Jessica Chen and Ramesh Govindan and Ethan Katz-Bassett
and Sanjay Rao and Jibin Zhan",
title =        "Improving Adaptive Video Streaming through Session
Classification",
journal =      j-JDIQ,
volume =       "11",
number =       "4",
pages =        "21:1--21:??",
month =        sep,
year =         "2019",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3309682",
ISSN =         "1936-1955",
bibdate =      "Tue Oct 22 07:17:01 MDT 2019",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309682",
abstract =     "With internet video gaining increasing popularity and
soaring to dominate network traffic, extensive studies
are being carried out on how to achieve higher Quality
of Experience (QoE) with the delivery of video content.
Associated with the chunk-based streaming protocol,
Adaptive Bitrate (ABR) algorithms have recently emerged
to cope with the diverse and fluctuating network
conditions by dynamically adjusting bitrates for future
chunks. This inevitably involves predicting the future
throughput of a video session. Some of the session
features like Internet Service Provider (ISP),
geographical location, and so on, could affect network
conditions and contain helpful information for this
prediction. In this article, we study how
our knowledge about the session features can be
utilized to improve ABR quality via customized
parameter settings. We present our ABR-independent,
QoE-driven, feature-based partition method to classify
the logged video sessions so that different parameter
settings could be adopted in different situations to
reach better quality. A variation of Decision Tree is
developed for the classification and has been applied
to a sample ABR for evaluation. The experiment shows
that our approach can improve the average bitrate of
the sample ABR by 36.1\% without causing the increase
of the rebuffering ratio where 99\% of the sessions can
get improvement. It can also improve the rebuffering
ratio by 87.7\% without causing the decrease of the
average bitrate, where, among those sessions involved
in rebuffering, 82\% receives improvement and 18\%
remains the same.",
acknowledgement = ack-nhfb,
articleno =    "21",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "http://portal.acm.org/browse_dl.cfm?idx=J1191",
}

@Article{Milo:2020:GRD,
author =       "Tova Milo",
title =        "Getting Rid of Data",
journal =      j-JDIQ,
volume =       "12",
number =       "1",
pages =        "1--7",
month =        jan,
year =         "2020",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3326920",
ISSN =         "1936-1955",
bibdate =      "Thu Jan 23 07:39:46 MST 2020",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/doi/abs/10.1145/3326920",
abstract =     "We are experiencing an amazing data-centered
revolution. Incredible amounts of data are collected,
integrated, and analyzed, leading to key breakthroughs
in science and society. This well of knowledge,
however, is at a great risk if we do not dispense
\ldots{}",
acknowledgement = ack-nhfb,
articleno =    "1",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Firmani:2020:EDD,
author =       "Donatella Firmani and Letizia Tanca and Riccardo
Torlone",
title =        "Ethical Dimensions for Data Quality",
journal =      j-JDIQ,
volume =       "12",
number =       "1",
pages =        "1--5",
month =        jan,
year =         "2020",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3362121",
ISSN =         "1936-1955",
bibdate =      "Thu Jan 23 07:39:46 MST 2020",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/doi/abs/10.1145/3362121",
acknowledgement = ack-nhfb,
articleno =    "2",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Draisbach:2020:TPD,
author =       "Uwe Draisbach and Peter Christen and Felix Naumann",
title =        "Transforming Pairwise Duplicates to Entity Clusters
for High-quality Duplicate Detection",
journal =      j-JDIQ,
volume =       "12",
number =       "1",
pages =        "1--30",
month =        jan,
year =         "2020",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3352591",
ISSN =         "1936-1955",
bibdate =      "Thu Jan 23 07:39:46 MST 2020",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/doi/abs/10.1145/3352591",
abstract =     "Duplicate detection algorithms produce clusters of
database records, each cluster representing a single
real-world entity. As most of these algorithms use
pairwise comparisons, the resulting (transitive)
clusters can be inconsistent: Not all records
\ldots{}",
acknowledgement = ack-nhfb,
articleno =    "3",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Shakeel:2020:ASQ,
author =       "Yusra Shakeel and Jacob Kr{\"u}ger and Ivonne Von
Nostitz-Wallwitz and Gunter Saake and Thomas Leich",
title =        "Automated Selection and Quality Assessment of Primary
Studies: a Systematic Literature Review",
journal =      j-JDIQ,
volume =       "12",
number =       "1",
pages =        "1--26",
month =        jan,
year =         "2020",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3356901",
ISSN =         "1936-1955",
bibdate =      "Thu Jan 23 07:39:46 MST 2020",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/doi/abs/10.1145/3356901",
abstract =     "Researchers use\&nbsp;systematic literature reviews
(SLRs) to synthesize existing evidence regarding a
research topic. While being an important means to
condense knowledge, conducting an SLR requires a large
amount of time and effort. Consequently, \ldots{}",
acknowledgement = ack-nhfb,
articleno =    "4",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Siagian:2020:RWC,
author =       "Al Hafiz Akbar Maulana Siagian and Masayoshi
Aritsugi",
title =        "Robustness of Word and Character {$N$}-gram
Combinations in Detecting Deceptive and Truthful
Opinions",
journal =      j-JDIQ,
volume =       "12",
number =       "1",
pages =        "1--24",
month =        jan,
year =         "2020",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3349536",
ISSN =         "1936-1955",
bibdate =      "Thu Jan 23 07:39:46 MST 2020",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/doi/abs/10.1145/3349536",
abstract =     "Opinions in reviews about the quality of products or
services can be important information for readers.
Unfortunately, such opinions may include deceptive ones
posted for some business reasons. To keep the opinions
as a valuable and trusted source of \ldots{}",
acknowledgement = ack-nhfb,
articleno =    "5",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "https://dl.acm.org/loi/jdiq",
}

@Article{Aswani:2020:EMM,
author =       "Reema Aswani and Arpan Kumar Kar and P. Vigneswara
Ilavarasan",
title =        "Experience: Managing Misinformation in Social
Analytics",
journal =      j-JDIQ,
volume =       "12",
number =       "1",
pages =        "1--18",
month =        jan,
year =         "2020",
CODEN =        "????",
DOI =          "https://doi.org/10.1145/3341107",
ISSN =         "1936-1955",
bibdate =      "Thu Jan 23 07:39:46 MST 2020",
bibsource =    "http://www.math.utah.edu/pub/tex/bib/jdiq.bib",
URL =          "https://dl.acm.org/doi/abs/10.1145/3341107",
abstract =     "Governance of misinformation is a serious concern in
social media platforms. Based on experiences gathered
from different case studies, we offer insights for the
policymakers on managing misinformation in social
media. These platforms are widely used \ldots{}",
acknowledgement = ack-nhfb,
articleno =    "6",
fjournal =     "Journal of Data and Information Quality (JDIQ)",
journal-URL =  "https://dl.acm.org/loi/jdiq",
}