%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.00", %%% date = "16 June 2017", %%% time = "09:35:37 MST", %%% filename = "pomacs.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "22676 1549 8942 82901", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "bibliography; BibTeX; Proceedings of the ACM %%% on Measurement and Analysis of Computing %%% Systems (POMACS)", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% Proceedings of the ACM on Measurement and %%% Analysis of Computing Systems (POMACS) (CODEN %%% ????, ISSN 2476-1249). The journal appears %%% annually, and publication began with volume %%% 1, number 1, in June 2017. %%% %%% At version 1.00, the COMPLETE journal %%% coverage looked like this: %%% %%% 2017 ( 27) %%% %%% Article: 27 %%% %%% Total entries: 27 %%% %%% The journal Web pages can be found at: %%% %%% http://pomacs.acm.org/ %%% http://pomacs.acm.org/archive-toc.cfm %%% %%% The journal table of contents page is at: %%% %%% http://dl.acm.org/pub.cfm?id=J1567 %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. 
%%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using ``bibsort -byvolume.'' %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility.", %%% } %%% ====================================================================

%%% The preamble is emitted verbatim into the generated .bbl: it loads
%%% Beebe's shared author-name macro file (bibnames.sty) and defines a
%%% \TM macro (superscript small-caps trademark mark) for use in fields.
@Preamble{"\input bibnames.sty" # "\def \TM {${}^{\sc TM}$}" }

%%% ==================================================================== %%% Acknowledgement abbreviations:

%%% Acknowledgement string referenced by every entry below via
%%% ``acknowledgement = ack-nhfb''; full contact block for the
%%% bibliography maintainer.
@String{ack-nhfb= "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ==================================================================== %%% Journal abbreviations:

%%% Journal-name abbreviation used as ``journal = j-POMACS'' in every
%%% entry; the expanded form is duplicated in each entry's fjournal
%%% field for styles that ignore @String macros.
@String{j-POMACS= "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)"}

%%% ==================================================================== %%% Bibliography entries:

@Article{Chaintreau:2017:E, author = "Augustin Chaintreau and Leana Golubchik and Zhi-Li Zhang", title = "Editorial", journal = j-POMACS, volume = "1", number = "1", pages = "1:1--1:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3105875", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3105875", acknowledgement = ack-nhfb, articleno = "1", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Gibbens:2017:HND, author = "Mathias Gibbens and Chris Gniady and Lei Ye and Beichuan Zhang", title = "{Hadoop} on Named Data Networking: Experience and Results", journal = j-POMACS, volume = "1", number = "1", pages = "2:1--2:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084439", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084439", abstract = "The Named Data Networking (NDN) architecture retrieves content by names rather than connecting to specific hosts. It provides benefits such as highly efficient and resilient content distribution, which fit well to data-intensive distributed computing. This paper presents and discusses our experience in modifying Apache Hadoop, a popular MapReduce framework, to operate on an NDN network. Through this first-of-its-kind implementation process, we demonstrate the feasibility of running an existing, large, and complex piece of distributed software commonly seen in data centers over NDN. We show advantages such as simplified network code and reduced network traffic which are beneficial in a data center environment. 
There are also challenges faced by NDN, that are being addressed by the community, which can be magnified under data center traffic. Through detailed evaluation, we show a reduction of 16\% for overall data transmission between Hadoop nodes while writing data with default replication settings. Preliminary results also show promise for in-network caching of repeated reads in distributed applications. We also show that overall performance is currently slower under NDN, and we identify challenges and opportunities for further NDN improvements.", acknowledgement = ack-nhfb, articleno = "2", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Gong:2017:QPS, author = "Long Gong and Paul Tune and Liang Liu and Sen Yang and Jun (Jim) Xu", title = "Queue-Proportional Sampling: A Better Approach to Crossbar Scheduling for Input-Queued Switches", journal = j-POMACS, volume = "1", number = "1", pages = "3:1--3:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084440", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084440", abstract = "Most present day switching systems, in Internet routers and data-center switches, employ a single input-queued crossbar to interconnect input ports with output ports. Such switches need to compute a matching, between input and output ports, for each switching cycle (time slot). The main challenge in designing such matching algorithms is to deal with the unfortunate tradeoff between the quality of the computed matching and the computational complexity of the algorithm. In this paper, we propose a general approach that can significantly boost the performance of both SERENA and iSLIP, yet incurs only O(1) additional computational complexity at each input/output port. 
Our approach is a novel proposing strategy, called Queue-Proportional Sampling (QPS), that generates an excellent starter matching. We show, through rigorous simulations, that when starting with this starter matching, iSLIP and SERENA can output much better final matching decisions, as measured by the resulting throughput and delay performance, than they otherwise can.", acknowledgement = ack-nhfb, articleno = "3", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Quach:2017:ILT, author = "Alan Quach and Zhongjie Wang and Zhiyun Qian", title = "Investigation of the 2016 {Linux TCP} Stack Vulnerability at Scale", journal = j-POMACS, volume = "1", number = "1", pages = "4:1--4:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084441", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084441", abstract = "To combat blind in-window attacks against TCP, changes proposed in RFC 5961 have been implemented by Linux since late 2012. While successfully eliminating the old vulnerabilities, the new TCP implementation was reported in August 2016 to have introduced a subtle yet serious security flaw. Assigned CVE-2016-5696, the flaw exploits the challenge ACK rate limiting feature that could allow an off-path attacker to infer the presence/absence of a TCP connection between two arbitrary hosts, terminate such a connection, and even inject payload into an unsecured TCP connection. In this work, we perform a comprehensive measurement of the impact of the new vulnerability. This includes (1) tracking the vulnerable Internet servers, (2) monitoring the patch behavior over time, (3) picturing the overall security status of TCP stacks at scale. 
Towards this goal, we design a scalable measurement methodology to scan the Alexa top 1 million websites for almost 6 months. We also present how notifications impact the patching behavior, and compare the result with the Heartbleed and the Debian PRNG vulnerability. The measurement represents a valuable data point in understanding how Internet servers react to serious security flaws in the operating system kernel.", acknowledgement = ack-nhfb, articleno = "4", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Sharma:2017:PDR, author = "Prateek Sharma and David Irwin and Prashant Shenoy", title = "Portfolio-driven Resource Management for Transient Cloud Servers", journal = j-POMACS, volume = "1", number = "1", pages = "5:1--5:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084442", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib; http://www.math.utah.edu/pub/tex/bib/pvm.bib", URL = "http://dl.acm.org/citation.cfm?id=3084442", abstract = "Cloud providers have begun to offer their surplus capacity in the form of low-cost transient servers, which can be revoked unilaterally at any time. While the low cost of transient servers makes them attractive for a wide range of applications, such as data processing and scientific computing, failures due to server revocation can severely degrade application performance. Since different transient server types offer different cost and availability tradeoffs, we present the notion of server portfolios that is based on financial portfolio modeling. Server portfolios enable construction of an 'optimal' mix of severs to meet an application's sensitivity to cost and revocation risk. 
We implement model-driven portfolios in a system called ExoSphere, and show how diverse applications can use portfolios and application-specific policies to gracefully handle transient servers. We show that ExoSphere enables widely-used parallel applications such as Spark, MPI, and BOINC to be made transiency-aware with modest effort. Our experiments show that allowing the applications to use suitable transiency-aware policies, ExoSphere is able to achieve 80\% cost savings when compared to on-demand servers and greatly reduces revocation risk compared to existing approaches.", acknowledgement = ack-nhfb, articleno = "5", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Cao:2017:DEC, author = "Yi Cao and Javad Nejati and Muhammad Wajahat and Aruna Balasubramanian and Anshul Gandhi", title = "Deconstructing the Energy Consumption of the Mobile Page Load", journal = j-POMACS, volume = "1", number = "1", pages = "6:1--6:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084443", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084443", abstract = "Modeling the energy consumption of applications on mobile devices is an important topic that has received much attention in recent years. However, there has been very little research on modeling the energy consumption of the mobile Web. This is primarily due to the short-lived yet complex page load process that makes it infeasible to rely on coarse-grained resource monitoring for accurate power estimation. We present RECON, a modeling approach that accurately estimates the energy consumption of any Web page load and deconstructs it into the energy contributions of individual page load activities. 
Our key intuition is to leverage low-level application semantics in addition to coarse-grained resource utilizations for modeling the page load energy consumption. By exploiting fine-grained information about the individual activities that make up the page load, RECON enables fast and accurate energy estimations without requiring complex models. Experiments across 80 Web pages and under four different optimizations show that RECON can estimate the energy consumption for a Web page load with an average error of less than 7\%. Importantly, RECON helps to analyze and explain the energy effects of an optimization on the individual components of Web page loads.", acknowledgement = ack-nhfb, articleno = "6", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Wang:2017:OTS, author = "Xin Wang and Richard T. B. Ma and Yinlong Xu", title = "On Optimal Two-Sided Pricing of Congested Networks", journal = j-POMACS, volume = "1", number = "1", pages = "7:1--7:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084444", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084444", abstract = "Traditionally, Internet Access Providers (APs) only charge end-users for Internet access services; however, to recoup infrastructure costs and increase revenues, some APs have recently adopted two-sided pricing schemes under which both end-users and content providers are charged. Meanwhile, with the rapid growth of traffic, network congestion could seriously degrade user experiences and influence providers' utility. To optimize profit and social welfare, APs and regulators need to design appropriate pricing strategies and regulatory policies that take the effects of network congestion into consideration. 
In this paper, we model two-sided networks under which users' traffic demands are influenced by exogenous pricing and endogenous congestion parameters and derive the system congestion under an equilibrium. We characterize the structures and sensitivities of profit- and welfare-optimal two-sided pricing schemes and reveal that (1) the elasticity of system throughput plays a crucial role in determining the structures of optimal pricing, (2) the changes of optimal pricing under varying AP's capacity and users' congestion sensitivity are largely driven by the type of data traffic, e.g., text or video, and (3) APs and regulators will be incentivized to shift from one-sided to two-sided pricing when APs' capacities and user demand for video traffic grow. Our results can help APs design optimal two-sided pricing and guide regulators to legislate desirable policies.", acknowledgement = ack-nhfb, articleno = "7", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Casale:2017:API, author = "Giuliano Casale", title = "Accelerating Performance Inference over Closed Systems by Asymptotic Methods", journal = j-POMACS, volume = "1", number = "1", pages = "8:1--8:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084445", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084445", abstract = "Recent years have seen a rapid growth of interest in exploiting monitoring data collected from enterprise applications for automated management and performance analysis. In spite of this trend, even simple performance inference problems involving queueing theoretic formulas often incur computational bottlenecks, for example upon computing likelihoods in models of batch systems. 
Motivated by this issue, we revisit the solution of multiclass closed queueing networks, which are popular models used to describe batch and distributed applications with parallelism constraints. We first prove that the normalizing constant of the equilibrium state probabilities of a closed model can be reformulated exactly as a multidimensional integral over the unit simplex. This gives as a by-product novel explicit expressions for the multiclass normalizing constant. We then derive a method based on cubature rules to efficiently evaluate the proposed integral form in small and medium-sized models. For large models, we propose novel asymptotic expansions and Monte Carlo sampling methods to efficiently and accurately approximate normalizing constants and likelihoods. We illustrate the resulting accuracy gains in problems involving optimization-based inference.", acknowledgement = ack-nhfb, articleno = "8", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Ju:2017:HLS, author = "Xiaoen Ju and Hani Jamjoom and Kang G. Shin", title = "{Hieroglyph}: Locally-Sufficient Graph Processing via Compute--Sync--Merge", journal = j-POMACS, volume = "1", number = "1", pages = "9:1--9:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084446", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084446", abstract = "Despite their widespread adoption, large-scale graph processing systems do not fully decouple computation and communication, often yielding suboptimal performance. Locally-sufficient computation-computation that relies only on the graph state local to a computing host-can mitigate the effects of this coupling. 
In this paper, we present Compute--Sync--Merge (CSM), a new programming abstraction that achieves efficient locally-sufficient computation. CSM enforces local sufficiency at the programming abstraction level and enables the activation of vertex-centric computation on all vertex replicas, thus supporting vertex-cut partitioning. We demonstrate the simplicity of expressing several fundamental graph algorithms in CSM. Hieroglyph-our implementation of a graph processing system with CSM support-outperforms state of the art by up to 53x, with a median speedup of 3.5x and an average speedup of 6x across a wide range of datasets.", acknowledgement = ack-nhfb, articleno = "9", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Chang:2017:URV, author = "Kevin K. Chang and A. Giray Ya{\u{g}}l{\i}k{\c{c}}{\i} and Saugata Ghose and Aditya Agrawal and Niladrish Chatterjee and Abhijith Kashyap and Donghyuk Lee and Mike O'Connor and Hasan Hassan and Onur Mutlu", title = "Understanding Reduced-Voltage Operation in Modern {DRAM} Devices: Experimental Characterization, Analysis, and Mechanisms", journal = j-POMACS, volume = "1", number = "1", pages = "10:1--10:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084447", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084447", abstract = "The energy consumption of DRAM is a critical concern in modern computing systems. Improvements in manufacturing process technology have allowed DRAM vendors to lower the DRAM supply voltage conservatively, which reduces some of the DRAM energy consumption. We would like to reduce the DRAM supply voltage more aggressively, to further reduce energy. 
Aggressive supply voltage reduction requires a thorough understanding of the effect voltage scaling has on DRAM access latency and DRAM reliability. In this paper, we take a comprehensive approach to understanding and exploiting the latency and reliability characteristics of modern DRAM when the supply voltage is lowered below the nominal voltage level specified by DRAM standards. Using an FPGA-based testing platform, we perform an experimental study of 124 real DDR3L (low-voltage) DRAM chips manufactured recently by three major DRAM vendors. We find that reducing the supply voltage below a certain point introduces bit errors in the data, and we comprehensively characterize the behavior of these errors. We discover that these errors can be avoided by increasing the latency of three major DRAM operations (activation, restoration, and precharge). We perform detailed DRAM circuit simulations to validate and explain our experimental findings. We also characterize the various relationships between reduced supply voltage and error locations, stored data patterns, DRAM temperature, and data retention. Based on our observations, we propose a new DRAM energy reduction mechanism, called Voltron. The key idea of Voltron is to use a performance model to determine by how much we can reduce the supply voltage without introducing errors and without exceeding a user-specified threshold for performance loss. Our evaluations show that Voltron reduces the average DRAM and system energy consumption by 10.5\% and 7.3\%, respectively, while limiting the average system performance loss to only 1.8\%, for a variety of memory-intensive quad-core workloads. 
We also show that Voltron significantly outperforms prior dynamic voltage and frequency scaling mechanisms for DRAM.", acknowledgement = ack-nhfb, articleno = "10", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Wang:2017:UBI, author = "Cheng Wang and Bhuvan Urgaonkar and Neda Nasiriani and George Kesidis", title = "Using Burstable Instances in the Public Cloud: Why, When and How?", journal = j-POMACS, volume = "1", number = "1", pages = "11:1--11:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084448", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib; http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib", URL = "http://dl.acm.org/citation.cfm?id=3084448", abstract = "Amazon EC2 and Google Compute Engine (GCE) have recently introduced a new class of virtual machines called 'burstable' instances that are cheaper than even the smallest traditional/regular instances. These lower prices come with reduced average capacity and increased variance. Using measurements from both EC2 and GCE, we identify key idiosyncrasies of resource capacity dynamism for burstable instances that set them apart from other instance types. Most importantly, certain resources for these instances appear to be regulated by deterministic token bucket like mechanisms. We find widely different types of disclosures by providers of the parameters governing these regulation mechanisms: full disclosure (e.g., CPU capacity for EC2 t2 instances), partial disclosure (e.g., CPU capacity and remote disk IO bandwidth for GCE shared-core instances), or no disclosure (network bandwidth for EC2 t2 instances). A tenant modeling these variations as random phenomena (as some recent work suggests) might make sub-optimal procurement and operation decisions. 
We present modeling techniques for a tenant to infer the properties of these regulation mechanisms via simple offline measurements. We also present two case studies of how certain memcached workloads might benefit from our modeling when operating on EC2 by: (i) augmenting cheap but low availability in-memory storage offered by spot instances with backup of popular content on burstable instances, and (ii) temporal multiplexing of multiple burstable instances to achieve the CPU or network bandwidth (and thereby throughput) equivalent of a more expensive regular EC2 instance.", acknowledgement = ack-nhfb, articleno = "11", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Ying:2017:SMM, author = "Lei Ying", title = "{Stein}'s Method for Mean Field Approximations in Light and Heavy Traffic Regimes", journal = j-POMACS, volume = "1", number = "1", pages = "12:1--12:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084449", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084449", abstract = "Mean-field analysis is an analytical method for understanding large-scale stochastic systems such as large-scale data centers and communication networks. The idea is to approximate the stationary distribution of a large-scale stochastic system using the equilibrium point (called the mean-field limit) of a dynamical system (called the mean-field model). This approximation is often justified by proving the weak convergence of stationary distributions to its mean-field limit. Most existing mean-field models concerned the light-traffic regime where the load of the system, denote by $ \rho $, is strictly less than one and is independent of the size of the system. 
This is because a traditional mean-field model represents the limit of the corresponding stochastic system. Therefore, the load of the mean-field model is $ \rho = \lim_{N \to \infty} \rho (N) $, where $ \rho (N) $ is the load of the stochastic system of size $N$. Now if $ \rho (N) \to 1$ as $ N \to \infty $ (i.e., in the heavy-traffic regime), then $ \rho = 1$. For most systems, the mean-field limits when $ \rho = 1$ are trivial and meaningless. To overcome this difficulty of traditional mean-field models, this paper takes a different point of view on mean-field models. Instead of regarding a mean-field model as the limiting system of large-scale stochastic system, it views the equilibrium point of the mean-field model, called a mean-field solution, simply as an approximation of the stationary distribution of the finite-size system. Therefore both mean-field models and solutions can be functions of $N$. This paper first outlines an analytical method to bound the approximation error based on Stein's method and the perturbation theory. We further present two examples: the M/M/N queueing system and the supermarket model under the power-of-two-choices algorithm. For both applications, the method enables us to characterize the system performance under a broad range of traffic loads. For the supermarket model, this is the first paper that rigorously quantifies the steady-state performance of the power-of-two-choices in the heavy-traffic regime. 
These results in the heavy-traffic regime cannot be obtained using the traditional mean-field analysis and the interchange of the limits.", acknowledgement = ack-nhfb, articleno = "12", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Wang:2017:SGN, author = "Sinong Wang and Ness Shroff", title = "Security Game with Non-additive Utilities and Multiple Attacker Resources", journal = j-POMACS, volume = "1", number = "1", pages = "13:1--13:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084450", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084450", abstract = "There has been significant interest in studying security games for modeling the interplay of attacks and defenses on various systems involving critical infrastructure, financial system security, political campaigns, and civil safeguarding. However, existing security game models typically either assume additive utility functions, or that the attacker can attack only one target. Such assumptions lead to tractable analysis, but miss key inherent dependencies that exist among different targets in current complex networks. In this paper, we generalize the classical security game models to allow for non-additive utility functions. We also allow attackers to be able to attack multiple targets. We examine such a general security game from a theoretical perspective and provide a unified view. In particular, we show that each security game is equivalent to a combinatorial optimization problem over a set system $ \epsilon $, which consists of defender's pure strategy space. The key technique we use is based on the transformation, projection of a polytope, and the ellipsoid method. 
This work settles several open questions in security game domain and significantly extends the state-of-the-art of both the polynomial solvable and NP-hard class of the security game.", acknowledgement = ack-nhfb, articleno = "13", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Li:2017:SYE, author = "Lingda Li and Robel Geda and Ari B. Hayes and Yanhao Chen and Pranav Chaudhari and Eddy Z. Zhang and Mario Szegedy", title = "A Simple Yet Effective Balanced Edge Partition Model for Parallel Computing", journal = j-POMACS, volume = "1", number = "1", pages = "14:1--14:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084451", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084451", abstract = "Graph edge partition models have recently become an appealing alternative to graph vertex partition models for distributed computing due to their flexibility in balancing loads and their performance in reducing communication cost [6, 16]. In this paper, we propose a simple yet effective graph edge partitioning algorithm. In practice, our algorithm provides good partition quality (and better than similar state-of-the-art edge partition approaches, at least for power-law graphs) while maintaining low partition overhead. In theory, previous work [6] showed that an approximation guarantee of $ O(d_{\rm max} \sqrt {\log n \log k}) $ apply to the graphs with $ m = \Omega (k^2) $ edges ($k$ is the number of partitions). We further rigorously proved that this approximation guarantee hold for all graphs. We show how our edge partition model can be applied to parallel computing. 
We draw our example from GPU program locality enhancement and demonstrate that the graph edge partition model does not only apply to distributed computing with many computer nodes, but also to parallel computing in a single computer node with a many-core processor.", acknowledgement = ack-nhfb, articleno = "14", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Zhou:2017:PSM, author = "You Zhou and Yian Zhou and Min Chen and Shigang Chen", title = "Persistent Spread Measurement for Big Network Data Based on Register Intersection", journal = j-POMACS, volume = "1", number = "1", pages = "15:1--15:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084452", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084452", abstract = "Persistent spread measurement is to count the number of distinct elements that persist in each network flow for predefined time periods. It has many practical applications, including detecting long-term stealthy network activities in the background of normal-user activities, such as stealthy DDoS attack, stealthy network scan, or faked network trend, which cannot be detected by traditional flow cardinality measurement. With big network data, one challenge is to measure the persistent spreads of a massive number of flows without incurring too much memory overhead as such measurement may be performed at the line speed by network processors with fast but small on-chip memory. We propose a highly compact Virtual Intersection HyperLogLog (VI-HLL) architecture for this purpose. It achieves far better memory efficiency than the best prior work of V-Bitmap, and in the meantime drastically extends the measurement range. 
Theoretical analysis and extensive experiments demonstrate that VI-HLL provides good measurement accuracy even in very tight memory space of less than 1 bit per flow.", acknowledgement = ack-nhfb, articleno = "15", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Bondorf:2017:QCD, author = "Steffen Bondorf and Paul Nikolaus and Jens B. Schmitt", title = "Quality and Cost of Deterministic Network Calculus: Design and Evaluation of an Accurate and Fast Analysis", journal = j-POMACS, volume = "1", number = "1", pages = "16:1--16:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084453", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084453", abstract = "Networks are integral parts of modern safety-critical systems and certification demands the provision of guarantees for data transmissions. Deterministic Network Calculus (DNC) can compute a worst-case bound on a data flow's end-to-end delay. Accuracy of DNC results has been improved steadily, resulting in two DNC branches: the classical algebraic analysis and the more recent optimization-based analysis. The optimization-based branch provides a theoretical solution for tight bounds. Its computational cost grows, however, (possibly super-)exponentially with the network size. Consequently, a heuristic optimization formulation trading accuracy against computational costs was proposed. In this article, we challenge optimization-based DNC with a new algebraic DNC algorithm. We show that: no current optimization formulation scales well with the network size and algebraic DNC can be considerably improved in both aspects, accuracy and computational cost. 
To that end, we contribute a novel DNC algorithm that transfers the optimization's search for best attainable delay bounds to algebraic DNC. It achieves a high degree of accuracy and our novel efficiency improvements reduce the cost of the analysis dramatically. In extensive numerical experiments, we observe that our delay bounds deviate from the optimization-based ones by only 1.142\% on average while computation times simultaneously decrease by several orders of magnitude.", acknowledgement = ack-nhfb, articleno = "16", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Gast:2017:EVE, author = "Nicolas Gast", title = "Expected Values Estimated via Mean-Field Approximation are {$ 1 / N $}-Accurate", journal = j-POMACS, volume = "1", number = "1", pages = "17:1--17:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084454", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084454", abstract = "Mean-field approximation is a powerful tool to study large-scale stochastic systems such as data-centers --- one example being the famous power of two-choice paradigm. It is shown in the literature that under quite general conditions, the empirical measure of a system of $N$ interacting objects converges at rate $ O(1 / \sqrt {N}) $ to a deterministic dynamical system, called its mean-field approximation. In this paper, we revisit the accuracy of mean-field approximation by focusing on expected values. We show that, under almost the same general conditions, the expectation of any performance functional converges at rate $ O(1 / N) $ to its mean-field approximation. Our result applies for finite and infinite-dimensional mean-field models. 
We also develop a new perturbation theory argument that shows that the result holds for the stationary regime if the dynamical system is asymptotically exponentially stable. We provide numerical experiments that demonstrate that this rate of convergence is tight and that illustrate the necessity of our conditions. As an example, we apply our result to the classical two-choice model. By combining our theory with numerical experiments, we claim that, as the load $ \rho $ goes to 1, the average queue length of a two-choice system with $N$ servers is $ \log_2 1 / (1 - \rho) + 1 / (2 N (1 - \rho)) + O(1 / N^2)$.", acknowledgement = ack-nhfb, articleno = "17", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Wang:2017:CMP, author = "Brandon Wang and Xiaoye Li and Leandro P. de Aguiar and Daniel S. Menasche and Zubair Shafiq", title = "Characterizing and Modeling Patching Practices of Industrial Control Systems", journal = j-POMACS, volume = "1", number = "1", pages = "18:1--18:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084455", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084455", abstract = "Industrial Control Systems (ICS) are widely deployed in mission critical infrastructures such as manufacturing, energy, and transportation. The mission critical nature of ICS devices poses important security challenges for ICS vendors and asset owners. In particular, the patching of ICS devices is usually deferred to scheduled production outages so as to prevent potential operational disruption of critical systems. 
Unfortunately, anecdotal evidence suggests that ICS devices are riddled with security vulnerabilities that are not patched in a timely manner, which leaves them vulnerable to exploitation by hackers, nation states, and hacktivist organizations. In this paper, we present the results from our longitudinal measurement and characterization study of ICS patching behavior. Our study is based on IP scan data collected from Shodan over the duration of three years for more than 500 known industrial ICS protocols and products. Our longitudinal measurements reveal the impact of vulnerability disclosures on ICS patching. Our analysis of more than 100 thousand Internet-exposed ICS devices reveals that about 50\% upgrade to newer patched versions within 60 days of a vulnerability disclosure. Based on our measurement and analysis, we further propose a variation of the Bass model to forecast the patching behavior of ICS devices. The evaluation shows that our proposed models have comparable prediction accuracy when contrasted against traditional ARIMA timeseries forecasting models, while requiring less parameters and being amenable to direct physical interpretation.", acknowledgement = ack-nhfb, articleno = "18", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Formby:2017:CSP, author = "David Formby and Anwar Walid and Raheem Beyah", title = "A Case Study in Power Substation Network Dynamics", journal = j-POMACS, volume = "1", number = "1", pages = "19:1--19:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084456", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084456", abstract = "The modern world is becoming increasingly dependent on computing and communication technology to function, but 
unfortunately its application and impact on areas such as critical infrastructure and industrial control system (ICS) networks remains to be thoroughly studied. Significant research has been conducted to address the myriad security concerns in these areas, but they are virtually all based on artificial testbeds or simulations designed on assumptions about their behavior either from knowledge of traditional IT networking or from basic principles of ICS operation. In this work, we provide the most detailed characterization of an example ICS to date in order to determine if these common assumptions hold true. A live power distribution substation is observed over the course of two and a half years to measure its behavior and evolution over time. Then, a horizontal study is conducted that compared this behavior with three other substations from the same company. Although most predictions were found to be correct, some unexpected behavior was observed that highlights the fundamental differences between ICS and IT networks including round trip times dominated by processing speed as opposed to network delay, several well known TCP features being largely irrelevant, and surprisingly large jitter from devices running real-time operating systems. The impact of these observations is discussed in terms of generality to other embedded networks, network security applications, and the suitability of the TCP protocol for this environment.", acknowledgement = ack-nhfb, articleno = "19", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Nguyen:2017:OIC, author = "Hung T. Nguyen and Tri P. Nguyen and Tam N. Vu and Thang N. 
Dinh", title = "Outward Influence and Cascade Size Estimation in Billion-scale Networks", journal = j-POMACS, volume = "1", number = "1", pages = "20:1--20:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084457", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084457", abstract = "Estimating cascade size and nodes' influence is a fundamental task in social, technological, and biological networks. Yet this task is extremely challenging due to the sheer size and the structural heterogeneity of networks. We investigate a new influence measure, termed outward influence (OI), defined as the (expected) number of nodes that a subset of nodes S will activate, excluding the nodes in S. Thus, OI equals, the de facto standard measure, influence spread of S minus | S |. OI is not only more informative for nodes with small influence, but also, critical in designing new effective sampling and statistical estimation methods. Based on OI, we propose SIEA/SOIEA, novel methods to estimate influence spread/outward influence at scale and with rigorous theoretical guarantees. The proposed methods are built on two novel components (1) IICP an important sampling method for outward influence; and (2) RSA, a robust mean estimation method that minimize the number of samples through analyzing variance and range of random variables. Compared to the state-of-the art for influence estimation, SIEA is $ \Omega (\log^4 n) $ times faster in theory and up to several orders of magnitude faster in practice. For the first time, influence of nodes in the networks of billions of edges can be estimated with high accuracy within a few minutes. Our comprehensive experiments on real-world networks also give evidence against the popular practice of using a fixed number, e.g. 
10K or 20K, of samples to compute the 'ground truth' for influence spread.", acknowledgement = ack-nhfb, articleno = "20", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Choi:2017:EDL, author = "Wonil Choi and Mohammad Arjomand and Myoungsoo Jung and Mahmut Kandemir", title = "Exploiting Data Longevity for Enhancing the Lifetime of Flash-based Storage Class Memory", journal = j-POMACS, volume = "1", number = "1", pages = "21:1--21:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084458", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084458", abstract = "Storage-class memory (SCM) combines the benefits of a solid-state memory, such as high-performance and robustness, with the archival capabilities and low cost of conventional hard-disk magnetic storage. Among candidate solid-state nonvolatile memory technologies that could potentially be used to construct SCM, flash memory is a well-established technology and have been widely used in commercially available SCM incarnations. Flash-based SCM enables much better tradeoffs between performance, space and power than disk-based systems. However, write endurance is a significant challenge for a flash-based SCM (each act of writing a bit may slightly damage a cell, so one flash cell can be written $ 10^4 $--$ 10^5 $ times, depending on the flash technology, before it becomes unusable). This is a well-documented problem and has received a lot of attention by manufactures that are using some combination of write reduction and wear-leveling techniques for achieving longer lifetime. 
In an effort to improve flash lifetime, first, by quantifying data longevity in an SCM, we show that a majority of the data stored in a solid-state SCM do not require long retention times provided by flash memory (i.e., up to 10 years in modern devices); second, by exploiting retention time relaxation, we propose a novel mechanism, called Dense-SLC (D-SLC), which enables us perform multiple writes into a cell during each erase cycle for lifetime extension; and finally, we discuss the required changes in the flash management software (FTL) in order to use D-SLC mechanism for extending the lifetime of the solid-state part of an SCM. Using an extensive simulation-based analysis of an SLC flash-based SCM, we demonstrate that D-SLC is able to significantly improve device lifetime (between 5.1X and 8.6X) with no performance overhead and also very small changes at the FTL software.", acknowledgement = ack-nhfb, articleno = "21", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Venkatakrishnan:2017:DRB, author = "Shaileshh Bojja Venkatakrishnan and Giulia Fanti and Pramod Viswanath", title = "{Dandelion}: Redesigning the {Bitcoin} Network for Anonymity", journal = j-POMACS, volume = "1", number = "1", pages = "22:1--22:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084459", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084459", abstract = "Bitcoin and other cryptocurrencies have surged in popularity over the last decade. Although Bitcoin does not claim to provide anonymity for its users, it enjoys a public perception of being a privacy preserving financial system. 
In reality, cryptocurrencies publish users' entire transaction histories in plaintext, albeit under a pseudonym; this is required for transaction validation. Therefore, if a user's pseudonym can be linked to their human identity, the privacy fallout can be significant. Recently, researchers have demonstrated deanonymization attacks that exploit weaknesses in the Bitcoin network's peer-to-peer (P2P) networking protocols. In particular, the P2P network currently forwards content in a structured way that allows observers to deanonymize users. In this work, we redesign the P2P network from first principles with the goal of providing strong, provable anonymity guarantees. We propose a simple networking policy called Dandelion which provides quasi-optimal, network-wide anonymity, with minimal cost to the network's utility. We also discuss practical implementation challenges and propose heuristic solutions.", acknowledgement = ack-nhfb, articleno = "22", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Zhang:2017:OPP, author = "Zijun Zhang and Zongpeng Li and Chuan Wu", title = "Optimal Posted Prices for Online Cloud Resource Allocation", journal = j-POMACS, volume = "1", number = "1", pages = "23:1--23:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084460", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084460", abstract = "We study online resource allocation in a cloud computing platform through posted pricing: The cloud provider publishes a unit price for each resource type, which may vary over time; upon arrival at the cloud system, a cloud user either takes the current prices, renting resources to execute its job, or refuses the prices without running its job there. 
We design pricing functions based on current resource utilization ratios, in a wide array of demand-supply relationships and resource occupation durations, and prove worst-case competitive ratios in social welfare. In the basic case of a single-type, non-recycled resource (allocated resources are not later released for reuse), we prove that our pricing function design is optimal, in that it achieves the smallest competitive ratio among all possible pricing functions. Insights obtained from the basic case are then used to generalize the pricing functions to more realistic cloud systems with multiple types of resources, where a job occupies allocated resources for a number of time slots till completion, upon which time the resources are returned to the cloud resource pool.", acknowledgement = ack-nhfb, articleno = "23", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Sun:2017:ASM, author = "Wen Sun and Veronique Simon and Sebastien Monnet and Philippe Robert and Pierre Sens", title = "Analysis of a Stochastic Model of Replication in Large Distributed Storage Systems: A Mean-Field Approach", journal = j-POMACS, volume = "1", number = "1", pages = "24:1--24:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084462", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084462", abstract = "Distributed storage systems such as Hadoop File System or Google File System (GFS) ensure data availability and durability using replication. Persistence is achieved by replicating the same data block on several nodes, and ensuring that a minimum number of copies are available on the system at any time. 
Whenever the contents of a node are lost, for instance due to a hard disk crash, the system regenerates the data blocks stored before the failure by transferring them from the remaining replicas. This paper is focused on the analysis of the efficiency of replication mechanism that determines the location of the copies of a given file at some server. The variability of the loads of the nodes of the network is investigated for several policies. Three replication mechanisms are tested against simulations in the context of a real implementation of such a system: Random, Least Loaded and Power of Choice. The simulations show that some of these policies may lead to quite unbalanced situations: if $ \beta $ is the average number of copies per node it turns out that, at equilibrium, the load of the nodes may exhibit a high variability. It is shown in this paper that a simple variant of a power of choice type algorithm has a striking effect on the loads of the nodes: at equilibrium, the distribution of the load of a node has a bounded support, most of nodes have a load less than $ 2 \beta $ which is an interesting property for the design of the storage space of these systems. Stochastic models are introduced and investigated to explain this interesting phenomenon.", acknowledgement = ack-nhfb, articleno = "24", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Mukherjee:2017:OSE, author = "Debankur Mukherjee and Souvik Dhara and Sem C. Borst and Johan S. H. 
van Leeuwaarden", title = "Optimal Service Elasticity in Large-Scale Distributed Systems", journal = j-POMACS, volume = "1", number = "1", pages = "25:1--25:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084463", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084463", abstract = "A fundamental challenge in large-scale cloud networks and data centers is to achieve highly efficient server utilization and limit energy consumption, while providing excellent user-perceived performance in the presence of uncertain and time-varying demand patterns. Auto-scaling provides a popular paradigm for automatically adjusting service capacity in response to demand while meeting performance targets, and queue-driven auto-scaling techniques have been widely investigated in the literature. In typical data center architectures and cloud environments however, no centralized queue is maintained, and load balancing algorithms immediately distribute incoming tasks among parallel queues. In these distributed settings with vast numbers of servers, centralized queue-driven auto-scaling techniques involve a substantial communication overhead and major implementation burden, or may not even be viable at all. Motivated by the above issues, we propose a joint auto-scaling and load balancing scheme which does not require any global queue length information or explicit knowledge of system parameters, and yet provides provably near-optimal service elasticity. We establish the fluid-level dynamics for the proposed scheme in a regime where the total traffic volume and nominal service capacity grow large in proportion. The fluid-limit results show that the proposed scheme achieves asymptotic optimality in terms of user-perceived delay performance as well as energy consumption. 
Specifically, we prove that both the waiting time of tasks and the relative energy portion consumed by idle servers vanish in the limit. At the same time, the proposed scheme operates in a distributed fashion and involves only constant communication overhead per task, thus ensuring scalability in massive data center operations. Extensive simulation experiments corroborate the fluid-limit results, and demonstrate that the proposed scheme can match the user performance and energy consumption of state-of-the-art approaches that do take full advantage of a centralized queue.", acknowledgement = ack-nhfb, articleno = "25", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Lee:2017:DIL, author = "Donghyuk Lee and Samira Khan and Lavanya Subramanian and Saugata Ghose and Rachata Ausavarungnirun and Gennady Pekhimenko and Vivek Seshadri and Onur Mutlu", title = "Design-Induced Latency Variation in Modern {DRAM} Chips: Characterization, Analysis, and Latency Reduction Mechanisms", journal = j-POMACS, volume = "1", number = "1", pages = "26:1--26:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084464", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084464", abstract = "Variation has been shown to exist across the cells within a modern DRAM chip. Prior work has studied and exploited several forms of variation, such as manufacturing-process- or temperature-induced variation. We empirically demonstrate a new form of variation that exists within a real DRAM chip, induced by the design and placement of different components in the DRAM chip: different regions in DRAM, based on their relative distances from the peripheral structures, require different minimum access latencies for reliable operation. 
In particular, we show that in most real DRAM chips, cells closer to the peripheral structures can be accessed much faster than cells that are farther. We call this phenomenon design-induced variation in DRAM. Our goals are to (i) understand design-induced variation that exists in real, state-of-the-art DRAM chips, (ii) exploit it to develop low-cost mechanisms that can dynamically find and use the lowest latency at which to operate a DRAM chip reliably, and, thus, (iii) improve overall system performance while ensuring reliable system operation. To this end, we first experimentally demonstrate and analyze design-induced variation in modern DRAM devices by testing and characterizing 96 DIMMs (768 DRAM chips). Our characterization identifies DRAM regions that are vulnerable to errors, if operated at lower latency, and finds consistency in their locations across a given DRAM chip generation, due to design-induced variation. Based on our extensive experimental analysis, we develop two mechanisms that reliably reduce DRAM latency. First, DIVA Profiling uses runtime profiling to dynamically identify the lowest DRAM latency that does not introduce failures. DIVA Profiling exploits design-induced variation and periodically profiles only the vulnerable regions to determine the lowest DRAM latency at low cost. It is the first mechanism to dynamically determine the lowest latency that can be used to operate DRAM reliably. DIVA Profiling reduces the latency of read/write requests by 35.1\% / 57.8\%, respectively, at 55$^\circ $C. Our second mechanism, DIVA Shuffling, shuffles data such that values stored in vulnerable regions are mapped to multiple error-correcting code (ECC) codewords. As a result, DIVA Shuffling can correct 26\% more multi-bit errors than conventional ECC. 
Combined together, our two mechanisms reduce read/write latency by 40.0\%/60.5\%, which translates to an overall system performance improvement of 14.7\%/13.7\%/13.8\% (in 2-/4-/8-core systems) across a variety of workloads, while ensuring reliable operation.", acknowledgement = ack-nhfb, articleno = "26", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", } @Article{Avrachenkov:2017:LCA, author = "Konstantin Avrachenkov and Jasper Goseling and Berksan Serbetci", title = "A Low-Complexity Approach to Distributed Cooperative Caching with Geographic Constraints", journal = j-POMACS, volume = "1", number = "1", pages = "27:1--27:??", month = jun, year = "2017", CODEN = "????", DOI = "https://doi.org/10.1145/3084465", ISSN = "2476-1249", ISSN-L = "2476-1249", bibdate = "Fri Jun 16 09:11:52 MDT 2017", bibsource = "http://www.math.utah.edu/pub/tex/bib/pomacs.bib", URL = "http://dl.acm.org/citation.cfm?id=3084465", abstract = "We consider caching in cellular networks in which each base station is equipped with a cache that can store a limited number of files. The popularity of the files is known and the goal is to place files in the caches such that the probability that a user at an arbitrary location in the plane will find the file that she requires in one of the covering caches is maximized. We develop distributed asynchronous algorithms for deciding which contents to store in which cache. Such cooperative algorithms require communication only between caches with overlapping coverage areas and can operate in asynchronous manner. The development of the algorithms is principally based on an observation that the problem can be viewed as a potential game. Our basic algorithm is derived from the best response dynamics. 
We demonstrate that the complexity of each best response step is independent of the number of files, linear in the cache capacity and linear in the maximum number of base stations that cover a certain area. Then, we show that the overall algorithm complexity for a discrete cache placement is polynomial in both network size and catalog size. In practical examples, the algorithm converges in just a few iterations. Also, in most cases of interest, the basic algorithm finds the best Nash equilibrium corresponding to the global optimum. We provide two extensions of our basic algorithm based on stochastic and deterministic simulated annealing which find the global optimum. Finally, we demonstrate the hit probability evolution on real and synthetic networks numerically and show that our distributed caching algorithm performs significantly better than storing the most popular content, probabilistic content placement policy and Multi-LRU caching policies.", acknowledgement = ack-nhfb, articleno = "27", fjournal = "Proceedings of the ACM on Measurement and Analysis of Computing Systems (POMACS)", journal-URL = "http://dl.acm.org/pub.cfm?id=J1567", }