%%% NOTE(review): stray "Valid HTML 4.0! Valid CSS!" badge text from a
%%% web-page capture; commented out so it cannot interfere with BibTeX.
%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.04",
%%%     date            = "13 December 2012",
%%%     time            = "17:12:03 MST",
%%%     filename        = "supercomputing2012.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe/",
%%%     checksum        = "14292 4061 23619 229572",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "BibTeX; bibliography; SC'12; SC'2012;
%%%                        Supercomputing '2012",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     abstract        = "",
%%%     docstring       = "This is a complete bibliography of papers
%%%                        published in the proceedings of
%%%                        Supercomputing '2012.
%%%                        The conference World-Wide Web site is
%%%                            http://sc12.supercomputing.org/
%%%                        The organizers of this conference series
%%%                        maintain a World-Wide Web site at
%%%                            http://www.supercomp.org/
%%%                        where pointers to Web pages for the
%%%                        conferences from 1988 to date may be found.
%%%                        At version 1.04, the year coverage looked
%%%                        like this:
%%%                             2012 ( 106)
%%%                             InProceedings:  105
%%%                             Proceedings:      1
%%%                             Total entries:  106
%%%                        In this bibliography, entries are sorted in
%%%                        order of PDF file numbers.
%%%                        The on-line electronic proceedings do not
%%%                        contain sequential page numbers, although
%%%                        there is an ISBN assigned for the
%%%                        proceedings.  A pagecount field is given with
%%%                        each entry, extracted from the PDF file: some
%%%                        of the articles lack page numbers altogether,
%%%                        others number pages 1, 2, 3, ...
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
    "\ifx \undefined \circled \def \circled #1{(#1)}\fi" #
    "\ifx \undefined \reg \def \reg {\circled{R}}\fi" #
    "\ifx \undefined \TM \def \TM {${}^{\sc TM}$} \fi"

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% Acknowledgement string: the \path|...| arguments were stripped by a
%%% text capture; restored to Beebe's standard contact addresses.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Publishers and their addresses:
%%% ACM Press, publisher of the SC'12 electronic proceedings (see the
%%% Hollingsworth:2012:SPI crossref target).
@String{pub-ACM                 = "ACM Press"}
@String{pub-ACM:adr             = "New York, NY 10036, USA"}

%%% IEEE Computer Society Press, co-sponsor of the SC conference series.
%%% NOTE(review): this is the historical Silver Spring address used
%%% throughout Beebe's bibliography collection; kept for consistency.
@String{pub-IEEE                = "IEEE Computer Society Press"}
@String{pub-IEEE:adr            = "1109 Spring Street, Suite 300,
                                  Silver Spring, MD 20910, USA"}

%%% ====================================================================
%%% Bibliography entries from main SC'12 Proceedings
  author =       "Jatin Chhugani and Changkyu Kim and Hemant Shukla and
                 Jongsoo Park and Pradeep Dubey and John Shalf and Horst
                  D. Simon",
  title =        "Billion-particle {SIMD}-friendly two-point correlation
                 on large-scale {HPC} cluster systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "1:1--1:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Two-point Correlation Function (TPCF) is widely used
                 in astronomy to characterize the distribution of
                 matter/energy in the Universe, and help derive the
                 physics that can trace back to the creation of the
                 universe. However, it is prohibitively slow for current
                 sized datasets, and would continue to be a critical
                 bottleneck with the trend of increasing dataset sizes
                 to billions of particles and more, which makes TPCF a
                 compelling benchmark application for future exa-scale
                 architectures. State-of-the-art TPCF implementations do
                 not map well to the underlying SIMD hardware, and also
                 suffer from load-imbalance for large core counts. In
                 this paper, we present a novel SIMD-friendly histogram
                 update algorithm that exploits the spatial locality of
                 histogram updates to achieve near-linear SIMD scaling.
                 We also present a load-balancing scheme that combines
                 domain-specific initial static division of work and
                 dynamic task migration across nodes to effectively
                 balance computation across nodes. Using Zin
                 supercomputer at Lawrence Livermore National Laboratory
                 (25,600 cores of Intel\reg{} Xeon\reg{} E5-2670, each
                 with 256-bit SIMD), we achieve 90\% parallel efficiency
                 and 96\% SIMD efficiency, and perform TPCF computation
                 on a 1.7 billion particle dataset in 5.3 hours (at
                 least 35 x faster than previous approaches). In terms
                 of cost per performance (measured in flops/\$), we
                 achieve at least an order-of-magnitude (11.1 x) higher
                 flops/\$ as compared to the best known results [1].
                 Consequently, we now have line-of-sight to achieving
                 the processing power for correlation computation to
                 process billion+ particles telescopic data.",
  acknowledgement = ack-nhfb,
  articleno =    "1",

  author =       "Arthur A. Mirin and David F. Richards and James N.
                 Glosli and Erik W. Draeger and Bor Chan and Jean-luc
                 Fattebert and William D. Krauss and Tomas Oppelstrup
                 and John Jeremy Rice and John A. Gunnels and
                 Viatcheslav Gurev and Changhoan Kim and John Magerlein
                 and Matthias Reumann and Hui-Fang Wen",
  title =        "Toward real-time modeling of human heart ventricles at
                 cellular resolution: simulation of drug-induced
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "2:1--2:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "We have developed a highly efficient and scalable
                 cardiac electrophysiology simulation capability that
                 supports groundbreaking resolution and detail to
                 elucidate the mechanisms of sudden cardiac death from
                 arrhythmia. We can simulate thousands of heartbeats at
                 a resolution of 0.1 mm, comparable to the size of
                 cardiac cells, thereby enabling scientific inquiry not
                 previously possible. Based on scaling results from the
                 partially deployed Sequoia IBM Blue Gene/Q machine at
                 Lawrence Livermore National Laboratory and planned
                 optimizations, we estimate that by SC12 we will
                 simulate 8--10 heartbeats per minute --- a
                 time-to-solution 400--500 times faster than the
                 state-of-the-art. Performance between 8 and 11 PFlop/s
                 on the full 1,572,864 cores is anticipated,
                 representing 40--55 percent of peak. The power of the
                 model is demonstrated by illuminating the subtle
                 arrhythmogenic mechanisms of anti-arrhythmic drugs that
                 paradoxically increase arrhythmias in some patient
  acknowledgement = ack-nhfb,
  articleno =    "2",

  author =       "Tan Bui-Thanh and Carsten Burstedde and Omar Ghattas
                 and James Martin and Georg Stadler and Lucas C.
  title =        "Extreme-scale {UQ} for {Bayesian} inverse problems
                 governed by {PDEs}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "3:1--3:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Quantifying uncertainties in large-scale simulations
                 has emerged as the central challenge facing CS{\&}E.
                 When the simulations require supercomputers, and
                 uncertain parameter dimensions are large, conventional
                 UQ methods fail. Here we address uncertainty
                 quantification for large-scale inverse problems in a
                 Bayesian inference framework: given data and model
                 uncertainties, find the pdf describing parameter
                 uncertainties. To overcome the curse of dimensionality
                 of conventional methods, we exploit the fact that the
                 data are typically informative about low-dimensional
                 manifolds of parameter space to construct low rank
                 approximations of the covariance matrix of the
                 posterior pdf via a matrix-free randomized method. We
                 obtain a method that scales independently of the
                 forward problem dimension, the uncertain parameter
                 dimension, the data dimension, and the number of cores.
                 We apply the method to the Bayesian solution of an
                 inverse problem in 3D global seismic wave propagation
                 with over one million uncertain earth model parameters,
                 630 million wave propagation unknowns, on up to 262K
                 cores, for which we obtain a factor of over 2000
                 reduction in problem dimension. This makes UQ tractable
                 for the inverse problem.",
  acknowledgement = ack-nhfb,
  articleno =    "3",

  author =       "Salman Habib and Vitali Morozov and Hal Finkel and
                 Adrian Pope and Katrin Heitmann and Kalyan Kumaran and
                 Tom Peterka and Joe Insley and David Daniel and
                 Patricia Fasel and Nicholas Frontiere and Zarija
  title =        "The universe at extreme scale: multi-petaflop sky
                 simulation on the {BG/Q}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "4:1--4:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Remarkable observational advances have established a
                 compelling cross-validated model of the Universe. Yet,
                 two key pillars of this model --- dark matter and dark
                 energy --- remain mysterious. Next-generation sky
                 surveys will map billions of galaxies to explore the
                 physics of the 'Dark Universe'. Science requirements
                 for these surveys demand simulations at extreme scales;
                 these will be delivered by the HACC (Hybrid/Hardware
                 Accelerated Cosmology Code) framework. HACC's novel
                 algorithmic structure allows tuning across diverse
                 architectures, including accelerated and multi-core
                 systems. On the IBM BG/Q, HACC attains unprecedented
                 scalable performance --- currently 6.23 PFlops at 62\%
                 of peak and 92\% parallel efficiency on 786,432 cores
                 (48 racks) --- at extreme problem sizes with up to
                 almost two trillion particles, larger than any
                 cosmological simulation yet performed. HACC simulations
                 at these scales will for the first time enable tracking
                 individual galaxies over the entire volume of a
                 cosmological survey.",
  acknowledgement = ack-nhfb,
  articleno =    "4",

  author =       "Tomoaki Ishiyama and Keigo Nitadori and Junichiro
  title =        "4.45 Pflops astrophysical {$N$}-body simulation on {K}
                 computer: the gravitational trillion-body problem",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "5:1--5:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "As an entry for the 2012 Gordon-Bell performance
                 prize, we report performance results of astrophysical N
                 -body simulations of one trillion particles performed
                 on the full system of K computer. This is the first
                 gravitational trillion-body simulation in the world. We
                 describe the scientific motivation, the numerical
                 algorithm, the parallelization strategy, and the
                 performance analysis. Unlike many previous Gordon-Bell
                 prize winners that used the tree algorithm for
                 astrophysical N -body simulations, we used the hybrid
                 TreePM method, for similar level of accuracy in which
                 the short-range force is calculated by the tree
                 algorithm, and the long-range force is solved by the
                 particle-mesh algorithm. We developed a highly-tuned
                 gravity kernel for short-range forces, and a novel
                 communication algorithm for long-range forces. The
                 average performance on 24576 and 82944 nodes of K
                 computer are 1.53 and 4.45 Pflops, which correspond to
                 49\% and 42\% of the peak speed.",
  acknowledgement = ack-nhfb,
  articleno =    "5",

  author =       "Robert Henschel and Stephen Simms and David Hancock
                 and Scott Michael and Tom Johnson and Nathan Heald and
                 Thomas William and Donald Berry and Matt Allen and
                 Richard Knepper and Matthew Davy and Matthew Link and
                 Craig A. Stewart",
  title =        "Demonstrating {Lustre} over a {100Gbps} wide area
                 network of 3,500km",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "6:1--6:8",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "As part of the SCinet Research Sandbox at the
                 Supercomputing 2011 conference, Indiana University (IU)
                 demonstrated use of the Lustre high performance
                 parallel file system over a dedicated 100 Gbps wide
                 area network (WAN) spanning more than 3,500 km (2,175
                 mi). This demonstration functioned as a proof of
                 concept and provided an opportunity to study Lustre's
                 performance over a 100 Gbps WAN. To characterize the
                 performance of the network and file system, low level
                 iperf network tests, file system tests with the IOR
                 benchmark, and a suite of real-world applications
                 reading and writing to the file system were run over a
                 latency of 50.5 ms. In this article we describe the
                 configuration and constraints of the demonstration and
                 outline key findings.",
  acknowledgement = ack-nhfb,
  articleno =    "6",

  author =       "Dirk Meister and J{\"u}rgen Kaiser and Andre Brinkmann
                 and Toni Cortes and Michael Kuhn and Julian Kunkel",
  title =        "A study on data deduplication in {HPC} storage
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "7:1--7:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Deduplication is a storage saving technique that is
                 highly successful in enterprise backup environments. On
                 a file system, a single data block might be stored
                 multiple times across different files, for example,
                 multiple versions of a file might exist that are mostly
                 identical. With deduplication, this data replication is
                 localized and redundancy is removed --- by storing data
                 just once, all files that use identical regions refer
                 to the same unique data. The most common approach
                 splits file data into chunks and calculates a
                 cryptographic fingerprint for each chunk. By checking
                 if the fingerprint has already been stored, a chunk is
                 classified as redundant or unique. Only unique chunks
                 are stored. This paper presents the first study on the
                 potential of data deduplication in HPC centers, which
                 belong to the most demanding storage producers. We have
                 quantitatively assessed this potential for capacity
                 reduction for 4 data centers (BSC, DKRZ, RENCI, RWTH).
                 In contrast to previous deduplication studies focusing
                 mostly on backup data, we have analyzed over one PB
                 (1212 TB) of online file system data. The evaluation
                 shows that typically 20\% to 30\% of this online data
                 can be removed by applying data deduplication
                 techniques, peaking up to 70\% for some data sets. This
                 reduction can only be achieved by a subfile
                 deduplication approach, while approaches based on
                 whole-file comparisons only lead to small capacity
  acknowledgement = ack-nhfb,
  articleno =    "7",

  author =       "Bing Xie and Jeffrey Chase and David Dillow and Oleg
                 Drokin and Scott Klasky and Sarp Oral and Norbert
  title =        "Characterizing output bottlenecks in a supercomputer",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "8:1--8:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Supercomputer I/O loads are often dominated by writes.
                 HPC (High Performance Computing) file systems are
                 designed to absorb these bursty outputs at high
                 bandwidth through massive parallelism. However, the
                 delivered write bandwidth often falls well below the
                 peak. This paper characterizes the data absorption
                 behavior of a center-wide shared Lustre parallel file
                 system on the Jaguar supercomputer. We use a
                 statistical methodology to address the challenges of
                 accurately measuring a shared machine under production
                 load and to obtain the distribution of bandwidth across
                 samples of compute nodes, storage targets, and time
                 intervals. We observe and quantify limitations from
                 competing traffic, contention on storage servers and
                 I/O routers, concurrency limitations in the client
                 compute node operating systems, and the impact of
                 variance (stragglers) on coupled output such as
                 striping. We then examine the implications of our
                 results for application performance and the design of
                 I/O middleware systems on shared supercomputers.",
  acknowledgement = ack-nhfb,
  articleno =    "8",

  author =       "Dheya Mustafa and Rudolf Eigenmann",
  title =        "Portable section-level tuning of compiler parallelized
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "9:1--9:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Automatic parallelization of sequential programs
                 combined with tuning is an alternative to manual
                 parallelization. This method has the potential to
                 substantially increase productivity and is thus of
                 critical importance for exploiting the increased
                 computational power of today's multicores. A key
                 difficulty is that parallelizing compilers are
                 generally unable to estimate the performance impact of
                 an optimization on a whole program or a program section
                 at compile time; hence, the ultimate performance
                 decision today rests with the developer. Building an
                 autotuning system to remedy this situation is not a
                 trivial task. This work presents a portable empirical
                 autotuning system that operates at program-section
                 granularity and partitions the compiler options into
                 groups that can be tuned independently. To our
                 knowledge, this is the first approach delivering an
                 autoparallelization system that ensures performance
                 improvements for nearly all programs, eliminating the
                 users' need to experiment with such tools to strive for
                 highest application performance.",
  acknowledgement = ack-nhfb,
  articleno =    "9",

  author =       "Herbert Jordan and Peter Thoman and Juan J. Durillo
                 and Simone Pellegrini and Philipp Gschwandtner and
                 Thomas Fahringer and Hans Moritsch",
  title =        "A multi-objective auto-tuning framework for parallel
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "10:1--10:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "In this paper we introduce a multi-objective
                 auto-tuning framework comprising compiler and runtime
                 components. Focusing on individual code regions, our
                 compiler uses a novel search technique to compute a set
                 of optimal solutions, which are encoded into a
                 multi-versioned executable. This enables the runtime
                 system to choose specifically tuned code versions when
                 dynamically adjusting to changing circumstances. We
                 demonstrate our method by tuning loop tiling in
                 cache-sensitive parallel programs, optimizing for both
                 runtime and efficiency. Our static optimizer finds
                 solutions matching or surpassing those determined by
                 exhaustively sampling the search space on a regular
                 grid, while using less than 4\% of the computational
                 effort on average. Additionally, we show that
                 parallelism-aware multi-versioning approaches like our
                 own gain a performance improvement of up to 70\% over
                 solutions tuned for only one specific number of
  acknowledgement = ack-nhfb,
  articleno =    "10",

  author =       "Matthias Christen and Olaf Schenk and Yifeng Cui",
  title =        "{Patus} for convenient high-performance stencils:
                 evaluation in earthquake simulations",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "11:1--11:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Patus is a code generation and auto-tuning framework
                 for stencil computations targeting modern multi and
                 many-core processors. The goals of the framework are
                 productivity and portability for achieving high
                 performance on the target platform. Its stencil
                 specification language allows the programmer to express
                 the computation in a concise way independently of
                 hardware architecture-specific details. Thus, it
                 increases the programmer productivity by removing the
                 need for manual low-level tuning. We illustrate the
                 impact of the stencil code generation in seismic
                 applications, for which both weak and strong scaling
                 are important. We evaluate the performance by focusing
                 on a scalable discretization of the wave equation and
                 testing complex simulation types of the AWP-ODC code to
                 aim at excellent parallel efficiency, preparing for
                 petascale 3-D earthquake calculations.",
  acknowledgement = ack-nhfb,
  articleno =    "11",

  author =       "Scott Beamer and Krste Asanovi{\'c} and David
  title =        "Direction-optimizing breadth-first search",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "12:1--12:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Breadth-First Search is an important kernel used by
                 many graph-processing applications. In many of these
                 emerging applications of BFS, such as analyzing social
                 networks, the input graphs are low-diameter and
                 scale-free. We propose a hybrid approach that is
                 advantageous for low-diameter graphs, which combines a
                 conventional top-down algorithm along with a novel
                 bottom-up algorithm. The bottom-up algorithm can
                 dramatically reduce the number of edges examined, which
                 in turn accelerates the search as a whole. On a
                 multi-socket server, our hybrid approach demonstrates
                 speedups of 3.3--7.8 on a range of standard synthetic
                 graphs and speedups of 2.4--4.6 on graphs from real
                 social networks when compared to a strong baseline. We
                 also typically double the performance of prior leading
                 shared memory (multicore and GPU) implementations.",
  acknowledgement = ack-nhfb,
  articleno =    "12",

  author =       "Fabio Checconi and Fabrizio Petrini and Jeremiah
                 Willcock and Andrew Lumsdaine and Anamitra Roy
                 Choudhury and Yogish Sabharwal",
  title =        "Breaking the speed and scalability barriers for graph
                 exploration on distributed-memory machines",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "13:1--13:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "In this paper, we describe the challenges involved in
                 designing a family of highly-efficient Breadth-First
                 Search (BFS) algorithms and in optimizing these
                 algorithms on the latest two generations of Blue Gene
                 machines, Blue Gene/P and Blue Gene/Q. With our recent
                 winning Graph 500 submissions in November 2010, June
                 2011, and November 2011, we have achieved unprecedented
                 scalability results in both space and size. On Blue
                 Gene/P, we have been able to parallelize a scale 38
                 problem with 2$^{38}$ vertices and 2$^{42}$ edges on
                 131,072 processing cores. Using only four racks of an
                 experimental configuration of Blue Gene/Q, we have
                 achieved a processing rate of 254 billion edges per
                 second on 65,536 processing cores. This paper describes
                 the algorithmic design and the main classes of
                 optimizations that we have used to achieve these
  acknowledgement = ack-nhfb,
  articleno =    "13",

  author =       "Nadathur Satish and Changkyu Kim and Jatin Chhugani
                 and Pradeep Dubey",
  title =        "Large-scale energy-efficient graph traversal: a path
                 to efficient data-intensive supercomputing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "14:1--14:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Graph traversal is a widely used algorithm in a
                 variety of fields, including social networks, business
                 analytics, and high-performance computing among others.
                 There has been a push for HPC machines to be rated not
                 just in Petaflops, but also in ``GigaTEPS'' (billions
                 of traversed edges per second), and the Graph500
                 benchmark has been established for this purpose. Graph
                 traversal on single nodes has been well studied and
                 optimized on modern CPU architectures. However, current
                 cluster implementations suffer from high latency data
                 communication with large volumes of transfers across
                 nodes, leading to inefficiency in performance and
                 energy consumption. In this work, we show that we can
                 overcome these constraints using a combination of
                 efficient low-overhead data compression techniques to
                 reduce transfer volumes along with latency-hiding
                 techniques. Using an optimized single node graph
                 traversal algorithm [1], our novel cluster
                 optimizations result in over 6.6X performance
                 improvements over state-of-the-art data transfer
                 techniques, and almost an order of magnitude in energy
                 savings. Our resulting implementation of the Graph500
                 benchmark achieves 115 GigaTEPS on a 320-node/5120 core
                 Intel\reg{} Endeavor cluster with E5-2700 Sandybridge
                 nodes, which matches the second ranked result in the
                 most recent November 2011 Graph500 list [2] with about
                 5.6X fewer nodes. Our cluster optimizations only have a
                 1.8X overhead in overall performance from the
                 performance of the optimized single-node
                 implementation, and allows for near-linear scaling with
                 number of nodes. Our algorithm on 1024 nodes on an
                 Intel\reg{} Xeon\reg{} X5670 Westmere processor (with
                 lower per-node performance) for a large multi-Terabyte
                 graph attained 195 GigaTEPS in performance, proving the
                 high scalability of our algorithm. Our per-node
                 performance is the highest in the top 10 of the Nov
                 2011 Graph500 list.",
  acknowledgement = ack-nhfb,
  articleno =    "14",

  author =       "John M. Levesque and Ramanan Sankaran and Ray Grout",
  title =        "Hybridizing {S3D} into an exascale application using
                 {OpenACC}: an approach for moving to multi-petaflops
                 and beyond",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "15:1--15:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Hybridization is the process of converting an
                 application with a single level of parallelism to an
                 application with multiple levels of parallelism. Over
                 the past 15 years a majority of the applications that
                 run on High Performance Computing systems have employed
                 MPI for all of the parallelism within the application.
                 In the Peta-Exascale computing regime, effective
                 utilization of the hardware requires multiple levels of
                 parallelism matched to the macro architecture of the
                 system to achieve good performance. A hybridized code
                 base is performance portable when sufficient
                 parallelism is expressed in an architecture agnostic
                 form to achieve good performance on a range of
                 available systems. The hybridized S3D code is
                 performance portable across today's leading many core
                 and GPU accelerated systems. The OpenACC framework
                 allows a unified code base to be deployed for either
                 (Manycore CPU or Manycore CPU+GPU) while permitting
                 architecture specific optimizations to expose new
                 dimensions of parallelism to be utilized.",
  acknowledgement = ack-nhfb,
  articleno =    "15",

  author =       "Babak Hejazialhosseini and Diego Rossinelli and
                 Christian Conti and Petros Koumoutsakos",
  title =        "High throughput software for direct numerical
                 simulations of compressible two-phase flows",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "16:1--16:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "We present an open source, object-oriented software
                 for high throughput Direct Numerical Simulations of
                 compressible, two-phase flows. The Navier--Stokes
                 equations are discretized on uniform grids using high
                 order finite volume methods. The software exploits
                 recent CPU micro-architectures by explicit
                 vectorization and adopts NUMA-aware techniques as well
                 as data and computation reordering. We report a
                 compressible flow solver with unprecedented fractions
                 of peak performance: 45\% of the peak for a single node
                 (nominal performance of 840 GFLOP/s) and 30\% for a
                 cluster of 47'000 cores (nominal performance of 0.8
                 PFLOP/s). We suggest that the present work may serve as
                 a performance upper bound, regarding achievable
                 GFLOP/s, for two-phase flow solvers using adaptive mesh
                 refinement. The software enables 3D simulations of
                 shock-bubble interaction including, for the first time,
                 effects of diffusion and surface tension, by
                 efficiently employing two hundred billion computational
                 elements.",
  acknowledgement = ack-nhfb,
  articleno =    "16",

  author =       "Tanzima Zerin Islam and Kathryn Mohror and Saurabh
                 Bagchi and Adam Moody and Bronis R. de Supinski and
                 Rudolf Eigenmann",
  title =        "{McrEngine}: a scalable checkpointing system using
                 data-aware aggregation and compression",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "17:1--17:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "High performance computing (HPC) systems use
                 checkpoint-restart to tolerate failures. Typically,
                 applications store their states in checkpoints on a
                 parallel file system (PFS). As applications scale up,
                 checkpoint-restart incurs high overheads due to
                 contention for PFS resources. The high overheads force
                 large-scale applications to reduce checkpoint
                 frequency, which means more compute time is lost in the
                 event of failure. We alleviate this problem through a
                 scalable checkpoint-restart system, mcrEngine.
                 mcrEngine aggregates checkpoints from multiple
                 application processes with knowledge of the data
                 semantics available through widely-used I/O libraries,
                 e.g., HDF5 and netCDF, and compresses them. Our novel
                 scheme improves compressibility of checkpoints up to
                 115\% over simple concatenation and compression. Our
                 evaluation with large-scale application checkpoints
                 show that mcrEngine reduces checkpointing overhead by
                 up to 87\% and restart overhead by up to 62\% over a
                 baseline with no aggregation or compression.",
  acknowledgement = ack-nhfb,
  articleno =    "17",

  author =       "Rolf Riesen and Kurt Ferreira and Dilma {Da Silva} and
                 Pierre Lemarinier and Dorian Arnold and Patrick G.
                 Bridges",
  title =        "Alleviating scalability issues of checkpointing
                 protocols",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "18:1--18:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Current fault tolerance protocols are not sufficiently
                 scalable for the exascale era. The most-widely used
                 method, coordinated checkpointing, places enormous
                 demands on the I/O subsystem and imposes frequent
                 synchronizations. Uncoordinated protocols use message
                 logging which introduces message rate limitations or
                 undesired memory and storage requirements to hold
                 payload and event logs. In this paper we propose a
                 combination of several techniques, namely coordinated
                 checkpointing, optimistic message logging, and a
                 protocol that glues them together. This combination
                 eliminates some of the drawbacks of each individual
                 approach and proves to be an alternative for many types
                 of exascale applications. We evaluate performance and
                 scaling characteristics of this combination using
                 simulation and a partial implementation. While not a
                 universal solution, the combined protocol is suitable
                 for a large range of existing and future applications
                 that use coordinated checkpointing and enhances their
                 scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "18",

  author =       "Kento Sato and Naoya Maruyama and Kathryn Mohror and
                 Adam Moody and Todd Gamblin and Bronis R. de Supinski
                 and Satoshi Matsuoka",
  title =        "Design and modeling of a non-blocking checkpointing
                 system",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "19:1--19:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "As the capability and component count of systems
                 increase, the MTBF decreases. Typically, applications
                 tolerate failures with checkpoint/restart to a parallel
                 file system (PFS). While simple, this approach can
                 suffer from contention for PFS resources. Multi-level
                 checkpointing is a promising solution. However, while
                 multi-level checkpointing is successful on today's
                 machines, it is not expected to be sufficient for
                 exascale class machines, which are predicted to have
                 orders of magnitude larger memory sizes and failure
                 rates. Our solution combines the benefits of
                 non-blocking and multi-level checkpointing. In this
                 paper, we present the design of our system and model
                 its performance. Our experiments show that our system
                 can improve efficiency by 1.1 to 2.0x on future
                 machines. Additionally, applications using our
                 checkpointing system can achieve high efficiency even
                 when using a PFS with lower bandwidth.",
  acknowledgement = ack-nhfb,
  articleno =    "19",

  author =       "Thanasis G. Papaioannou and Nicolas Bonvin and Karl
                 Aberer",
  title =        "{Scalia}: an adaptive scheme for efficient multi-cloud
                 storage",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "20:1--20:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "A growing amount of data is produced daily resulting
                 in a growing demand for storage solutions. While cloud
                 storage providers offer a virtually infinite storage
                 capacity, data owners seek geographical and provider
                 diversity in data placement, in order to avoid vendor
                 lock-in and to increase availability and durability.
                 Moreover, depending on the customer data access
                 pattern, a certain cloud provider may be cheaper than
                 another. In this paper, we introduce Scalia, a cloud
                 storage brokerage solution that continuously adapts the
                 placement of data based on its access pattern and
                 subject to optimization objectives, such as storage
                 costs. Scalia efficiently considers repositioning of
                 only selected objects that may significantly lower the
                 storage cost. By extensive simulation experiments, we
                 prove the cost-effectiveness of Scalia against static
                 placements and its proximity to the ideal data
                 placement in various scenarios of data access patterns,
                 of available cloud storage solutions and of failures.",
  acknowledgement = ack-nhfb,
  articleno =    "20",

  author =       "Sheng Di and Derrick Kondo and Walfredo Cirne",
  title =        "Host load prediction in a {Google} compute cloud with
                 a {Bayesian} model",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "21:1--21:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Prediction of host load in Cloud systems is critical
                 for achieving service-level agreements. However,
                 accurate prediction of host load in Clouds is extremely
                 challenging because it fluctuates drastically at small
                 timescales. We design a prediction method based on
                 Bayes model to predict the mean load over a long-term
                 time interval, as well as the mean load in consecutive
                 future time intervals. We identify novel predictive
                 features of host load that capture the expectation,
                 predictability, trends and patterns of host load. We
                 also determine the most effective combinations of these
                 features for prediction. We evaluate our method using a
                 detailed one-month trace of a Google data center with
                 thousands of machines. Experiments show that the Bayes
                 method achieves high accuracy with a mean squared error
                 of 0.0014. Moreover, the Bayes method improves the load
                 prediction accuracy by 5.6-50\% compared to other
                 state-of-the-art methods based on moving averages,
                 auto-regression, and/or noise filters.",
  acknowledgement = ack-nhfb,
  articleno =    "21",

  author =       "Maciej Malawski and Gideon Juve and Ewa Deelman and
                 Jarek Nabrzyski",
  title =        "Cost- and deadline-constrained provisioning for
                 scientific workflow ensembles in {IaaS} clouds",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "22:1--22:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Large-scale applications expressed as scientific
                 workflows are often grouped into ensembles of
                 inter-related workflows. In this paper, we address a
                 new and important problem concerning the efficient
                 management of such ensembles under budget and deadline
                 constraints on Infrastructure- as-a-Service (IaaS)
                 clouds. We discuss, develop, and assess algorithms
                 based on static and dynamic strategies for both task
                 scheduling and resource provisioning. We perform the
                 evaluation via simulation using a set of scientific
                 workflow ensembles with a broad range of budget and
                 deadline parameters, taking into account uncertainties
                 in task runtime estimations, provisioning delays, and
                 failures. We find that the key factor determining the
                 performance of an algorithm is its ability to decide
                 which workflows in an ensemble to admit or reject for
                 execution. Our results show that an admission procedure
                 based on workflow structure and estimates of task
                 runtimes can significantly improve the quality of
                 solutions.",
  acknowledgement = ack-nhfb,
  articleno =    "22",

  author =       "Seyong Lee and Jeffrey S. Vetter",
  title =        "Early evaluation of directive-based {GPU} programming
                 models for productive exascale computing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "23:1--23:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Graphics Processing Unit (GPU)-based parallel computer
                 architectures have shown increased popularity as a
                 building block for high performance computing, and
                 possibly for future Exascale computing. However, their
                 programming complexity remains as a major hurdle for
                 their widespread adoption. To provide better
                 abstractions for programming GPU architectures,
                 researchers and vendors have proposed several
                 directive-based GPU programming models. These
                 directive-based models provide different levels of
                 abstraction, and required different levels of
                 programming effort to port and optimize applications.
                 Understanding these differences among these new models
                 provides valuable insights on their applicability and
                 performance potential. In this paper, we evaluate
                 existing directive-based models by porting thirteen
                 application kernels from various scientific domains to
                 use CUDA GPUs, which, in turn, allows us to identify
                 important issues in the functionality, scalability,
                 tunability, and debuggability of the existing models.
                 Our evaluation shows that directive-based models can
                 achieve reasonable performance, compared to
                 hand-written GPU codes.",
  acknowledgement = ack-nhfb,
  articleno =    "23",

  author =       "Jacques A. Pienaar and Srimat Chakradhar and Anand
                 Raghunathan",
  title =        "Automatic generation of software pipelines for
                 heterogeneous parallel systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "24:1--24:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Pipelining is a well-known approach to increasing
                 parallelism and performance. We address the problem of
                 software pipelining for heterogeneous parallel
                 platforms that consist of different multi-core and
                 many-core processing units. In this context, pipelining
                 involves two key steps---partitioning an application
                 into stages and mapping and scheduling the stages onto
                 the processing units of the heterogeneous platform. We
                 show that the inter-dependency between these steps is a
                 critical challenge that must be addressed in order to
                 achieve high performance. We propose an Automatic
                 Heterogeneous Pipelining framework (ahp) that generates
                 an optimized pipelined implementation of a program from
                 an annotated unpipelined specification. Across three
                 complex applications (image classification, object
                 detection, and document retrieval) and two
                 heterogeneous platforms (Intel Xeon multi-core CPUs
                 with Intel MIC and NVIDIA GPGPU accelerators), ahp
                 achieves a throughput improvement of up to 1.53x (1.37x
                 on average) over a heterogeneous baseline that exploits
                 data and task parallelism.",
  acknowledgement = ack-nhfb,
  articleno =    "24",

  author =       "Linchuan Chen and Xin Huo and Gagan Agrawal",
  title =        "Accelerating {MapReduce} on a coupled {CPU--GPU}
                 architecture",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "25:1--25:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "The work presented here is driven by two observations.
                 First, heterogeneous architectures that integrate a CPU
                 and a GPU on the same chip are emerging, and hold much
                 promise for supporting power-efficient and scalable
                 high performance computing. Second, MapReduce has
                 emerged as a suitable framework for simplified parallel
                 application development for many classes of
                 applications, including data mining and machine
                 learning applications that benefit from accelerators.
                 This paper focuses on the challenge of scaling a
                 MapReduce application using the CPU and GPU together in
                 an integrated architecture. We develop different
                 methods for dividing the work, which are the
                 map-dividing scheme, where map tasks are divided
                 between both devices, and the pipelining scheme, which
                 pipelines the map and the reduce stages on different
                 devices. We develop dynamic work distribution schemes
                 for both the approaches. To achieve high load balance
                 while keeping scheduling costs low, we use a runtime
                 tuning method to adjust task block sizes for the
                 map-dividing scheme. Our implementation of MapReduce is
                 based on a continuous reduction method, which avoids
                 the memory overheads of storing key-value pairs. We
                 have evaluated the different design decisions using 5
                 popular MapReduce applications. For 4 of the
                 applications, our system achieves 1.21 to 2.1 speedup
                 over the better of the CPU-only and GPU-only versions.
                 The speedups over a single CPU core execution range
                 from 3.25 to 28.68. The runtime tuning method we have
                 developed achieves very low load imbalance, while
                 keeping scheduling overheads low. Though our current
                 work is specific to MapReduce, many underlying ideas
                 are also applicable towards intra-node acceleration of
                 other applications on integrated CPU-GPU nodes.",
  acknowledgement = ack-nhfb,
  articleno =    "25",

  author =       "Francisco D. Igual and Murtaza Ali and Arnon Friedmann
                 and Eric Stotzer and Timothy Wentz and Robert A. van de
                 Geijn",
  title =        "Unleashing the high performance and low power of
                 multi-core {DSPs} for general-purpose {HPC}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "26:1--26:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Take a multicore Digital Signal Processor (DSP) chip
                 designed for cellular base stations and radio network
                 controllers, add floating-point capabilities to support
                 4G networks, and out of thin air a HPC engine is born.
                 The potential for HPC is clear: It promises 128 GFLOPS
                 (single precision) for 10 Watts; It is used in millions
                 of network related devices and hence benefits from
                 economies of scale; It should be simpler to program
                 than a GPU. Simply put, it is fast, green, and cheap.
                 But is it easy to use? In this paper, we show how this
                 potential can be applied to general-purpose high
                 performance computing, more specifically to dense
                 matrix computations, without major changes in existing
                 codes and methodologies, and with excellent performance
                 and power consumption numbers.",
  acknowledgement = ack-nhfb,
  articleno =    "26",

  author =       "Li-Wen Chang and John A. Stratton and Hee-Seok Kim and
                 Wen-Mei W. Hwu",
  title =        "A scalable, numerically stable, high-performance
                 tridiagonal solver using {GPUs}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "27:1--27:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "In this paper, we present a scalable, numerically
                 stable, high-performance tridiagonal solver. The solver
                 is based on the SPIKE algorithm for partitioning a
                 large matrix into small independent matrices, which can
                 be solved in parallel. For each small matrix, our
                 solver applies a general 1-by-1 or 2-by-2 diagonal
                 pivoting algorithm, which is also known to be
                 numerically stable. Our paper makes two major
                 contributions. First, our solver is the first
                 numerically stable tridiagonal solver for GPUs. Our
                 solver provides comparable quality of stable solutions
                 to Intel MKL and Matlab, at speed comparable to the GPU
                 tridiagonal solvers in existing packages like CUSPARSE.
                 It is also scalable to multiple GPUs and CPUs. Second,
                 we present and analyze two key optimization strategies
                 for our solver: a high-throughput data layout
                 transformation for memory efficiency, and a dynamic
                 tiling approach for reducing the memory access
                 footprint caused by branch divergence.",
  acknowledgement = ack-nhfb,
  articleno =    "27",

  author =       "Jongsoo Park and Ping Tak Peter Tang and Mikhail
                 Smelyanskiy and Daehyun Kim and Thomas Benson",
  title =        "Efficient backprojection-based synthetic aperture
                 radar computation with many-core processors",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "28:1--28:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Tackling computationally challenging problems with
                 high efficiency often requires the combination of
                 algorithmic innovation, advanced architecture, and
                 thorough exploitation of parallelism. We demonstrate
                 this synergy through synthetic aperture radar (SAR) via
                 backprojection, an image reconstruction method that can
                 require hundreds of TFLOPS. Computation cost is
                 significantly reduced by our new algorithm of
                 approximate strength reduction; data movement cost is
                 economized by software locality optimizations
                 facilitated by advanced architecture support;
                 parallelism is fully harnessed in various patterns and
                 granularities. We deliver over 35 billion
                 backprojections per second throughput per compute node
                 on an Intel\reg{} Xeon\reg{} E5-2670-based cluster,
                 equipped with Intel\reg{} Xeon Phi\TM{} coprocessors. This
                 corresponds to processing a 3K x 3K image within a
                 second using a single node. Our study can be extended
                 to other settings: backprojection is applicable
                 elsewhere including medical imaging, approximate
                 strength reduction is a general code transformation
                 technique, and many-core processors are emerging as a
                 solution to energy-efficient computing.",
  acknowledgement = ack-nhfb,
  articleno =    "28",

  author =       "Peng Li and Guodong Li and Ganesh Gopalakrishnan",
  title =        "Parametric flows: automated behavior equivalencing for
                 symbolic analysis of races in {CUDA} programs",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "29:1--29:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "The growing scale of concurrency requires automated
                 abstraction techniques to cut down the effort in
                 concurrent system analysis. In this paper, we show that
                 the high degree of behavioral symmetry present in GPU
                 programs allows CUDA race detection to be dramatically
                 simplified through abstraction. Our abstraction
                 techniques is one of automatically creating parametric
                 flows ---control-flow equivalence classes of threads
                 that diverge in the same manner---and checking for data
                 races only across a pair of threads per parametric
                 flow. We have implemented this approach as an extension
                 of our recently proposed GKLEE symbolic analysis
                 framework and show that all our previous results are
                 dramatically improved in that (i) the parametric
                 flow-based analysis takes far less time, and (ii)
                 because of the much higher scalability of the analysis,
                 we can detect even more data race situations that were
                 previously missed by GKLEE because it was forced to
                 downscale examples to limit analysis complexity.
                 Moreover, the parametric flow-based analysis is
                 applicable to other programs with SPMD models.",
  acknowledgement = ack-nhfb,
  articleno =    "29",

  author =       "Tobias Hilbrich and Joachim Protze and Martin Schulz
                 and Bronis R. de Supinski and Matthias S. M{\"u}ller",
  title =        "{MPI} runtime error detection with {MUST}: advances in
                 deadlock detection",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "30:1--30:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "The widely used Message Passing Interface (MPI) is
                 complex and rich. As a result, application developers
                 require automated tools to avoid and to detect MPI
                 programming errors. We present the Marmot Umpire
                 Scalable Tool (MUST) that detects such errors with
                 significantly increased scalability. We present
                 improvements to our graph-based deadlock detection
                 approach for MPI, which cover future MPI extensions.
                 Our enhancements also check complex MPI constructs that
                 no previous graph-based detection approach handled
                 correctly. Finally, we present optimizations for the
                 processing of MPI operations that reduce runtime
                 deadlock detection overheads. Existing approaches often
                 require O ( p ) analysis time per MPI operation, for p
                 processes. We empirically observe that our improvements
                 lead to sub-linear or better analysis time per
                 operation for a wide range of real world
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "30",

  author =       "Abhinav Bhatele and Todd Gamblin and Katherine E.
                 Isaacs and Brian T. N. Gunney and Martin Schulz and
                 Peer-Timo Bremer and Bernd Hamann",
  title =        "Novel views of performance data to analyze large-scale
                 adaptive applications",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "31:1--31:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Performance analysis of parallel scientific codes is
                 becoming increasingly difficult due to the rapidly
                 growing complexity of applications and architectures.
                 Existing tools fall short in providing intuitive views
                 that facilitate the process of performance debugging
                 and tuning. In this paper, we extend recent ideas of
                 projecting and visualizing performance data for faster,
                 more intuitive analysis of applications. We collect
                 detailed per-level and per-phase measurements for a
                 dynamically load-balanced, structured AMR library and
                 project per-core data collected in the hardware domain
                 on to the application's communication topology. We show
                 how our projections and visualizations lead to a rapid
                 diagnosis of and mitigation strategy for a previously
                 elusive scaling bottleneck in the library that is hard
                 to detect using conventional tools. Our new insights
                 have resulted in a 22\% performance improvement for a
                 65,536-core run of the AMR library on an IBM Blue
                 Gene/P system.",
  acknowledgement = ack-nhfb,
  articleno =    "31",

  author =       "Donghong Wu and Bingsheng He and Xueyan Tang and
                 Jianliang Xu and Minyi Guo",
  title =        "{RAMZzz}: rank-aware {DRAM} power management with
                 dynamic migrations and demotions",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "32:1--32:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Main memory is a significant energy consumer which may
                 contribute to over 40\% of the total system power, and
                 will become more significant for server machines with
                 more main memory. In this paper, we propose a novel
                 memory system design named RAMZzz with rank-aware
                 energy saving optimizations. Specifically, we rely on a
                 memory controller to monitor the memory access
                 locality, and group the pages with similar access
                 locality into the same rank. We further develop dynamic
                 page migrations to adapt to data access patterns, and a
                 prediction model to estimate the demotion time for
                 accurate control on power state transitions. We
                 experimentally compare our algorithm with other energy
                 saving policies with cycle-accurate simulation.
                 Experiments with benchmark workloads show that RAMZzz
                 achieves significant improvement on energy-delay and
                  energy consumption over other power saving
                  techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "32",
}

@InProceedings{Li:2012:MAG,
  author =       "Sheng Li and Doe Hyun Yoon and Ke Chen and Jishen Zhao
                 and Jung Ho Ahn and Jay B. Brockman and Yuan Xie and
                 Norman P. Jouppi",
  title =        "{MAGE}: adaptive granularity and {ECC} for resilient
                 and power efficient memory systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "33:1--33:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Resiliency is one of the toughest challenges in
                 high-performance computing, and memory accounts for a
                 significant fraction of errors. Providing strong error
                 tolerance in memory usually requires a wide memory
                 channel that incurs a large access granularity (hence,
                 a large cache line). Unfortunately, applications with
                 limited spatial locality waste memory power and
                 bandwidth on systems with a large access granularity.
                 Thus, careful design considerations must be made to
                 balance memory system performance, power efficiency,
                 and resiliency. In this paper, we propose MAGE, a
                 Memory system with Adaptive Granularity and ECC, to
                 achieve high performance, power efficiency, and
                 resiliency. MAGE can adapt memory access granularities
                 and ECC schemes to applications with different memory
                 behaviors. Our experiments show that MAGE achieves more
                 than a 28\% energy-delay product improvement, compared
                 to the best existing systems with static granularity
                 and ECC.",
  acknowledgement = ack-nhfb,
  articleno =    "33",
}

@InProceedings{Ren:2012:PWD,
  author =       "Yufei Ren and Tan Li and Dantong Yu and Shudong Jin
                  and Thomas Robertazzi and Brian L. Tierney and Eric
                  Pouyoul",
  title =        "Protocols for wide-area data-intensive applications:
                 design and performance issues",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "34:1--34:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Providing high-speed data transfer is vital to various
                 data-intensive applications. While there have been
                 remarkable technology advances to provide
                 ultra-high-speed network bandwidth, existing protocols
                 and applications may not be able to fully utilize the
                 bare-metal bandwidth due to their inefficient design.
                 We identify the same problem remains in the field of
                 Remote Direct Memory Access (RDMA) networks. RDMA
                 offloads TCP/IP protocols to hardware devices. However,
                 its benefits have not been fully exploited due to the
                 lack of efficient software and application protocols,
                 in particular in wide-area networks. In this paper, we
                 address the design choices to develop such protocols.
                 We describe a protocol implemented as part of a
                 communication middleware. The protocol has its flow
                 control, connection management, and task
                 synchronization. It maximizes the parallelism of RDMA
                 operations. We demonstrate its performance benefit on
                 various local and wide-area testbeds, including the DOE
                 ANI testbed with RoCE links and InfiniBand links.",
  acknowledgement = ack-nhfb,
  articleno =    "34",
}

@InProceedings{Islam:2012:HPR,
  author =       "N. S. Islam and M. W. Rahman and J. Jose and R.
                 Rajachandrasekar and H. Wang and H. Subramoni and C.
                 Murthy and D. K. Panda",
  title =        "High performance {RDMA}-based design of {HDFS} over
                  {InfiniBand}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "35:1--35:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Hadoop Distributed File System (HDFS) acts as the
                 primary storage of Hadoop and has been adopted by
                 reputed organizations (Facebook, Yahoo! etc.) due to
                 its portability and fault-tolerance. The existing
                 implementation of HDFS uses Java-socket interface for
                 communication which delivers suboptimal performance in
                 terms of latency and throughput. For data-intensive
                 applications, network performance becomes key component
                 as the amount of data being stored and replicated to
                 HDFS increases. In this paper, we present a novel
                 design of HDFS using Remote Direct Memory Access (RDMA)
                 over InfiniBand via JNI interfaces. Experimental
                 results show that, for 5GB HDFS file writes, the new
                 design reduces the communication time by 87\% and 30\%
                 over 1Gigabit Ethernet (1GigE) and IP-over-InfiniBand
                 (IPoIB), respectively, on QDR platform (32Gbps). For
                 HBase, the Put operation performance is improved by
                 26\% with our design. To the best of our knowledge,
                  this is the first design of HDFS over InfiniBand
                  networks.",
  acknowledgement = ack-nhfb,
  articleno =    "35",
}

@InProceedings{Dichev:2012:ERN,
  author =       "Kiril Dichev and Fergal Reid and Alexey Lastovetsky",
  title =        "Efficient and reliable network tomography in
                 heterogeneous networks using {BitTorrent} broadcasts
                 and clustering algorithms",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "36:1--36:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "In the area of network performance and discovery,
                 network tomography focuses on reconstructing network
                 properties using only end-to-end measurements at the
                 application layer. One challenging problem in network
                 tomography is reconstructing available bandwidth along
                 all links during multiple source/multiple destination
                 transmissions. The traditional measurement procedures
                 used for bandwidth tomography are extremely time
                 consuming. We propose a novel solution to this problem.
                 Our method counts the fragments exchanged during a
                 BitTorrent broadcast. While this measurement has a high
                 level of randomness, it can be obtained very
                 efficiently, and aggregated into a reliable metric.
                 This data is then analyzed with state-of-the-art
                 algorithms, which correctly reconstruct logical
                 clusters of nodes interconnected by high bandwidth, as
                 well as bottlenecks between these logical clusters. Our
                 experiments demonstrate that the proposed two-phase
                 approach efficiently solves the presented problem for a
                 number of settings on a complex grid infrastructure.",
  acknowledgement = ack-nhfb,
  articleno =    "36",
}

@InProceedings{Malakar:2012:DCS,
  author =       "Preeti Malakar and Thomas George and Sameer Kumar and
                 Rashmi Mittal and Vijay Natarajan and Yogish Sabharwal
                 and Vaibhav Saxena and Sathish S. Vadhiyar",
  title =        "A divide and conquer strategy for scaling weather
                 simulations with multiple regions of interest",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "37:1--37:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Accurate and timely prediction of weather phenomena,
                 such as hurricanes and flash floods, require
                 high-fidelity compute intensive simulations of multiple
                 finer regions of interest within a coarse simulation
                 domain. Current weather applications execute these
                 nested simulations sequentially using all the available
                 processors, which is sub-optimal due to their
                 sub-linear scalability. In this work, we present a
                 strategy for parallel execution of multiple nested
                 domain simulations based on partitioning the 2-D
                 processor grid into disjoint rectangular regions
                 associated with each domain. We propose a novel
                 combination of performance prediction, processor
                 allocation methods and topology-aware mapping of the
                 regions on torus interconnects. Experiments on IBM Blue
                 Gene systems using WRF show that the proposed
                 strategies result in performance improvement of up to
                 33\% with topology-oblivious mapping and up to
                 additional 7\% with topology-aware mapping over the
                 default sequential strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "37",
}

@InProceedings{Rietmann:2012:FAS,
  author =       "Max Rietmann and Peter Messmer and Tarje Nissen-Meyer
                 and Daniel Peter and Piero Basini and Dimitri
                 Komatitsch and Olaf Schenk and Jeroen Tromp and Lapo
                 Boschi and Domenico Giardini",
  title =        "Forward and adjoint simulations of seismic wave
                  propagation on emerging large-scale {GPU}
                  architectures",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "38:1--38:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Computational seismology is an area of wide
                 sociological and economic impact, ranging from
                 earthquake risk assessment to subsurface imaging and
                 oil and gas exploration. At the core of these
                 simulations is the modeling of wave propagation in a
                 complex medium. Here we report on the extension of the
                 high-order finite-element seismic wave simulation
                 package SPECFEM3D to support the largest scale hybrid
                 and homogeneous supercomputers. Starting from an
                 existing highly tuned MPI code, we migrated to a CUDA
                 version. In order to be of immediate impact to the
                 science mission of computational seismologists, we had
                 to port the entire production package, rather than just
                 individual kernels. One of the challenges in
                 parallelizing finite element codes is the potential for
                 race conditions during the assembly phase. We therefore
                 investigated different methods such as mesh coloring or
                 atomic updates on the GPU. In order to achieve strong
                 scaling, we needed to ensure good overlap of data
                 motion at all levels, including internode and
                 host-accelerator transfers. Finally we carefully tuned
                 the GPU implementation. The new MPI/CUDA solver
                 exhibits excellent scalability and achieves speedup on
                 a node-to-node basis over the carefully tuned
                 equivalent multi-core MPI solver. To demonstrate the
                 performance of both the forward and adjoint
                 functionality, we present two case studies run on the
                 Cray XE6 CPU and Cray XK6 GPU architectures up to 896
                 nodes: (1) focusing on most commonly used forward
                 simulations, we simulate seismic wave propagation
                 generated by earthquakes in Turkey, and (2) testing the
                 most complex seismic inversion type of the package, we
                 use ambient seismic noise to image 3-D crust and mantle
                 structure beneath western Europe.",
  acknowledgement = ack-nhfb,
  articleno =    "38",
}

@InProceedings{Nguyen:2012:BTM,
  author =       "Tan Nguyen and Pietro Cicotti and Eric Bylaska and Dan
                 Quinlan and Scott B. Baden",
  title =        "{Bamboo}: translating {MPI} applications to a
                 latency-tolerant, data-driven form",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "39:1--39:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "We present Bamboo, a custom source-to-source
                 translator that transforms MPI C source into a
                 data-driven form that automatically overlaps
                 communication with available computation. Running on up
                 to 98304 processors of NERSC's Hopper system, we
                 observe that Bamboo's overlap capability speeds up MPI
                 implementations of a 3D Jacobi iterative solver and
                 Cannon's matrix multiplication. Bamboo's generated code
                 meets or exceeds the performance of hand optimized MPI,
                 which includes split-phase coding, the method
                 classically employed to hide communication. We achieved
                 our results with only modest amounts of programmer
                 annotation and no intrusive reprogramming of the
                 original application source.",
  acknowledgement = ack-nhfb,
  articleno =    "39",
}

@InProceedings{Bandishti:2012:TSC,
  author =       "Vinayaka Bandishti and Irshad Pananilath and Uday
                  Bondhugula",
  title =        "Tiling stencil computations to maximize parallelism",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "40:1--40:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Most stencil computations allow tile-wise concurrent
                 start, i.e., there always exists a face of the
                 iteration space and a set of tiling hyperplanes such
                 that all tiles along that face can be started
                 concurrently. This provides load balance and maximizes
                 parallelism. However, existing automatic tiling
                 frameworks often choose hyperplanes that lead to
                 pipelined start-up and load imbalance. We address this
                 issue with a new tiling technique that ensures
                 concurrent start-up as well as perfect load-balance
                 whenever possible. We first provide necessary and
                 sufficient conditions on tiling hyperplanes to enable
                 concurrent start for programs with affine data
                 accesses. We then provide an approach to find such
                 hyperplanes. Experimental evaluation on a 12-core Intel
                 Westmere shows that our code is able to outperform a
                 tuned domain-specific stencil code generator by 4\% to
                 27\%, and previous compiler techniques by a factor of
                 2x to 10.14 x.",
  acknowledgement = ack-nhfb,
  articleno =    "40",
}

@InProceedings{Ding:2012:CFL,
  author =       "Wei Ding and Yuanrui Zhang and Mahmut Kandemir and
                 Seung Woo Son",
  title =        "Compiler-directed file layout optimization for
                 hierarchical storage systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "41:1--41:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "File layout of array data is a critical factor that
                 effects the behavior of storage caches, and has so far
                 taken not much attention in the context of hierarchical
                 storage systems. The main contribution of this paper is
                 a compiler-driven file layout optimization scheme for
                 hierarchical storage caches. This approach, fully
                 automated within an optimizing compiler, analyzes a
                 multi-threaded application code and determines a file
                 layout for each disk-resident array referenced by the
                 code, such that the performance of the target storage
                 cache hierarchy is maximized. We tested our approach
                 using 16 I/O intensive application programs and
                 compared its performance against two previously
                 proposed approaches under different cache space
                 management schemes. Our experimental results show that
                 the proposed approach improves the execution time of
                 these parallel applications by 23.7\% on average.",
  acknowledgement = ack-nhfb,
  articleno =    "41",
}

@InProceedings{Tang:2012:FLF,
  author =       "Ping Tak Peter Tang and Jongsoo Park and Daehyun Kim
                 and Vladimir Petrov",
  title =        "A framework for low-communication {$1$-D} {FFT}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "42:1--42:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "In high-performance computing on distributed-memory
                 systems, communication often represents a significant
                 part of the overall execution time. The relative cost
                 of communication will certainly continue to rise as
                 compute-density growth follows the current technology
                 and industry trends. Design of lower-communication
                 alternatives to fundamental computational algorithms
                 has become an important field of research. For
                 distributed 1-D FFT, communication cost has hitherto
                 remained high as all industry-standard implementations
                 perform three all-to-all internode data exchanges (also
                 called global transposes). These communication steps
                 indeed dominate execution time. In this paper, we
                 present a mathematical framework from which many
                 single-all-to-all and easy-to-implement 1-D FFT
                 algorithms can be derived. For large-scale problems,
                 our implementation can be twice as fast as leading FFT
                 libraries on state-of-the-art computer clusters.
                 Moreover, our framework allows tradeoff between
                 accuracy and performance, further boosting performance
                 if reduced accuracy is acceptable.",
  acknowledgement = ack-nhfb,
  articleno =    "42",
}

@InProceedings{Sundar:2012:PGM,
  author =       "Hari Sundar and George Biros and Carsten Burstedde and
                 Johann Rudi and Omar Ghattas and Georg Stadler",
  title =        "Parallel geometric-algebraic multigrid on unstructured
                 forests of octrees",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "43:1--43:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "We present a parallel multigrid method for solving
                 variable-coefficient elliptic partial differential
                 equations on arbitrary geometries using highly adapted
                 meshes. Our method is designed for meshes that are
                 built from an unstructured hexahedral macro mesh, in
                 which each macro element is adaptively refined as an
                 octree. This forest-of-octrees approach enables us to
                 generate meshes for complex geometries with arbitrary
                 levels of local refinement. We use geometric multigrid
                 (GMG) for each of the octrees and algebraic multigrid
                 (AMG) as the coarse grid solver. We designed our GMG
                 sweeps to entirely avoid collectives, thus minimizing
                 communication cost. We present weak and strong scaling
                 results for the 3D variable-coefficient Poisson problem
                 that demonstrate high parallel scalability. As a
                 highlight, the largest problem we solve is on a
                 non-uniform mesh with 100 billion unknowns on 262,144
                 cores of NCCS's Cray XK6 ``Jaguar''; in this solve we
                 sustain 272 TFlops/s.",
  acknowledgement = ack-nhfb,
  articleno =    "43",
}

@InProceedings{Nukada:2012:SMF,
  author =       "Akira Nukada and Kento Sato and Satoshi Matsuoka",
  title =        "Scalable multi-{GPU} {$3$-D} {FFT} for {TSUBAME 2.0}
                  supercomputer",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "44:1--44:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "For scalable 3-D FFT computation using multiple GPUs,
                 efficient all-to-all communication between GPUs is the
                 most important factor in good performance.
                 Implementations with point-to-point MPI library
                 functions and CUDA memory copy APIs typically exhibit
                 very large overheads especially for small message sizes
                 in all-to-all communications between many nodes. We
                 propose several schemes to minimize the overheads,
                 including employment of lower-level API of InfiniBand
                 to effectively overlap intra- and inter-node
                 communication, as well as auto-tuning strategies to
                 control scheduling and determine rail assignments. As a
                 result we achieve very good strong scalability as well
                 as good performance, up to 4.8TFLOPS using 256 nodes of
                  TSUBAME 2.0 Supercomputer (768 GPUs) in double
                  precision.",
  acknowledgement = ack-nhfb,
  articleno =    "44",
}

@InProceedings{Doi:2012:PLQ,
  author =       "Jun Doi",
  title =        "Peta-scale lattice quantum chromodynamics on a {Blue
                 Gene/Q} supercomputer",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "45:1--45:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Lattice Quantum Chromodynamics (QCD) is one of the
                 most challenging applications running on massively
                 parallel supercomputers. To reproduce these physical
                 phenomena on a supercomputer, a precise simulation is
                 demanded requiring well optimized and scalable code. We
                 have optimized lattice QCD programs on Blue Gene family
                 supercomputers and shown the strength in lattice QCD
                 simulation. Here we optimized on the third generation
                 Blue Gene/Q supercomputer; (i) by changing the data
                 layout, (ii) by exploiting new SIMD instruction sets,
                 and (iii) by pipelining boundary data exchange to
                 overlap communication and calculation. The optimized
                 lattice QCD program shows excellent weak scalability on
                 the large scale Blue Gene/Q system, and with 16 racks
                 we sustained 1.08 Pflop/s, 32.1\% of the theoretical
                 peak performance, including the conjugate gradient
                 solver routines.",
  acknowledgement = ack-nhfb,
  articleno =    "45",
}

@InProceedings{Sarje:2012:MPX,
  author =       "Abhinav Sarje and Xiaoye S. Li and Slim Chourou and
                 Elaine R. Chan and Alexander Hexemer",
  title =        "Massively parallel {X-ray} scattering simulations",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "46:1--46:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Although present X-ray scattering techniques can
                 provide tremendous information on the nano-structural
                 properties of materials that are valuable in the design
                 and fabrication of energy-relevant nano-devices, a
                 primary challenge remains in the analyses of such data.
                 In this paper we describe a high-performance, flexible,
                 and scalable Grazing Incidence Small Angle X-ray
                 Scattering simulation algorithm and codes that we have
                 developed on multi-core/CPU and many-core/GPU clusters.
                 We discuss in detail our implementation, optimization
                 and performance on these platforms. Our results show
                 speedups of ~125x on a Fermi-GPU and ~20x on a Cray-XE6
                 24-core node, compared to a sequential CPU code, with
                 near linear scaling on multi-node clusters. To our
                 knowledge, this is the first GISAXS simulation code
                 that is flexible to compute scattered light intensities
                 in all spatial directions allowing full reconstruction
                 of GISAXS patterns for any complex structures and with
                 high-resolutions while reducing simulation times from
                 months to minutes.",
  acknowledgement = ack-nhfb,
  articleno =    "46",
}

@InProceedings{Baker:2012:HPR,
  author =       "C. Baker and G. Davidson and T. M. Evans and S.
                 Hamilton and J. Jarrell and W. Joubert",
  title =        "High performance radiation transport simulations:
                 preparing for {Titan}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "47:1--47:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "In this paper we describe the Denovo code system.
                 Denovo solves the six-dimensional, steady-state, linear
                 Boltzmann transport equation, of central importance to
                 nuclear technology applications such as reactor core
                 analysis (neutronics), radiation shielding, nuclear
                 forensics and radiation detection. The code features
                 multiple spatial differencing schemes, state-of-the-art
                 linear solvers, the Koch-Baker-Alcouffe (KBA)
                 parallel-wavefront sweep algorithm for inverting the
                 transport operator, a new multilevel energy
                 decomposition method scaling to hundreds of thousands
                 of processing cores, and a modern, novel code
                 architecture that supports straightforward integration
                 of new features. In this paper we discuss the
                 performance of Denovo on the 20+ petaflop ORNL
                 GPU-based system, Titan. We describe algorithms and
                 techniques used to exploit the capabilities of Titan's
                 heterogeneous compute node architecture and the
                 challenges of obtaining good parallel performance for
                 this sparse hyperbolic PDE solver containing inherently
                 sequential computations. Numerical results
                 demonstrating Denovo performance on early Titan
                 hardware are presented.",
  acknowledgement = ack-nhfb,
  articleno =    "47",
}

@InProceedings{Jenkins:2012:BLD,
  author =       "John Jenkins and Eric R. Schendel and Sriram
                 Lakshminarasimhan and David A. {Boyuka II} and Terry
                 Rogers and Stephane Ethier and Robert Ross and Scott
                 Klasky and Nagiza F. Samatova",
  title =        "Byte-precision level of detail processing for variable
                 precision analytics",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "48:1--48:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "I/O bottlenecks in HPC applications are becoming a
                 more pressing problem as compute capabilities continue
                 to outpace I/O capabilities. While double-precision
                 simulation data often must be stored losslessly, the
                 loss of some of the fractional component may introduce
                 acceptably small errors to many types of scientific
                 analyses. Given this observation, we develop a
                 precision level of detail (APLOD) library, which
                 partitions double-precision datasets along user-defined
                 byte boundaries. APLOD parameterizes the analysis
                 accuracy-I/O performance tradeoff, bounds maximum
                 relative error, maintains I/O access patterns compared
                 to full precision, and operates with low overhead.
                 Using ADIOS as an I/O use-case, we show proportional
                 reduction in disk access time to the degree of
                 precision. Finally, we show the effects of partial
                 precision analysis on accuracy for operations such as
                 $k$-means and Fourier analysis, finding a strong
                 applicability for the use of varying degrees of
                  precision to reduce the cost of analyzing extreme-scale
                  data.",
  acknowledgement = ack-nhfb,
  articleno =    "48",
}

@InProceedings{Bennett:2012:CII,
  author =       "Janine C. Bennett and Hasan Abbasi and Peer-Timo
                 Bremer and Ray Grout and Attila Gyulassy and Tong Jin
                 and Scott Klasky and Hemanth Kolla and Manish Parashar
                 and Valerio Pascucci and Philippe Pebay and David
                  Thompson and Hongfeng Yu and Fan Zhang and Jacqueline
                  Chen",
  title =        "Combining in-situ and in-transit processing to enable
                 extreme-scale scientific analysis",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "49:1--49:9",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://portal.acm.org/;
                  http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "With the onset of extreme-scale computing, I/O
                 constraints make it increasingly difficult for
                 scientists to save a sufficient amount of raw
                 simulation data to persistent storage. One potential
                 solution is to change the data analysis pipeline from a
                 post-process centric to a concurrent approach based on
                 either in-situ or in-transit processing. In this
                 context computations are considered in-situ if they
                 utilize the primary compute resources, while in-transit
                 processing refers to offloading computations to a set
                 of secondary resources using asynchronous data
                 transfers. In this paper we explore the design and
                 implementation of three common analysis techniques
                 typically performed on large-scale scientific
                 simulations: topological analysis, descriptive
                 statistics, and visualization. We summarize algorithmic
                 developments, describe a resource scheduling system to
                 coordinate the execution of various analysis workflows,
                 and discuss our implementation using the DataSpaces and
                 ADIOS frameworks that support efficient data movement
                 between in-situ and in-transit computations. We
                 demonstrate the efficiency of our lightweight, flexible
                 framework by deploying it on the Jaguar XK6 to analyze
                 data generated by S3D, a massively parallel turbulent
                 combustion code. Our framework allows scientists
                 dealing with the data deluge at extreme scale to
                 perform analyses at increased temporal resolutions,
                 mitigate I/O costs, and significantly improve the time
                 to insight.",
  acknowledgement = ack-nhfb,
  articleno =    "49",
}

@InProceedings{Kumar:2012:EDR,
  author =       "Sidharth Kumar and Venkatram Vishwanath and Philip
                 Carns and Joshua A. Levine and Robert Latham and
                 Giorgio Scorzelli and Hemanth Kolla and Ray Grout and
                 Robert Ross and Michael E. Papka and Jacqueline Chen
                 and Valerio Pascucci",
  title =        "Efficient data restructuring and aggregation for {I/O}
                 acceleration in {PIDX}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "50:1--50:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Hierarchical, multiresolution data representations
                 enable interactive analysis and visualization of
                 large-scale simulations. One promising application of
                 these techniques is to store high performance computing
                 simulation output in a hierarchical Z (HZ) ordering
                 that translates data from a Cartesian coordinate scheme
                 to a one-dimensional array ordered by locality at
                 different resolution levels. However, when the
                 dimensions of the simulation data are not an even power
                 of 2, parallel HZ ordering produces sparse memory and
                 network access patterns that inhibit I/O performance.
                 This work presents a new technique for parallel HZ
                 ordering of simulation datasets that restructures
                 simulation data into large (power of 2) blocks to
                 facilitate efficient I/O aggregation. We perform both
                 weak and strong scaling experiments using the S3D
                 combustion application on both Cray-XE6 (65,536 cores)
                 and IBM Blue Gene/P (131,072 cores) platforms. We
                 demonstrate that data can be written in hierarchical,
                 multiresolution format with performance competitive to
                 that of native data-ordering methods.",
  acknowledgement = ack-nhfb,
  articleno =    "50",
}

@InProceedings{Kambadur:2012:MIL,
  author =       "Melanie Kambadur and Tipp Moseley and Rick Hank and
                 Martha A. Kim",
  title =        "Measuring interference between live datacenter
                  applications",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "51:1--51:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Application interference is prevalent in datacenters
                 due to contention over shared hardware resources.
                 Unfortunately, understanding interference in live
                 datacenters is more difficult than in controlled
                 environments or on simpler architectures. Most
                 approaches to mitigating interference rely on data that
                 cannot be collected efficiently in a production
                 environment. This work exposes eight specific
                 complexities of live datacenters that constrain
                 measurement of interference. It then introduces new,
                 generic measurement techniques for analyzing
                 interference in the face of these challenges and
                 restrictions. We use the measurement techniques to
                 conduct the first large-scale study of application
                 interference in live production datacenter workloads.
                 Data is measured across 1000 12-core Google servers
                 observed to be running 1102 unique applications.
                 Finally, our work identifies several opportunities to
                 improve performance that use only the available data;
                 these opportunities are applicable to any datacenter.",
  acknowledgement = ack-nhfb,
  articleno =    "51",
}

@InProceedings{Kaushik:2012:TDC,
  author =       "Rini T. Kaushik and Klara Nahrstedt",
  title =        "{T*}: a data-centric cooling energy costs reduction
                 approach for big data analytics cloud",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "52:1--52:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Explosion in Big Data has led to a surge in extremely
                 large-scale Big Data analytics platforms, resulting in
                 burgeoning energy costs. Big Data compute model
                 mandates strong data-locality for computational
                 performance, and moves computations to data.
                 State-of-the-art cooling energy management techniques
                 rely on thermal-aware computational job
                 placement/migration and are inherently
                 data-placement-agnostic in nature. T * takes a novel,
                 data-centric approach to reduce cooling energy costs
                 and to ensure thermal-reliability of the servers. T *
                 is cognizant of the uneven thermal-profile and
                 differences in thermal-reliability-driven load
                 thresholds of the servers, and the differences in the
                 computational jobs arrival rate, size, and evolution
                 life spans of the Big Data placed in the cluster. Based
                 on this knowledge, and coupled with its predictive file
                 models and insights, T * does proactive, thermal-aware
                 file placement, which implicitly results in
                 thermal-aware job placement in the Big Data analytics
                 compute model. Evaluation results with one-month long
                 real-world Big Data analytics production traces from
                 Yahoo! show up to 42\% reduction in the cooling energy
                 costs with T * courtesy of its lower and more uniform
                 thermal-profile and 9x better performance than the
                 state-of-the-art data-agnostic cooling techniques.",
  acknowledgement = ack-nhfb,
  articleno =    "52",
}

@InProceedings{Ravi:2012:VVS,
  author =       "Vignesh T. Ravi and Michela Becchi and Gagan Agrawal
                 and Srimat Chakradhar",
  title =        "{ValuePack}: value-based scheduling framework for
                 {CPU--GPU} clusters",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "53:1--53:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Heterogeneous computing nodes are becoming commonplace
                 today, and recent trends strongly indicate that
                 clusters, supercomputers, and cloud environments will
                 increasingly host more heterogeneous resources, with
                 some being massively parallel (e.g., GPU). With such
                 heterogeneous environments becoming common, it is
                 important to revisit scheduling problems for clusters
                 and cloud environments. In this paper, we formulate and
                 address the problem of value-driven scheduling of
                 independent jobs on heterogeneous clusters, which
                 captures both the urgency and relative priority of
                 jobs. Our overall scheduling goal is to maximize the
                 aggregate value or yield of all jobs. Exploiting the
                 portability available from the underlying programming
                 model, we propose four novel scheduling schemes that
                 can automatically and dynamically map jobs onto
                 heterogeneous resources. Additionally, to improve the
                 utilization of massively parallel resources, we also
                 propose heuristics to automatically decide when and
                 which jobs can share a single resource.",
  acknowledgement = ack-nhfb,
  articleno =    "53",
}

@InProceedings{Preissl:2012:CSS,
  author =       "Robert Preissl and Theodore M. Wong and Pallab Datta
                 and Myron Flickner and Raghavendra Singh and Steven K.
                 Esser and William P. Risk and Horst D. Simon and
                 Dharmendra S. Modha",
  title =        "{Compass}: a scalable simulator for an architecture
                 for cognitive computing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "54:1--54:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Inspired by the function, power, and volume of the
                 organic brain, we are developing TrueNorth, a novel
                 modular, non-von Neumann, ultra-low power, compact
                 architecture. TrueNorth consists of a scalable network
                 of neurosynaptic cores, with each core containing
                 neurons, dendrites, synapses, and axons. To set sail
                 for TrueNorth, we developed Compass, a multi-threaded,
                 massively parallel functional simulator and a parallel
                 compiler that maps a network of long-distance pathways
                 in the macaque monkey brain to TrueNorth. We
                 demonstrate near-perfect weak scaling on a 16 rack
                 IBM\reg{} Blue Gene\reg{}/Q (262144 CPUs, 256 TB
                 memory), achieving an unprecedented scale of 256
                 million neurosynaptic cores containing 65 billion
                 neurons and 16 trillion synapses running only 388X
                 slower than real time with an average spiking rate of
                 8.1 Hz. By using emerging PGAS communication
                 primitives, we also demonstrate 2X better real-time
                 performance over MPI primitives on a 4 rack Blue Gene/P
                 (16384 CPUs, 16 TB memory).",
  acknowledgement = ack-nhfb,
  articleno =    "54",
}

@InProceedings{Sun:2012:OFC,
  author =       "Yanhua Sun and Gengbin Zheng and Chao Mei and Eric J.
                  Bohm and James C. Phillips and Laxmikant V. Kal{\'e} and
                 Terry R. Jones",
  title =        "Optimizing fine-grained communication in a
                 biomolecular simulation application on {Cray XK6}",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "55:1--55:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Achieving good scaling for fine-grained communication
                 intensive applications on modern supercomputers remains
                 challenging. In our previous work, we have shown that
                 such an application --- NAMD --- scales well on the
                 full Jaguar XT5 without long-range interactions; Yet,
                 with them, the speedup falters beyond 64K cores.
                 Although the new Gemini interconnect on Cray XK6 has
                 improved network performance, the challenges remain,
                 and are likely to remain for other such networks as
                 well. We analyze communication bottlenecks in NAMD and
                 its CHARM++ runtime, using the Projections performance
                 analysis tool. Based on the analysis, we optimize the
                 runtime, built on the uGNI library for Gemini. We
                 present several techniques to improve the fine-grained
                 communication. Consequently, the performance of running
                 92224-atom Apoa1 with GPUs on TitanDev is improved by
                 36\%. For 100-million-atom STMV, we improve upon the
                 prior Jaguar XT5 result of 26 ms/step to 13 ms/step
                 using 298,992 cores on Jaguar XK6.",
  acknowledgement = ack-nhfb,
  articleno =    "55",
}

@InProceedings{Alexeev:2012:HSL,
  author =       "Yuri Alexeev and Ashutosh Mahajan and Sven Leyffer and
                 Graham Fletcher and Dmitri G. Fedorov",
  title =        "Heuristic static load-balancing algorithm applied to
                 the fragment molecular orbital method",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "56:1--56:13",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "In the era of petascale supercomputing, the importance
                 of load balancing is crucial. Although dynamic load
                 balancing is widespread, it is increasingly difficult
                 to implement effectively with thousands of processors
                 or more, prompting a second look at static
                 load-balancing techniques even though the optimal
                 allocation of tasks to processors is an NP-hard
                 problem. We propose a heuristic static load-balancing
                 algorithm, employing fitted benchmarking data, as an
                 alternative to dynamic load balancing. The problem of
                 allocating CPU cores to tasks is formulated as a
                 mixed-integer nonlinear optimization problem, which is
                 solved by using an optimization solver. On 163,840
                 cores of Blue Gene/P, we achieved a parallel efficiency
                 of 80\% for an execution of the fragment molecular
                 orbital method applied to model protein-ligand
                 complexes quantum-mechanically. The obtained allocation
                 is shown to outperform dynamic load balancing by at
                 least a factor of 2, thus motivating the use of this
                 approach on other coarse-grained applications.",
  acknowledgement = ack-nhfb,
  articleno =    "56",
}

@InProceedings{Li:2012:CSE,
  author =       "Dong Li and Jeffrey S. Vetter and Weikuan Yu",
  title =        "Classifying soft error vulnerabilities in
                 extreme-scale scientific applications using a binary
                 instrumentation tool",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "57:1--57:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Extreme-scale scientific applications are at a
                 significant risk of being hit by soft errors on
                 supercomputers as the scale of these systems and the
                 component density continues to increase. In order to
                 better understand the specific soft error
                 vulnerabilities in scientific applications, we have
                 built an empirical fault injection and consequence
                 analysis tool --- BIFIT --- that allows us to evaluate
                 how soft errors impact applications. In particular,
                 BIFIT is designed with capability to inject faults at
                 very specific targets: an arbitrarily-chosen execution
                 point and any specific data structure. We apply BIFIT
                 to three mission-critical scientific applications and
                 investigate the applications vulnerability to soft
                 errors by performing thousands of statistical tests.
                 We, then, classify each applications individual data
                 structures based on their sensitivity to these
                 vulnerabilities, and generalize these classifications
                 across applications. Subsequently, these
                 classifications can be used to apply appropriate
                 resiliency solutions to each data structure within an
                 application. Our study reveals that these scientific
                 applications have a wide range of sensitivities to both
                 the time and the location of a soft error; yet, we are
                 able to identify intrinsic relationships between
                 application vulnerabilities and specific types of data
                 objects. In this regard, BIFIT enables new
                 opportunities for future resiliency research.",
  acknowledgement = ack-nhfb,
  articleno =    "57",
}

@InProceedings{Chung:2012:CDS,
  author =       "Jinsuk Chung and Ikhwan Lee and Michael Sullivan and
                 Jee Ho Ryoo and Dong Wan Kim and Doe Hyun Yoon and
                 Larry Kaplan and Mattan Erez",
  title =        "Containment domains: a scalable, efficient, and
                 flexible resilience scheme for exascale systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "58:1--58:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "This paper describes and evaluates a scalable and
                 efficient resilience scheme based on the concept of
                 containment domains. Containment domains are a
                 programming construct that enable applications to
                 express resilience needs and to interact with the
                 system to tune and specialize error detection, state
                 preservation and restoration, and recovery schemes.
                 Containment domains have weak transactional semantics
                 and are nested to take advantage of the machine and
                 application hierarchies and to enable hierarchical
                 state preservation, restoration, and recovery. We
                 evaluate the scalability and efficiency of containment
                 domains using generalized trace-driven simulation and
                 analytical analysis and show that containment domains
                 are superior to both checkpoint restart and redundant
                 execution approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "58",
}

@InProceedings{Byna:2012:PIA,
  author =       "Surendra Byna and Jerry Chou and Oliver R{\"u}bel and
                 Prabhat and Homa Karimabadi and William S. Daughton and
                 Vadim Roytershteyn and E. Wes Bethel and Mark Howison
                 and Ke-Jou Hsu and Kuan-Wu Lin and Arie Shoshani and
                 Andrew Uselton and Kesheng Wu",
  title =        "Parallel {I/O}, analysis, and visualization of a
                 trillion particle simulation",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "59:1--59:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Petascale plasma physics simulations have recently
                 entered the regime of simulating trillions of
                 particles. These unprecedented simulations generate
                 massive amounts of data, posing significant challenges
                 in storage, analysis, and visualization. In this paper,
                 we present parallel I/O, analysis, and visualization
                 results from a VPIC trillion particle simulation
                 running on 120,000 cores, which produces ~ 30 TB of
                 data for a single timestep. We demonstrate the
                 successful application of H5Part, a particle data
                 extension of parallel HDF5, for writing the dataset at
                 a significant fraction of system peak I/O rates. To
                 enable efficient analysis, we develop hybrid parallel
                 FastQuery to index and query data using multi-core CPUs
                 on distributed memory hardware. We show good
                 scalability results for the FastQuery implementation
                 using up to 10,000 cores. Finally, we apply this
                 indexing/query-driven approach to facilitate the
                 first-ever analysis and visualization of the trillion
                 particle dataset.",
  acknowledgement = ack-nhfb,
  articleno =    "59",
}

@InProceedings{Kanov:2012:DSF,
  author =       "Kalin Kanov and Randal Burns and Greg Eyink and
                 Charles Meneveau and Alexander Szalay",
  title =        "Data-intensive spatial filtering in large numerical
                 simulation datasets",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "60:1--60:9",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "We present a query processing framework for the
                 efficient evaluation of spatial filters on large
                 numerical simulation datasets stored in a
                 data-intensive cluster. Previously, filtering of large
                 numerical simulations stored in scientific databases
                 has been impractical owing to the immense data
                 requirements. Rather, filtering is done during
                 simulation or by loading snapshots into the aggregate
                 memory of an HPC cluster. Our system performs filtering
                 within the database and supports large filter widths.
                 We present two complementary methods of execution: I/O
                 streaming computes a batch filter query in a single
                 sequential pass using incremental evaluation of
                 decomposable kernels, summed volumes generates an
                 intermediate data set and evaluates each filtered value
                 by accessing only eight points in this dataset. We
                 dynamically choose between these methods depending upon
                 workload characteristics. The system allows us to
                 perform filters against large data sets with little
                 overhead: query performance scales with the cluster's
                 aggregate I/O throughput.",
  acknowledgement = ack-nhfb,
  articleno =    "60",
}

@InProceedings{Nouanesengsy:2012:PPA,
  author =       "Boonthanome Nouanesengsy and Teng-Yok Lee and Kewei Lu
                 and Han-Wei Shen and Tom Peterka",
  title =        "Parallel particle advection and {FTLE} computation for
                 time-varying flow fields",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "61:1--61:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Flow fields are an important product of scientific
                 simulations. One popular flow visualization technique
                 is particle advection, in which seeds are traced
                 through the flow field. One use of these traces is to
                 compute a powerful analysis tool called the Finite-Time
                 Lyapunov Exponent (FTLE) field, but no existing
                 particle tracing algorithms scale to the particle
                 injection frequency required for high-resolution FTLE
                 analysis. In this paper, a framework to trace the
                 massive number of particles necessary for FTLE
                 computation is presented. A new approach is explored,
                 in which processes are divided into groups, and are
                 responsible for mutually exclusive spans of time. This
                 pipelining over time intervals reduces overall idle
                 time of processes and decreases I/O overhead. Our
                 parallel FTLE framework is capable of advecting
                 hundreds of millions of particles at once, with
                  performance scaling up to tens of thousands of
                  processes.",
  acknowledgement = ack-nhfb,
  articleno =    "61",
}

@InProceedings{Patwary:2012:NSP,
  author =       "Mostofa Ali Patwary and Diana Palsetia and Ankit
                  Agrawal and Wei-keng Liao and Fredrik Manne and Alok
                  Choudhary",
  title =        "A new scalable parallel {DBSCAN} algorithm using the
                 disjoint-set data structure",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "62:1--62:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "DBSCAN is a well-known density based clustering
                 algorithm capable of discovering arbitrary shaped
                 clusters and eliminating noise data. However,
                 parallelization of Dbscan is challenging as it exhibits
                 an inherent sequential data access order. Moreover,
                 existing parallel implementations adopt a master-slave
                 strategy which can easily cause an unbalanced workload
                 and hence result in low parallel efficiency. We present
                 a new parallel Dbscan algorithm (Pdsdbscan) using graph
                 algorithmic concepts. More specifically, we employ the
                 disjoint-set data structure to break the access
                 sequentiality of Dbscan. In addition, we use a
                 tree-based bottom-up approach to construct the
                 clusters. This yields a better-balanced workload
                 distribution. We implement the algorithm both for
                 shared and for distributed memory. Using data sets
                 containing up to several hundred million
                 high-dimensional points, we show that Pdsdbscan
                 significantly outperforms the master-slave approach,
                 achieving speedups up to 25.97 using 40 cores on shared
                 memory architecture, and speedups up to 5,765 using
                 8,192 cores on distributed memory architecture.",
  acknowledgement = ack-nhfb,
  articleno =    "62",
}

@InProceedings{Nikolova:2012:PBN,
  author =       "Olga Nikolova and Srinivas Aluru",
  title =        "Parallel {Bayesian} network structure learning with
                 application to gene networks",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "63:1--63:9",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Bayesian networks (BN) are probabilistic graphical
                 models which are widely utilized in various research
                 areas, including modeling complex biological
                 interactions in the cell. Learning the structure of a
                 BN is an NP-hard problem and exact solutions are
                 limited to a few tens of variables. In this work, we
                 present a parallel BN structure learning algorithm that
                 combines principles of both heuristic and exact
                 approaches and facilitates learning of larger networks.
                 We demonstrate the applicability of our approach by an
                 implementation on a Cray AMD cluster, and present
                 experimental results for the problem of inferring gene
                 networks. Our approach is work-optimal and achieves
                 nearly perfect scaling.",
  acknowledgement = ack-nhfb,
  articleno =    "63",
}

@InProceedings{Khan:2012:MAN,
  author =       "Arif M. Khan and David F. Gleich and Alex Pothen and
                 Mahantesh Halappanavar",
  title =        "A multithreaded algorithm for network alignment via
                 approximate matching",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "64:1--64:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Network alignment is an optimization problem to find
                 the best one-to-one map between the vertices of a pair
                 of graphs that overlaps as many edges as possible. It
                 is a relaxation of the graph isomorphism problem and is
                 closely related to the subgraph isomorphism problem.
                 The best current approaches are entirely heuristic and
                 iterative in nature. They generate real-valued
                 heuristic weights that must be rounded to find integer
                 solutions. This rounding requires solving a bipartite
                 maximum weight matching problem at each iteration in
                 order to avoid missing high quality solutions. We
                 investigate substituting a parallel, half-approximation
                 for maximum weight matching instead of an exact
                 computation. Our experiments show that the resulting
                 difference in solution quality is negligible. We
                 demonstrate almost a 20-fold speedup using 40 threads
                 on an 8 processor Intel Xeon E7-8870 system and now
                  solve real-world problems in 36 seconds instead of 10
                  minutes.",
  acknowledgement = ack-nhfb,
  articleno =    "64",
}

@InProceedings{Olivier:2012:CMW,
  author =       "Stephen L. Olivier and Bronis R. de Supinski and
                 Martin Schulz and Jan F. Prins",
  title =        "Characterizing and mitigating work time inflation in
                 task parallel programs",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "65:1--65:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Task parallelism raises the level of abstraction in
                 shared memory parallel programming to simplify the
                 development of complex applications. However, task
                 parallel applications can exhibit poor performance due
                 to thread idleness, scheduling overheads, and work time
                 inflation --- additional time spent by threads in a
                 multithreaded computation beyond the time required to
                 perform the same work in a sequential computation. We
                 identify the contributions of each factor to lost
                 efficiency in various task parallel OpenMP applications
                 and diagnose the causes of work time inflation in those
                 applications. Increased data access latency can cause
                 significant work time inflation in NUMA systems. Our
                 locality framework for task parallel OpenMP programs
                 mitigates this cause of work time inflation. Our
                 extensions to the Qthreads library demonstrate that
                 locality-aware scheduling can improve performance up to
                 3X compared to the Intel OpenMP task scheduler.",
  acknowledgement = ack-nhfb,
  articleno =    "65",
}

@InProceedings{Bauer:2012:LEL,
  author =       "Michael Bauer and Sean Treichler and Elliott Slaughter
                 and Alex Aiken",
  title =        "{Legion}: expressing locality and independence with
                 logical regions",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "66:1--66:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "",
  URL =          "",
  abstract =     "Modern parallel architectures have both heterogeneous
                 processors and deep, complex memory hierarchies. We
                 present Legion, a programming model and runtime system
                 for achieving high performance on these machines.
                 Legion is organized around logical regions, which
                 express both locality and independence of program data,
                 and tasks, functions that perform computations on
                 regions. We describe a runtime system that dynamically
                 extracts parallelism from Legion programs, using a
                 distributed, parallel scheduling algorithm that
                 identifies both independent tasks and nested
                 parallelism. Legion also enables explicit, programmer
                 controlled movement of data through the memory
                 hierarchy and placement of tasks based on locality
                 information via a novel mapping interface. We evaluate
                 our Legion implementation on three applications:
                 fluid-flow on a regular grid, a three-level AMR code
                 solving a heat diffusion equation, and a circuit
  acknowledgement = ack-nhfb,
  articleno =    "66",

  author =       "Michael Garland and Manjunath Kudlur and Yili Zheng",
  title =        "Designing a unified programming model for
                 heterogeneous machines",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "67:1--67:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "While high-efficiency machines are increasingly
                 embracing heterogeneous architectures and massive
                 multithreading, contemporary mainstream programming
                 languages reflect a mental model in which processing
                 elements are homogeneous, concurrency is limited, and
                 memory is a flat undifferentiated pool of storage.
                 Moreover, the current state of the art in programming
                 heterogeneous machines tends towards using separate
                 programming models, such as OpenMP and CUDA, for
                 different portions of the machine. Both of these
                 factors make programming emerging heterogeneous
                 machines unnecessarily difficult. We describe the
                 design of the Phalanx programming model, which seeks to
                 provide a unified programming model for heterogeneous
                 machines. It provides constructs for bulk parallelism,
                 synchronization, and data placement which operate
                 across the entire machine. Our prototype implementation
                 is able to launch and coordinate work on both CPU and
                 GPU processors within a single node, and by leveraging
                 the GASNet runtime, is able to run across all the nodes
                 of a distributed-memory machine.",
  acknowledgement = ack-nhfb,
  articleno =    "67",

  author =       "Sushant Sharma and Dimitrios Katramatos and Dantong Yu
                 and Li Shi",
  title =        "Design and implementation of an intelligent end-to-end
                 network {QoS} system",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "68:1--68:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "End-to-End guaranteed network QoS is a requirement for
                 predictable data transfers between geographically
                 distant end-hosts. Existing QoS systems, however, do
                 not have the capability/intelligence to decide what
                 resources to reserve and which paths to choose when
                 there are multiple and flexible resource reservation
                 requests. In this paper, we design and implement an
                 intelligent system that can guarantee end-to-end
                 network QoS for multiple flexible reservation requests.
                 At the heart of this system is a polynomial time
                 algorithm called resource reservation and path
                 construction (RRPC). The RRPC algorithm schedules
                 multiple flexible end-to-end data transfer requests by
                 jointly optimizing the path construction and bandwidth
                 reservation along these paths. We show that
                 constructing such schedules is NP-hard. We implement
                 our intelligent QoS system, and present the results of
                 deployment on real world production networks (ESnet and
                 Internet2). Our implementation does not require
                 modifications or new software to be deployed on the
                 routers within network.",
  acknowledgement = ack-nhfb,
  articleno =    "68",

  author =       "Dong Chen and Noel Eisley and Philip Heidelberger and
                 Sameer Kumar and Amith Mamidala and Fabrizio Petrini
                 and Robert Senger and Yutaka Sugawara and Robert Walkup
                 and Burkhard Steinmacher-Burow and Anamitra Choudhury
                 and Yogish Sabharwal and Swati Singhal and Jeffrey J.
  title =        "Looking under the hood of the {IBM Blue Gene/Q}
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "69:1--69:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "This paper explores the performance and optimization
                 of the IBM Blue Gene/Q (BG/Q) five dimensional torus
                 network on up to 16K nodes. The BG/Q hardware supports
                 multiple dynamic routing algorithms and different
                 traffic patterns may require different algorithms to
                 achieve best performance. Between 85\% to 95\% of peak
                 network performance is achieved for all-to-all traffic,
                 while over 85\% of peak is obtained for challenging
                 bisection pairings. A new software-controlled algorithm
                 is developed for bisection traffic that selects which
                 hardware algorithm to employ and achieves better
                 performance than any individual hardware algorithm. The
                 benefit of dynamic routing is shown for a highly
                 non-uniform ``transpose'' traffic pattern. To evaluate
                 memory and network performance, the HPCC Random Access
                 benchmark was tuned for BG/Q and achieved 858 Giga
                 Updates per Second (GUPS) on 16K nodes. To further
                 accelerate message processing, the message libraries on
                 BG/Q enable the offloading of messaging overhead onto
                 dedicated communication threads. Several applications,
                 including Algebraic Multigrid (AMG), exhibit from 3 to
                 20\% gain using communication threads.",
  acknowledgement = ack-nhfb,
  articleno =    "69",

  author =       "H. Subramoni and S. Potluri and K. Kandalla and B.
                 Barth and J. Vienne and J. Keasler and K. Tomko and K.
                 Schulz and A. Moody and D. K. Panda",
  title =        "Design of a scalable {InfiniBand} topology service to
                 enable network-topology-aware placement of processes",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "70:1--70:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Over the last decade, InfiniBand has become an
                 increasingly popular interconnect for deploying modern
                 super-computing systems. However, there exists no
                 detection service that can discover the underlying
                 network topology in a scalable manner and expose this
                 information to runtime libraries and users of the high
                 performance computing systems in a convenient way. In
                 this paper, we design a novel and scalable method to
                 detect the InfiniBand network topology by using
                 Neighbor-Joining techniques (NJ). To the best of our
                 knowledge, this is the first instance where the
                 neighbor joining algorithm has been applied to solve
                 the problem of detecting InfiniBand network topology.
                 We also design a network-topology-aware MPI library
                 that takes advantage of the network topology service.
                 The library places processes taking part in the MPI job
                 in a network-topology-aware manner with the dual aim of
                 increasing intra-node communication and reducing the
                 long distance inter-node communication across the
                 InfiniBand fabric.",
  acknowledgement = ack-nhfb,
  articleno =    "70",

  author =       "Guancheng Chen and Per Stenstrom",
  title =        "Critical lock analysis: diagnosing critical section
                 bottlenecks in multithreaded applications",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "71:1--71:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Critical sections are well known potential performance
                 bottlenecks in multithreaded applications and
                 identifying the ones that inhibit scalability are
                 important for performance optimizations. While previous
                 approaches use idle time as a key measure, we show such
                 a measure is not reliable. The reason is that idleness
                 does not necessarily mean the critical section is on
                 the critical path. We introduce critical lock analysis,
                 a new method for diagnosing critical section
                 bottlenecks in multithreaded applications. Our method
                 firstly identifies the critical sections appearing on
                 the critical path, and then quantifies the impact of
                 such critical sections on the overall performance by
                 using quantitative performance metrics. Case studies
                 show that our method can successfully identify critical
                 sections that are most beneficial for improving overall
                 performance as well as quantify their performance
                 impact on the critical path, which results in a more
                 reliable establishment of the inherent critical section
                 bottlenecks than previous approaches.",
  acknowledgement = ack-nhfb,
  articleno =    "71",

  author =       "Mahesh Ravishankar and John Eisenlohr and
                 Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas
                 Rountev and P. Sadayappan",
  title =        "Code generation for parallel execution of a class of
                 irregular loops on distributed memory systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "72:1--72:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Parallelization and locality optimization of affine
                 loop nests has been successfully addressed for
                 shared-memory machines. However, many large-scale
                 simulation applications must be executed in a
                 distributed-memory environment, and use
                 irregular/sparse computations where the control-flow
                 and array-access patterns are data-dependent. In this
                 paper, we propose an approach for effective parallel
                 execution of a class of irregular loop computations in
                 a distributed-memory environment, using a combination
                 of static and runtime analysis. We discuss algorithms
                 that analyze sequential code to generate an inspector
                 and an executor. The inspector captures the
                 data-dependent behavior of the computation in parallel
                 and without requiring complete replication of any of
                 the data structures used in the original computation.
                 The executor performs the computation in parallel. The
                 effectiveness of the framework is demonstrated on
                 several benchmarks and a climate modeling
  acknowledgement = ack-nhfb,
  articleno =    "72",

  author =       "Jean-Michel Alimi and Vincent Bouillot and Yann Rasera
                 and Vincent Reverdy and Pier-Stefano Corasaniti and
                 Ir{\`e}ne Balm{\`e}s and St{\'e}phane Requena and
                 Xavier Delaruelle and Jean-Noel Richet",
  title =        "First-ever full observable universe simulation",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "73:1--73:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "We have performed the first-ever numerical N-body
                 simulation of the full observable universe (DEUS ``Dark
                 Energy Universe Simulation'' FUR ``Full Universe
                 Run''). This has evolved 550 billion particles on an
                 Adaptive Mesh Refinement grid with more than two
                 trillion computing points along the entire evolutionary
                 history of the universe and across 6 order of
                 magnitudes length scales, from the size of the Milky
                 Way to that of the whole observable Universe. To date,
                 this is the largest and most advanced cosmological
                 simulation ever run. It provides unique information on
                 the formation and evolution of the largest structure in
                 the universe and an exceptional support to future
                 observational programs dedicated to mapping the
                 distribution of matter and galaxies in the universe.
                 The simulation has run on 4752 (of 5040) thin nodes of
                 BULL supercomputer CURIE, using more than 300 TB of
                 memory for 10 million hours of computing time. About 50
                 PBytes of data were generated throughout the run. Using
                 an advanced and innovative reduction workflow the
                 amount of useful stored data has been reduced to 500
  acknowledgement = ack-nhfb,
  articleno =    "73",

  author =       "William B. March and Kenneth Czechowski and Marat
                 Dukhan and Thomas Benson and Dongryeol Lee and Andrew
                 J. Connolly and Richard Vuduc and Edmond Chow and
                 Alexander G. Gray",
  title =        "Optimizing the computation of $n$-point correlations
                 on large-scale astronomical data",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "74:1--74:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "The n-point correlation functions (npcf) are powerful
                 statistics that are widely used for data analyses in
                 astronomy and other fields. These statistics have
                 played a crucial role in fundamental physical
                 breakthroughs, including the discovery of dark energy.
                 Unfortunately, directly computing the npcf at a single
                 value requires O ( N$^n$ ) time for N points and values
                 of n of 2, 3, 4, or even larger. Astronomical data sets
                 can contain billions of points, and the next generation
                 of surveys will generate terabytes of data per night.
                 To meet these computational demands, we present a
                 highly-tuned npcf computation code that show an
                 order-of-magnitude speedup over current
                 state-of-the-art. This enables a much larger 3-point
                 correlation computation on the galaxy distribution than
                 was previously possible. We show a detailed performance
                 evaluation on many different architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "74",

  author =       "Jingjin Wu and Zhiling Lan and Xuanxing Xiong and
                 Nickolay Y. Gnedin and Andrey V. Kravtsov",
  title =        "Hierarchical task mapping of cell-based {AMR}
                 cosmology simulations",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "75:1--75:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Cosmology simulations are highly
                 communication-intensive, thus it is critical to exploit
                 topology-aware task mapping techniques for performance
                 optimization. To exploit the architectural properties
                 of multiprocessor clusters (the performance gap between
                 inter-node and intra-node communication as well as the
                 gap between inter-socket and intra-socket
                 communication), we design and develop a hierarchical
                 task mapping scheme for cell-based AMR (Adaptive Mesh
                 Refinement) cosmology simulations, in particular, the
                 ART application. Our scheme consists of two parts: (1)
                 an inter-node mapping to map application processes onto
                 nodes with the objective of minimizing network traffic
                 among nodes and (2) an intra-node mapping within each
                 node to minimize the maximum size of messages
                 transmitted between CPU sockets. Experiments on
                 production supercomputers with 3D torus and fat-tree
                 topologies show that our scheme can significantly
                 reduce application communication cost by up to 50\%.
                 More importantly, our scheme is generic and can be
                 extended to many other applications.",
  acknowledgement = ack-nhfb,
  articleno =    "75",

  author =       "Vilas Sridharan and Dean Liberty",
  title =        "A study of {DRAM} failures in the field",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "76:1--76:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Most modern computer systems use dynamic random access
                 memory (DRAM) as a main memory store. Recent
                 publications have confirmed that DRAM errors are a
                 common source of failures in the field. Therefore,
                 further attention to the faults experienced by DRAM
                 sub-systems is warranted. In this paper, we present a
                 study of 11 months of DRAM errors in a large
                 high-performance computing cluster. Our goal is to
                 understand the failure modes, rates, and fault types
                 experienced by DRAM in production settings. We identify
                 several unique DRAM failure modes, including
                 single-bit, multi-bit, and multi-chip failures. We also
                 provide a deterministic bound on the rate of transient
                 faults in the DRAM array, by exploiting the presence of
                 a hardware scrubber on our nodes. We draw several
                 conclusions from our study. First, DRAM failures are
                 dominated by permanent, rather than transient, faults,
                 although not to the extent found by previous
                 publications. Second, DRAMs are susceptible to large
                 multi-bit failures, such as failures that affect an
                 entire DRAM row or column, indicating faults in shared
                 internal circuitry. Third, we identify a DRAM failure
                 mode that disrupts access to other DRAM devices that
                 share the same board-level circuitry. Finally, we find
                 that chipkill error-correcting codes (ECC) are
                 extremely effective, reducing the node failure rate
                 from uncorrected DRAM errors by 42x compared to
                 single-error correct/double-error detect (SEC-DED)
  acknowledgement = ack-nhfb,
  articleno =    "76",

  author =       "Ana Gainaru and Franck Cappello and Marc Snir and
                 William Kramer",
  title =        "Fault prediction under the microscope: a closer look
                 into {HPC} systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "77:1--77:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "A large percentage of computing capacity in today's
                 large high-performance computing systems is wasted
                 because of failures. Consequently current research is
                 focusing on providing fault tolerance strategies that
                 aim to minimize fault's effects on applications. By far
                 the most popular technique is the checkpoint-restart
                 strategy. A complement to this classical approach is
                 failure avoidance, by which the occurrence of a fault
                 is predicted and preventive measures are taken. This
                 requires a reliable prediction system to anticipate
                 failures and their locations. Thus far, research in
                 this field has used ideal predictors that were not
                 implemented in real HPC systems. In this paper, we
                 merge signal analysis concepts with data mining
                 techniques to extend the ELSA (Event Log Signal
                 Analyzer) toolkit and offer an adaptive and more
                 efficient prediction module. Our goal is to provide
                 models that characterize the normal behavior of a
                 system and the way faults affect it. Being able to
                 detect deviations from normality quickly is the
                 foundation of accurate fault prediction. However, this
                 is challenging because component failure dynamics are
                 heterogeneous in space and time. To this end, a large
                 part of the paper is focused on a detailed analysis of
                 the prediction method, by applying it to two
                 large-scale systems and by investigating the
                 characteristics and bottlenecks of each step of the
                 prediction process. Furthermore, we analyze the
                 prediction's precision and recall impact on current
                 checkpointing strategies and highlight future
                 improvements and directions for research in this
  acknowledgement = ack-nhfb,
  articleno =    "77",

  author =       "David Fiala and Frank Mueller and Christian Engelmann
                 and Rolf Riesen and Kurt Ferreira and Ron Brightwell",
  title =        "Detection and correction of silent data corruption for
                 large-scale high-performance computing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "78:1--78:12",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Faults have become the norm rather than the exception
                 for high-end computing clusters. Exacerbating this
                 situation, some of these faults remain undetected,
                 manifesting themselves as silent errors that allow
                 applications to compute incorrect results. This paper
                 studies the potential for redundancy to detect and
                 correct soft errors in MPI message-passing applications
                 while investigating the challenges inherent to
                 detecting soft errors within MPI applications by
                 providing transparent MPI redundancy. By assuming a
                 model wherein corruption in application data manifests
                 itself by producing differing MPI messages between
                 replicas, we study the best suited protocols for
                 detecting and correcting corrupted MPI messages. Using
                 our fault injector, we observe that even a single error
                 can have profound effects on applications by causing a
                 cascading pattern of corruption which in most cases
                 spreads to all other processes. Results indicate that
                 our consistency protocols can successfully protect
                 applications experiencing even high rates of silent
                 data corruption.",
  acknowledgement = ack-nhfb,
  articleno =    "78",

  author =       "Dmytro Karpenko and Roman Vitenberg and Alexander L.
  title =        "{ATLAS} grid workload on {NDGF} resources: analysis,
                 modeling, and workload generation",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "79:1--79:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Evaluating new ideas for job scheduling or data
                 transfer algorithms in large-scale grid systems is
                 known to be notoriously challenging. Existing grid
                 simulators expect to receive a realistic workload as an
                 input. Such input is difficult to provide in absence of
                 an in-depth study of representative grid workloads. In
                 this work, we analyze the ATLAS workload processed on
                 the resources of NDG Facility. ATLAS is one of the
                 biggest grid technology users, with extreme demands for
                 CPU power and bandwidth. The analysis is based on the
                 data sample with ~1.6 million jobs, 1,723 TB of data
                 transfer, and 873 years of processor time. Our
                 additional contributions are (a) scalable workload
                 models that can be used to generate a synthetic
                 workload for a given number of jobs, (b) an open-source
                 workload generator software integrated with existing
                 grid simulators, and (c) suggestions for grid system
                 designers based on the insights of data analysis.",
  acknowledgement = ack-nhfb,
  articleno =    "79",

  author =       "Trilce Estrada and Michela Taufer",
  title =        "On the effectiveness of application-aware
                 self-management for scientific discovery in volunteer
                 computing systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "80:1--80:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "An important challenge faced by high-throughput,
                 multiscale applications is that human intervention has
                 a central role in driving their success. However,
                 manual intervention is in-efficient, error-prone and
                 promotes resource wasting. This paper presents an
                 application-aware modular framework that provides
                 self-management for computational multiscale
                 applications in volunteer computing (VC). Our framework
                 consists of a learning engine and three modules that
                 can be easily adapted to different distributed systems.
                 The learning engine of this framework is based on our
                 novel tree-like structure called KOTree. KOTree is a
                 fully automatic method that organizes statistical
                 information in a multidimensional structure that can be
                 efficiently searched and updated at runtime. Our
                 empirical evaluation shows that our framework can
                 effectively provide application-aware self-management
                 in VC systems. Additionally, we observed that our
                 KOTree algorithm is able to predict accurately the
                 expected length of new jobs, resulting in an average of
                 85\% increased throughput with respect to other
  acknowledgement = ack-nhfb,
  articleno =    "80",

  author =       "Z. Liu and M. Veeraraghavan and Z. Yan and C. Tracy
                 and J. Tie and I. Foster and J. Dennis and J. Hick and
                 Y. Li and W. Yang",
  title =        "On using virtual circuits for {GridFTP} transfers",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "81:1--81:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "The goal of this work is to characterize scientific
                 data transfers and to determine the suitability of
                 dynamic virtual circuit service for these transfers
                 instead of the currently used IP-routed service.
                 Specifically, logs collected by servers executing a
                 commonly used scientific data transfer application,
                 GridFTP, are obtained from three US
                 super-computing/scientific research centers, NERSC,
                 SLAC, and NCAR, and analyzed. Dynamic virtual circuit
                 (VC) service, a relatively new offering from providers
                 such as ESnet and Internet2, allows for the selection
                 of a path on which a rate-guaranteed connection is
                 established prior to data transfer. Given VC setup
                 overhead, the first analysis of the GridFTP transfer
                 logs characterizes the duration of sessions, where a
                 session consists of multiple back-to-back transfers
                 executed in batch mode between the same two GridFTP
                 servers. Of the NCAR-NICS sessions analyzed, 56\% of
                 all sessions (90\% of all transfers) would have been
                 long enough to be served with dynamic VC service. An
                 analysis of transfer logs across four paths, NCAR-NICS,
                 SLAC-BNL, NERSC-ORNL and NERSC-ANL, shows significant
                 throughput variance, where NICS, BNL, ORNL, and ANL are
                 other US national laboratories. For example, on the
                 NERSC-ORNL path, the inter-quartile range was 695 Mbps,
                 with a maximum value of 3.64 Gbps and a minimum value
                 of 758 Mbps. An analysis of the impact of various
                 factors that are potential causes of this variance is
                 also presented.",
  acknowledgement = ack-nhfb,
  articleno =    "81",

  author =       "Jiayuan Meng and Vitali A. Morozov and Venkatram
                 Vishwanath and Kalyan Kumaran",
  title =        "Dataflow-driven {GPU} performance projection for
                 multi-kernel transformations",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "82:1--82:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Applications often have a sequence of parallel
                 operations to be offloaded to graphics processors; each
                 operation can become an individual GPU kernel.
                 Developers typically explore a variety of
                 transformations for each kernel. Furthermore, it is
                 well known that efficient data management is critical
                 in achieving high GPU performance and that ``fusing''
                 multiple kernels into one may greatly improve data
                 locality. Doing so, however, requires transformations
                 across multiple, potentially nested, parallel loops; at
                 the same time, the original code semantics and data
                 dependency must be preserved. Since each kernel may
                 have distinct data access patterns, their combined
                 dataflow can be nontrivial. As a result, the complexity
                 of multi-kernel transformations often leads to
                 significant effort with no guarantee of performance
                 benefits. This paper proposes a dataflow-driven
                 analytical framework to project GPU performance for a
                 sequence of parallel operations. Users need only
                 provide CPU code skeletons for a sequence of parallel
                 loops. The framework can then automatically identify
                 opportunities for multi-kernel transformations and data
                 management. It is also able to project the overall
                 performance without implementing GPU code or using
                 physical hardware.",
  acknowledgement = ack-nhfb,
  articleno =    "82",

  author =       "Tyler Dwyer and Alexandra Fedorova and Sergey
                 Blagodurov and Mark Roth and Fabien Gaud and Jian Pei",
  title =        "A practical method for estimating performance
                 degradation on multicore processors, and its
                 application to {HPC} workloads",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "83:1--83:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "When multiple threads or processes run on a multi-core
                 CPU they compete for shared resources, such as caches
                 and memory controllers, and can suffer performance
                 degradation as high as 200\%. We design and evaluate a
                 new machine learning model that estimates this
                 degradation online, on previously unseen workloads, and
                 without perturbing the execution. Our motivation is to
                 help data center and HPC cluster operators effectively
                 use workload consolidation. Data center consolidation
                 is about placing many applications on the same server
                 to maximize hardware utilization. In HPC clusters,
                 processes of the same distributed applications run on
                 the same machine. Consolidation improves hardware
                 utilization, but may sacrifice performance as processes
                 compete for resources. Our model helps determine when
                 consolidation is overly harmful to performance. Our
                 work is the first to apply machine learning to this
                 problem domain, and we report on our experience reaping
                 the advantages of machine learning while navigating
                 around its limitations. We demonstrate how the model
                 can be used to improve performance fidelity and save
                 energy for HPC workloads.",
  acknowledgement = ack-nhfb,
  articleno =    "83",

  author =       "Kyle L. Spafford and Jeffrey S. Vetter",
  title =        "{Aspen}: a domain specific language for performance
                  modeling",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "84:1--84:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "We present a new approach to analytical performance
                 modeling using Aspen, a domain specific language. Aspen
                 (Abstract Scalable Performance Engineering Notation)
                 fills an important gap in existing performance modeling
                 techniques and is designed to enable rapid exploration
                 of new algorithms and architectures. It includes a
                 formal specification of an application's performance
                 behavior and an abstract machine model. We provide an
                 overview of Aspen's features and demonstrate how it can
                 be used to express a performance model for a three
                 dimensional Fast Fourier Transform. We then demonstrate
                 the composability and modularity of Aspen by importing
                 and reusing the FFT model in a molecular dynamics
                 model. We have also created a number of tools that
                 allow scientists to balance application and system
                 factors quickly and accurately.",
  acknowledgement = ack-nhfb,
  articleno =    "84",

  author =       "Zhao Zhang and Daniel S. Katz and Justin M. Wozniak
                 and Allan Espinosa and Ian Foster",
  title =        "Design and analysis of data management in scalable
                 parallel scripting",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "85:1--85:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "We seek to enable efficient large-scale parallel
                 execution of applications in which a shared filesystem
                 abstraction is used to couple many tasks. Such parallel
                  scripting (many-task computing, MTC) applications
                 suffer poor performance and utilization on large
                 parallel computers because of the volume of filesystem
                 I/O and a lack of appropriate optimizations in the
                 shared filesystem. Thus, we design and implement a
                 scalable MTC data management system that uses
                 aggregated compute node local storage for more
                 efficient data movement strategies. We co-design the
                 data management system with the data-aware scheduler to
                 enable dataflow pattern identification and automatic
                 optimization. The framework reduces the time to
                 solution of parallel stages of an astronomy data
                 analysis application, Montage, by 83.2\% on 512 cores;
                 decreases the time to solution of a seismology
                 application, CyberShake, by 7.9\% on 2,048 cores; and
                 delivers BLAST performance better than mpiBLAST at
                 various scales up to 32,768 cores, while preserving the
                 flexibility of the original BLAST application.",
  acknowledgement = ack-nhfb,
  articleno =    "85",

  author =       "Ian F. Adams and Brian A. Madden and Joel C. Frank and
                 Mark W. Storer and Ethan L. Miller and Gene Harano",
  title =        "Usage behavior of a large-scale scientific archive",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "86:1--86:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Archival storage systems for scientific data have been
                 growing in both size and relevance over the past two
                 decades, yet researchers and system designers alike
                 must rely on limited and obsolete knowledge to guide
                 archival management and design. To address this issue,
                 we analyzed three years of file-level activities from
                 the NCAR mass storage system, providing valuable
                 insight into a large-scale scientific archive with over
                 1600 users, tens of millions of files, and petabytes of
                 data. Our examination of system usage showed that,
                 while a subset of users were responsible for most of
                 the activity, this activity was widely distributed at
                 the file level. We also show that the physical grouping
                 of files and directories on media can improve archival
                 storage system performance. Based on our observations,
                 we provide suggestions and guidance for both future
                 scientific archival system designs as well as improved
                 tracing of archival activity.",
  acknowledgement = ack-nhfb,
  articleno =    "86",

  author =       "Jharrod LaFon and Satyajayant Misra and Jon
                  Bringhurst",
  title =        "On distributed file tree walk of parallel file
                  systems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "87:1--87:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Supercomputers generate vast amounts of data,
                 typically organized into large directory hierarchies on
                 parallel file systems. While the supercomputing
                 applications are parallel, the tools used to process
                 them requiring complete directory traversal, are
                 typically serial. We present an algorithm framework and
                 three fully distributed algorithms for traversing large
                 parallel file systems, and performing file operations
                 in parallel. The first algorithm introduces a
                 randomized work-stealing scheduler; the second improves
                 the first with proximity-awareness; and the third
                 improves upon the second by using a hybrid approach. We
                 have tested our implementation on Cielo, a 1.37
                 petaflop supercomputer at the Los Alamos National
                 Laboratory and its 7 petabyte file system. Test results
                 show that our algorithms execute orders of magnitude
                 faster than state-of-the-art algorithms while achieving
                 ideal load balancing and low communication cost. We
                 present performance insights from the use of our
                 algorithms in production systems at LANL, performing
                 daily file system operations.",
  acknowledgement = ack-nhfb,
  articleno =    "87",

  author =       "I-Hsin Chung and Changhoan Kim and Hui-Fang Wen and
                 Guojing Cong",
  title =        "Application data prefetching on the {IBM Blue Gene/Q}
                  supercomputer",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "88:1--88:8",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Memory access latency is often a crucial performance
                 limitation for high performance computing. Prefetching
                 is one of the strategies used by system designers to
                 bridge the processor-memory gap. This paper describes a
                 new innovative list prefetching feature introduced in
                 the IBM Blue Gene/Q supercomputer. The list prefetcher
                 records the L1 cache miss addresses and prefetches them
                 in the next iteration. The evaluation shows this list
                 prefetching mechanism reduces data fetching time when
                 L1 cache misses happen and improves the performance for
                 high performance computing applications with repeating
                 non-uniform memory access patterns. Its performance is
                  compatible with classic stream prefetcher when properly
                  configured.",
  acknowledgement = ack-nhfb,
  articleno =    "88",

  author =       "Lluc Alvarez and Llu{\'\i}s Vilanova and Marc Gonzalez
                  and Xavier Martorell and Nacho Navarro and Eduard
                  Ayguad{\'e}",
  title =        "Hardware-software coherence protocol for the
                 coexistence of caches and local memories",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "89:1--89:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Cache coherence protocols limit the scalability of
                 chip multiprocessors. One solution is to introduce a
                 local memory alongside the cache hierarchy, forming a
                 hybrid memory system. Local memories are more
                 power-efficient than caches and they do not generate
                 coherence traffic but they suffer from poor
                 programmability. When non-predictable memory access
                 patterns are found compilers do not succeed in
                 generating code because of the incoherency between the
                 two storages. This paper proposes a coherence protocol
                 for hybrid memory systems that allows the compiler to
                 generate code even in the presence of memory aliasing
                 problems. Coherency is ensured by a simple
                 software/hardware co-design where the compiler
                 identifies potentially incoherent memory accesses and
                 the hardware diverts them to the correct copy of the
                 data. The coherence protocol introduces overheads of
                 0.24\% in execution time and of 1.06\% in energy
                  consumption to enable the usage of the hybrid memory
                  system.",
  acknowledgement = ack-nhfb,
  articleno =    "89",

  author =       "Martin Schindewolf and Barna Bihari and John
                  Gyllenhaal and Martin Schulz and Amy Wang and Wolfgang
                  Karl",
  title =        "What scientific applications can benefit from hardware
                 transactional memory?",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "90:1--90:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Achieving efficient and correct synchronization of
                 multiple threads is a difficult and error-prone task at
                 small scale and, as we march towards extreme scale
                 computing, will be even more challenging when the
                 resulting application is supposed to utilize millions
                 of cores efficiently. Transactional Memory (TM) is a
                 promising technique to ease the burden on the
                 programmer, but only recently has become available on
                 commercial hardware in the new Blue Gene/Q system and
                 hence the real benefit for realistic applications has
                 not been studied yet. This paper presents the first
                 performance results of TM embedded into OpenMP on a
                 prototype system of BG/Q and characterizes code
                 properties that will likely lead to benefits when
                 augmented with TM primitives. We first study the
                 influence of thread count, environment variables and
                 memory layout on TM performance and identify code
                 properties that will yield performance gains with TM.
                 Second, we evaluate the combination of OpenMP with
                 multiple synchronization primitives on top of MPI to
                 determine suitable task to thread ratios per node.
                 Finally, we condense our findings into a set of best
                 practices. These are applied to a Monte Carlo Benchmark
                 and a Smoothed Particle Hydrodynamics method. In both
                 cases an optimized TM version, executed with 64 threads
                 on one node, outperforms a simple TM implementation.
                  MCB with optimized TM yields a speedup of 27.45 over
                  the baseline.",
  acknowledgement = ack-nhfb,
  articleno =    "90",

  author =       "Laura Grigori and Radek Stompor and Mikolaj
                  Szydlarski",
  title =        "A parallel two-level preconditioner for cosmic
                 microwave background map-making",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "91:1--91:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Generalized least square problems with non-diagonal
                 weights arise frequently in an estimation of two
                 dimensional images from data of cosmological as well as
                 astro- or geo- physical observations. As the
                 observational data sets keep growing at Moore's rate,
                 with their volumes exceeding tens and hundreds billions
                 of samples, the need for fast and efficiently
                 parallelizable iterative solvers is generally
                 recognized. In this work we propose a new iterative
                 algorithm for solving generalized least square systems
                 with weights given by a block-diagonal matrix with
                 Toeplitz blocks. Such cases are physically well
                 motivated and correspond to measurement noise being
                 piece-wise stationary --- a common occurrence in many
                 actual observations. Our iterative algorithm is based
                 on the conjugate gradient method and includes a
                 parallel two-level preconditioner (2lvl-PCG)
                 constructed from a limited number of sparse vectors
                 estimated from the coefficients of the initial linear
                 system. Our prototypical application is the map-making
                 problem in the Cosmic Microwave Background data
                 analysis. We show experimentally that our parallel
                 implementation of 2lvl-PCG outperforms by a factor of
                 up to 6 the standard one-level PCG in terms of both the
                 convergence rate and the time to solution on up to 12,
                 228 cores of NERSC's Cray XE6 (Hopper) system
                 displaying nearly perfect strong and weak scaling
                 behavior in this regime.",
  acknowledgement = ack-nhfb,
  articleno =    "91",

  author =       "R. Speck and D. Ruprecht and R. Krause and M. Emmett
                 and M. Minion and M. Winkel and P. Gibbon",
  title =        "A massively space-time parallel {$N$}-body solver",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "92:1--92:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "We present a novel space-time parallel version of the
                 Barnes--Hut tree code pepc using pfasst, the Parallel
                 Full Approximation Scheme in Space and Time. The naive
                 use of increasingly more processors for a fixed-size
                 N-body problem is prone to saturate as soon as the
                 number of unknowns per core becomes too small. To
                 overcome this intrinsic strong-scaling limit, we
                 introduce temporal parallelism on top of pepc's
                 existing hybrid MPI/PThreads spatial decomposition.
                 Here, we use pfasst which is based on a combination of
                 the iterations of the parallel-in-time algorithm
                 parareal with the sweeps of spectral deferred
                 correction (SDC) schemes. By combining these sweeps
                 with multiple space-time discretization levels, pfasst
                 relaxes the theoretical bound on parallel efficiency in
                 parareal. We present results from runs on up to 262,144
                 cores on the IBM Blue Gene/P installation JUGENE,
                 demonstrating that the space-time parallel code
                 provides speedup beyond the saturation of the purely
                 space-parallel approach.",
  acknowledgement = ack-nhfb,
  articleno =    "92",

  author =       "Katsuki Fujisawa and Hitoshi Sato and Satoshi Matsuoka
                 and Toshio Endo and Makoto Yamashita and Maho Nakata",
  title =        "High-performance general solver for extremely
                 large-scale semidefinite programming problems",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "93:1--93:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Semidefinite programming (SDP) is one of the most
                 important problems among optimization problems at
                 present. It is relevant to a wide range of fields such
                 as combinatorial optimization, structural optimization,
                 control theory, economics, quantum chemistry, sensor
                 network location and data mining. The capability to
                 solve extremely large-scale SDP problems will have a
                 significant effect on the current and future
                 applications of SDP. In 1995, Fujisawa et al. started
                 the SDPA(Semidefinite programming algorithm) Project
                 aimed at solving large-scale SDP problems with high
                 numerical stability and accuracy. SDPA is one of the
                 main codes to solve general SDPs. SDPARA is a parallel
                 version of SDPA on multiple processors with distributed
                 memory, and it replaces two major bottleneck parts (the
                 generation of the Schur complement matrix and its
                 Cholesky factorization) of SDPA by their parallel
                 implementation. In particular, it has been successfully
                 applied to combinatorial optimization and truss
                 topology optimization. The new version of SDPARA
                 (7.5.0-G) on a large-scale supercomputer called TSUBAME
                 2.0 at the Tokyo Institute of Technology has
                 successfully been used to solve the largest SDP problem
                 (which has over 1.48 million constraints), and created
                 a new world record. Our implementation has also
                 achieved 533 TFlops in double precision for large-scale
                  Cholesky factorization using 2,720 CPUs and 4,080
                  GPUs.",
  acknowledgement = ack-nhfb,
  articleno =    "93",

  author =       "Rob F. {Van der Wijngaart} and Srinivas Sridharan and
                 Victor W. Lee",
  title =        "Extending the {BT NAS} parallel benchmark to exascale
                  computing",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "94:1--94:9",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "The NAS Parallel Benchmarks (NPB) are a well-known
                 suite of benchmarks that proxy scientific computing
                 applications. They specify several problem sizes that
                 represent how such applications may run on different
                 sizes of HPC systems. However, even the largest problem
                 (class F) is still far too small to exercise properly a
                 petascale supercomputer. Our work shows how one may
                 scale the Block Tridiagonal (BT) NPB from today's
                 published size to petascale and exascale computing
                 systems. In this paper we discuss the pros and cons of
                 various ways of scaling. We discuss how scaling BT
                 would impact computation, memory access, and
                 communications, and highlight the expected bottleneck,
                 which turns out to be not memory or communication
                 bandwidth, but network latency. Two complementary ways
                 are presented to overcome latency obstacles. We also
                 describe a practical method to gather approximate
                 performance data for BT at exascale on actual hardware,
                 without requiring an exascale system.",
  acknowledgement = ack-nhfb,
  articleno =    "94",

  author =       "Michael Frasca and Kamesh Madduri and Padma Raghavan",
  title =        "{NUMA}-aware graph mining techniques for performance
                 and energy efficiency",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "95:1--95:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "We investigate dynamic methods to improve the power
                 and performance profiles of large irregular
                 applications on modern multi-core systems. In this
                 context, we study a large sparse graph application,
                 Betweenness Centrality, and focus on memory behavior as
                 core count scales. We introduce new techniques to
                 efficiently map the computational demands onto
                 non-uniform memory architectures (NUMA). Our dynamic
                 design adapts to hardware topology and dramatically
                 improves both energy and performance. These gains are
                 more significant at higher core counts. We implement a
                 scheme for adaptive data layout, which reorganizes the
                 graph after observing parallel access patterns, and a
                 dynamic task scheduler that encourages shared data
                 between neighboring cores. We measure performance and
                 energy consumption on a modern multi-core machine and
                 observe that mean execution time is reduced by 51.2\%
                 and energy is reduced by 52.4\%.",
  acknowledgement = ack-nhfb,
  articleno =    "95",

  author =       "Samuel Williams and Dhiraj D. Kalamkar and Amik Singh
                 and Anand M. Deshpande and Brian {Van Straalen} and
                 Mikhail Smelyanskiy and Ann Almgren and Pradeep Dubey
                 and John Shalf and Leonid Oliker",
  title =        "Optimization of geometric multigrid for emerging
                 multi- and manycore processors",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "96:1--96:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Multigrid methods are widely used to accelerate the
                 convergence of iterative solvers for linear systems
                 used in a number of different application areas. In
                 this paper, we explore optimization techniques for
                 geometric multigrid on existing and emerging multicore
                 systems including the Opteron-based Cray XE6,
                 Intel\reg{} Xeon\reg{} E5-2670 and X5550
                 processor-based InfiniBand clusters, as well as the new
                  Intel\reg{} Xeon Phi\TM{} coprocessor (Knights Corner).
                 Our work examines a variety of novel techniques
                 including communication-aggregation, threaded
                 wavefront-based DRAM communication-avoiding, dynamic
                 threading decisions, SIMDization, and fusion of
                 operators. We quantify performance through each phase
                 of the V-cycle for both single-node and
                 distributed-memory experiments and provide detailed
                 analysis for each class of optimization. Results show
                 our optimizations yield significant speedups across a
                 variety of subdomain sizes while simultaneously
                 demonstrating the potential of multi- and manycore
                 processors to dramatically accelerate single-node
                 performance. However, our analysis also indicates that
                 improvements in networks and communication will be
                 essential to reap the potential of manycore processors
                 in large-scale multigrid calculations.",
  acknowledgement = ack-nhfb,
  articleno =    "96",

  author =       "Abhinav Bhatele and Todd Gamblin and Steven H. Langer
                 and Peer-Timo Bremer and Erik W. Draeger and Bernd
                 Hamann and Katherine E. Isaacs and Aaditya G. Landge
                 and Joshua A. Levine and Valerio Pascucci and Martin
                 Schulz and Charles H. Still",
  title =        "Mapping applications with collectives over
                 sub-communicators on torus networks",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "97:1--97:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "The placement of tasks in a parallel application on
                 specific nodes of a supercomputer can significantly
                 impact performance. Traditionally, this task mapping
                 has focused on reducing the distance between
                 communicating tasks on the physical network. This
                 minimizes the number of hops that point-to-point
                 messages travel and thus reduces link sharing between
                 messages and contention. However, for applications that
                 use collectives over sub-communicators, this heuristic
                 may not be optimal. Many collectives can benefit from
                 an increase in bandwidth even at the cost of an
                 increase in hop count, especially when sending large
                 messages. For example, placing communicating tasks in a
                 cube configuration rather than a plane or a line on a
                 torus network increases the number of possible paths
                 messages might take. This increases the available
                 bandwidth which can lead to significant performance
                 gains. We have developed Rubik, a tool that provides a
                 simple and intuitive interface to create a wide variety
                 of mappings for structured communication patterns.
                 Rubik supports a number of elementary operations such
                 as splits, tilts, or shifts, that can be combined into
                 a large number of unique patterns. Each operation can
                 be applied to disjoint groups of processes involved in
                 collectives to increase the effective bandwidth. We
                 demonstrate the use of Rubik for improving performance
                 of two parallel codes, pF3D and Qbox, which use
                 collectives over sub-communicators.",
  acknowledgement = ack-nhfb,
  articleno =    "97",

  author =       "Torsten Hoefler and Timo Schneider",
  title =        "Optimization principles for collective neighborhood
                  communications",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "98:1--98:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/supercomputing2012.bib",
  URL =          "",
  abstract =     "Many scientific applications operate in a
                 bulk-synchronous mode of iterative communication and
                 computation steps. Even though the communication steps
                 happen at the same logical time, important patterns
                 such as stencil computations cannot be expressed as
                 collective communications in MPI. We demonstrate how
                 neighborhood collective operations allow to specify
                 arbitrary collective communication relations during
                 run-time and enable optimizations similar to
                 traditional collective calls. We show a number of
                 optimization opportunities and algorithms for different
                 communication scenarios. We also show how users can
                 assert constraints that provide additional optimization
                 opportunities in a portable way. We demonstrate the
                 utility of all described optimizations in a highly
                 optimized implementation of neighborhood collective
                 operations. Our communication and protocol
                 optimizations result in a performance improvement of up
                 to a factor of two for small stencil communications. We
                 found that, for some patterns, our optimization
                 heuristics automatically generate communication
                 schedules that are comparable to hand-tuned
                 collectives. With those optimizations in place, we are
                 able to accelerate arbitrary collective communication
                 patterns, such as regular and irregular stencils with
                 optimization methods for collective communications. We
                 expect that our methods will influence the design of
                 future MPI libraries and provide a significant
                 performance benefit on large-scale systems.",
  acknowledgement = ack-nhfb,
  articleno =    "98",

  author =       "Zheng Cui and Lei Xia and Patrick G. Bridges and Peter
                 A. Dinda and John R. Lange",
  title =        "Optimizing overlay-based virtual networking through
                 optimistic interrupts and cut-through forwarding",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "99:1--99:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Overlay-based virtual networking provides a powerful
                 model for realizing virtual distributed and parallel
                 computing systems with strong isolation, portability,
                 and recoverability properties. However, in extremely
                 high throughput and low latency networks, such overlays
                 can suffer from bandwidth and latency limitations,
                 which is of particular concern if we want to apply the
                 model in HPC environments. Through careful study of an
                 existing very high performance overlay-based virtual
                 network system, we have identified two core issues
                 limiting performance: delayed and/or excessive virtual
                 interrupt delivery into guests, and copies between host
                 and guest data buffers done during encapsulation. We
                 respond with two novel optimizations: optimistic,
                 timer-free virtual interrupt injection, and zero-copy
                 cut-through data forwarding. These optimizations
                 improve the latency and bandwidth of the overlay
                 network on 10 Gbps interconnects, resulting in
                 near-native performance for a wide range of
                 microbenchmarks and MPI application benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "99",

  author =       "Evangelos Georganas and Jorge
                 Gonz{\'a}lez-Dom{\'\i}nguez and Edgar Solomonik and
                 Yili Zheng and Juan Touri{\~n}o and Katherine Yelick",
  title =        "Communication avoiding and overlapping for numerical
                 linear algebra",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "100:1--100:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "To efficiently scale dense linear algebra problems to
                 future exascale systems, communication cost must be
                 avoided or overlapped. Communication-avoiding 2.5D
                 algorithms improve scalability by reducing
                 inter-processor data transfer volume at the cost of
                 extra memory usage. Communication overlap attempts to
                 hide messaging latency by pipelining messages and
                 overlapping with computational work. We study the
                 interaction and compatibility of these two techniques
                 for two matrix multiplication algorithms (Cannon and
                 SUMMA), triangular solve, and Cholesky factorization.
                 For each algorithm, we construct a detailed performance
                 model that considers both critical path dependencies
                 and idle time. We give novel implementations of 2.5D
                 algorithms with overlap for each of these problems. Our
                 software employs UPC, a partitioned global address
                 space (PGAS) language that provides fast one-sided
                 communication. We show communication avoidance and
                 overlap provide a cumulative benefit as core counts
                 scale, including results using over 24K cores of a Cray
                 XE6 system.",
  acknowledgement = ack-nhfb,
  articleno =    "100",

  author =       "Benjamin Lipshitz and Grey Ballard and James Demmel
                 and Oded Schwartz",
  title =        "Communication-avoiding parallel {Strassen}:
                 implementation and performance",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "101:1--101:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Matrix multiplication is a fundamental kernel of many
                 high performance and scientific computing applications.
                 Most parallel implementations use classical O ( n$^3$ )
                 matrix multiplication, even though there exist
                 algorithms with lower arithmetic complexity. We
                 recently presented a new Communication-Avoiding
                 Parallel Strassen algorithm (CAPS), based on Strassen's
                 fast matrix multiplication, that minimizes
                 communication (SPAA '12). It communicates
                 asymptotically less than all classical and all previous
                 Strassen-based algorithms, and it attains theoretical
                 lower bounds. In this paper we show that CAPS is also
                 faster in practice. We benchmark and compare its
                 performance to previous algorithms on Hopper (Cray
                 XE6), Intrepid (IBM BG/P), and Franklin (Cray XT4). We
                 demonstrate significant speedups over previous
                 algorithms both for large matrices and for small
                 matrices on large numbers of processors. We model and
                 analyze the performance of CAPS and predict its
                 performance on future exascale platforms.",
  acknowledgement = ack-nhfb,
  articleno =    "101",

  author =       "Haim Avron and Anshul Gupta",
  title =        "Managing data-movement for effective shared-memory
                 parallelization of out-of-core sparse solvers",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "102:1--102:11",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Direct methods for solving sparse linear systems are
                 robust and typically exhibit good performance, but
                 often require large amounts of memory due to fill-in.
                 Many industrial applications use out-of-core techniques
                 to mitigate this problem. However, parallelizing sparse
                 out-of-core solvers poses some unique challenges
                 because accessing secondary storage introduces
                 serialization and I/O overhead. We analyze the
                 data-movement costs and memory versus parallelism
                 trade-offs in a shared-memory parallel out-of-core
                 linear solver for sparse symmetric systems. We propose
                 an algorithm that uses a novel memory management scheme
                 and adaptive task parallelism to reduce the
                 data-movement costs. We present experiments to show
                 that our solver is faster than existing out-of-core
                 sparse solvers on a single core, and is more scalable
                 than the only other known shared-memory parallel
                 out-of-core solver. This work is also directly
                 applicable at the node level in a distributed-memory
                 parallel scenario.",
  acknowledgement = ack-nhfb,
  articleno =    "102",

  author =       "Greg Faanes and Abdulla Bataineh and Duncan Roweth and
                 Tom Court and Edwin Froese and Bob Alverson and Tim
                 Johnson and Joe Kopnick and Mike Higgins and James
  title =        "{Cray Cascade}: a scalable {HPC} system based on a
                 {Dragonfly} network",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "103:1--103:9",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "Higher global bandwidth requirement for many
                 applications and lower network cost have motivated the
                 use of the Dragonfly network topology for high
                 performance computing systems. In this paper we present
                 the architecture of the Cray Cascade system, a
                 distributed memory system based on the Dragonfly [1]
                 network topology. We describe the structure of the
                 system, its Dragonfly network and the routing
                 algorithms. We describe a set of advanced features
                 supporting both mainstream high performance computing
                 applications and emerging global address space
                 programming models. We present a combination of
                 performance results from prototype systems and
                 simulation data for large systems. We demonstrate the
                 value of the Dragonfly topology and the benefits
                 obtained through extensive use of adaptive routing.",
  acknowledgement = ack-nhfb,
  articleno =    "103",

  author =       "Junichiro Makino and Hiroshi Daisaka",
  title =        "{GRAPE-8}: an accelerator for gravitational {$N$}-body
                 simulation with {20.5Gflops/W} performance",
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "104:1--104:10",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "In this paper, we describe the design and performance
                 of GRAPE-8 accelerator processor for gravitational N
                 -body simulations. It is designed to evaluate
                 gravitational interaction with cutoff between
                 particles. The cutoff function is useful for schemes
                 like TreePM or Particle-Particle Particle-Tree, in
                 which gravitational force is divided to short-range and
                 long-range components. A single GRAPE-8 processor chip
                 integrates 48 pipeline processors. The effective number
                 of floating-point operations per interaction is around
                 40. Thus the peak performance of a single GRAPE-8
                 processor chip is 480 Gflops. A GRAPE-8 processor card
                 houses two GRAPE-8 chips and one FPGA chip for
                 PCI-Express interface. The total power consumption of
                 the board is 46W. Thus, theoretical peak performance
                 per watt is 20.5 Gflops/W. The effective performance of
                 the total system, including the host computer, is
                 around 5Gflops/W. This is more than a factor of two
                 higher than the highest number in the current Green500
  acknowledgement = ack-nhfb,
  articleno =    "104",

  author =       "Greg Thorson and Michael Woodacre",
  title =        "{SGI UV2}: a fused computation and data analysis
  crossref =     "Hollingsworth:2012:SPI",
  pages =        "105:1--105:9",
  year =         "2012",
  bibdate =      "Thu Nov 15 07:38:35 MST 2012",
  bibsource =    ";
  URL =          "",
  abstract =     "UV2 is SGI's second generation data fusion system. UV2
                 was designed to meet the latest challenges facing users
                 in computation and data analysis. Its unique ability to
                 perform both functions on a single platform enables
                 efficient, easy to manage workflows. This platform has
                 a hybrid infrastructure, leveraging the latest
                 Intel\reg{} EP processors providing industry leading
                 computational power. Due to its high bandwidth,
                 extremely low latency NUMALink\reg{}6 (NL6)
                 interconnect, plus vectorized synchronization and data
                 movement, UV2 provides industry leading data intensive
                 capability. It supports a single operating system (OS)
                 image up to 64TB and 4K threads. Multiple OS images can
                 be deployed on a single NL6 fabric, which has a single
                 flat address space up to 8PB and 256K threads. These
                 capabilities allow for extreme performance on a broad
                 range of programming models and languages including
                 OpenMP[1], MPI, UPC[2], CAF[3] and SHMEM. The
                 architecture, implementation and performance of UV2 are
  acknowledgement = ack-nhfb,
  articleno =    "105",

%%% ====================================================================
%%% Cross-referenced entries must come last:
  editor =       "Jeffrey Hollingsworth",
  booktitle =    "{SC '12: Proceedings of the International Conference
                 on High Performance Computing, Networking, Storage and
                 Analysis, Salt Lake Convention Center, Salt Lake City,
                 UT, USA, November 10--16, 2012}",
  title =        "{SC '12: Proceedings of the International Conference
                 on High Performance Computing, Networking, Storage and
                 Analysis, Salt Lake Convention Center, Salt Lake City,
                 UT, USA, November 10--16, 2012}",
  publisher =    pub-IEEE,
  address =      pub-IEEE:adr,
  year =         "2012",
  ISBN =         "1-4673-0804-8",
  ISBN-13 =      "978-1-4673-0804-5",
  bibdate =      "Thu Nov 15 07:35:55 2012",
  bibsource =    ";
  acknowledgement = ack-nhfb,