Valid HTML 4.0! Valid CSS!
%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.31",
%%%     date            = "13 March 2024",
%%%     time            = "07:24:09 MST",
%%%     filename        = "topc.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "29359 9214 49201 463299",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "ACM Transactions on Parallel Computing
%%%                       (TOPC); bibliography; BibTeX",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE BibTeX bibliography for
%%%                        ACM Transactions on Parallel Computing (TOPC)
%%%                        (CODEN ????, ISSN 2329-4949 (print),
%%%                        2329-4957 (electronic)).  The journal appears
%%%                        quarterly, and publication began with volume
%%%                        1, number 1, in September 2014.
%%%
%%%                        At version 1.31, the COMPLETE journal
%%%                        coverage looked like this:
%%%
%%%                             2014 (   8)    2018 (  22)    2022 (  16)
%%%                             2015 (  29)    2019 (  29)    2023 (  22)
%%%                             2016 (  25)    2020 (  27)    2024 (   6)
%%%                             2017 (  16)    2021 (  23)
%%%
%%%                             Article:        223
%%%
%%%                             Total entries:  223
%%%
%%%                        The journal Web page can be found at:
%%%
%%%                            http://topc.acm.org/
%%%
%%%                        The journal table of contents page is at:
%%%
%%%                            http://dl.acm.org/pub.cfm?id=J1496
%%%                            http://dl.acm.org/citation.cfm?id=2632163
%%%
%%%                        Qualified subscribers can retrieve the full
%%%                        text of recent articles in PDF form.
%%%
%%%                        The initial draft was extracted from the ACM
%%%                        Web pages.
%%%
%%%                        ACM copyrights explicitly permit abstracting
%%%                        with credit, so article abstracts, keywords,
%%%                        and subject classifications have been
%%%                        included in this bibliography wherever
%%%                        available.  Article reviews have been
%%%                        omitted, until their copyright status has
%%%                        been clarified.
%%%
%%%                        bibsource keys in the bibliography entries
%%%                        below indicate the entry originally came
%%%                        from the computer science bibliography
%%%                        archive, even though it has likely since
%%%                        been corrected and updated.
%%%
%%%                        URL keys in the bibliography point to
%%%                        World Wide Web locations of additional
%%%                        information about the entry.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        In this bibliography, entries are sorted in
%%%                        publication order, using ``bibsort -byvolume.''
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
@Preamble{"\input bibnames.sty" #
    "\def \TM {${}^{\sc TM}$}"
}

%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
@String{j-TOPC                  = "ACM Transactions on Parallel Computing
                                  (TOPC)"}

%%% ====================================================================
%%% Bibliography entries:
@Article{Gibbons:2014:ATP,
  author =       "Phillip B. Gibbons",
  title =        "{ACM Transactions on Parallel Computing}: an
                 introduction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "1:1--1:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2661651",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Techniques",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Lilja:2014:I,
  author =       "David J. Lilja",
  title =        "Introduction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "2:1--2:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2609798",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Rane:2014:EPO,
  author =       "Ashay Rane and James Browne",
  title =        "Enhancing Performance Optimization of Multicore\slash
                 Multichip Nodes with Data Structure Metrics",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "3:1--3:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2588788",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Program performance optimization is usually based
                 solely on measurements of execution behavior of code
                 segments using hardware performance counters. However,
                 memory access patterns are critical performance
                 limiting factors for today's multicore chips where
                 performance is highly memory bound. Therefore diagnoses
                 and selection of optimizations based only on
                 measurements of the execution behavior of code segments
                 are incomplete because they do not incorporate
                 knowledge of memory access patterns and behaviors. This
                 article presents a low-overhead tool (MACPO) that
                 captures memory traces and computes metrics for the
                 memory access behavior of source-level (C, C++,
                 Fortran) data structures. MACPO explicitly targets the
                 measurement and metrics important to performance
                 optimization for multicore chips. The article also
                 presents a complete process for integrating measurement
                 and analyses of code execution with measurements and
                 analyses of memory access patterns and behaviors for
                 performance optimization, specifically targeting
                 multicore chips and multichip nodes of clusters. MACPO
                 uses more realistic cache models for computation of
                 latency metrics than those used by previous tools.
                 Evaluation of the effectiveness of adding memory access
                 behavior characteristics of data structures to
                 performance optimization was done on subsets of the
                 ASCI, NAS and Rodinia parallel benchmarks and two
                 versions of one application program from a domain not
                 represented in these benchmarks. Adding characteristics
                 of the behavior of data structures enabled easier
                 diagnoses of bottlenecks and more accurate selection of
                 appropriate optimizations than with only code centric
                 behavior measurements. The performance gains ranged
                 from a few percent to 38 percent.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jimenez:2014:APP,
  author =       "V{\'\i}ctor Jim{\'e}nez and Francisco J. Cazorla and
                 Roberto Gioiosa and Alper Buyuktosunoglu and Pradip
                 Bose and Francis P. O'Connell and Bruce G. Mealey",
  title =        "Adaptive Prefetching on {POWER7}: Improving
                 Performance and Power Consumption",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "4:1--4:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2588889",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Hardware data prefetch engines are integral parts of
                 many general purpose server-class microprocessors in
                 the field today. Some prefetch engines allow users to
                 change some of their parameters. But, the prefetcher is
                 usually enabled in a default configuration during
                 system bring-up, and dynamic reconfiguration of the
                 prefetch engine is not an autonomic feature of current
                 machines. Conceptually, however, it is easy to infer
                 that commonly used prefetch algorithms---when applied
                 in a fixed mode---will not help performance in many
                 cases. In fact, they may actually degrade performance
                 due to useless bus bandwidth consumption and cache
                 pollution, which in turn, will also waste power. We
                 present an adaptive prefetch scheme that dynamically
                 modifies the prefetch settings in order to adapt to
                 workloads' requirements. We use a commercial processor,
                 namely the IBM POWER7 as a vehicle for our study. First
                 we characterize---in terms of performance and power
                 consumption---the prefetcher in that processor using
                 microbenchmarks and SPEC CPU2006. We then present our
                 adaptive prefetch mechanism showing performance
                 improvements with respect to the default prefetch
                 setting up to 2.7X and 1.3X for single-threaded and
                 multiprogrammed workloads, respectively. Adaptive
                 prefetching is also able to reduce power consumption in
                 some cases. Finally, we also evaluate our mechanism
                 with SPECjbb2005, improving both performance and power
                 consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Heil:2014:APH,
  author =       "Timothy Heil and Anil Krishna and Nicholas Lindberg
                 and Farnaz Toussi and Steven Vanderwiel",
  title =        "Architecture and Performance of the Hardware
                 Accelerators in {IBM}'s {PowerEN} Processor",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "5:1--5:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2588888",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Computation at the edge of a datacenter has unique
                 characteristics. It deals with streaming data from
                 multiple sources, going to multiple destinations, often
                 requiring repeated application of one or more of
                 several standard algorithmic kernels. These kernels,
                 related to encryption, compression, XML Parsing and
                 regular expression searching on the data, demand a high
                 data processing rate and power efficiency. This
                 suggests the use of hardware acceleration for key
                 functions. However, robust general purpose processing
                 support is necessary to orchestrate the flow of data
                 between accelerators, as well as perform tasks that are
                 not suited to acceleration. Further, these accelerators
                 must be tightly integrated with the general purpose
                 computation in order to keep invocation overhead and
                 latency low. The accelerators must be easy for software
                 to use, and the system must be flexible enough to
                 support evolving networking standards. In this article,
                 we describe and evaluate the architecture of IBM's
                 PowerEN processor, with a focus on PowerEN's
                 architectural enhancements and its on-chip hardware
                 accelerators. PowerEN unites the throughput of
                 application-specific accelerators with the
                 programmability of general purpose cores on a single
                 coherent memory architecture. Hardware acceleration
                 improves throughput by orders of magnitude in some
                 cases compared to equivalent computation on the general
                 purpose cores. By offloading work to the accelerators,
                 general purpose cores are freed to simultaneously work
                 on computation less suited to acceleration.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Wu:2014:MAG,
  author =       "Xing Wu and Frank Mueller and Scott Pakin",
  title =        "A methodology for automatic generation of executable
                 communication specifications from parallel {MPI}
                 applications",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2660249",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Portable parallel benchmarks are widely used for
                 performance evaluation of HPC systems. However, because
                 these are manually produced, they generally represent a
                 greatly simplified view of application behavior,
                 missing the subtle but important-to-performance nuances
                 that may exist in a complete application. This work
                 contributes novel methods to automatically generate
                 highly portable and customizable communication
                 benchmarks from HPC applications. We utilize
                 ScalaTrace, a lossless yet scalable
                 parallel-application tracing framework to collect
                 selected aspects of the run-time behavior of HPC
                 applications, including communication operations and
                 computation time, while abstracting away the details of
                 the computation proper. We subsequently generate
                 benchmarks with nearly identical run-time behavior to
                 the original applications. Results demonstrate that the
                 generated benchmarks are in fact able to preserve the
                 run-time behavior (including both the communication
                 pattern and the execution time) of the original
                 applications. Such automated benchmark generation is
                 without precedent and particularly valuable for
                 proprietary, export-controlled, or classified
                 application codes.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ravishankar:2014:APC,
  author =       "Mahesh Ravishankar and John Eisenlohr and
                 Louis-No{\"e}l Pouchet and J. Ramanujam and Atanas
                 Rountev and P. Sadayappan",
  title =        "Automatic parallelization of a class of irregular
                 loops for distributed memory systems",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2660251",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Many scientific applications spend significant time
                 within loops that are parallel, except for dependences
                 from associative reduction operations. However these
                 loops often contain data-dependent control-flow and
                 array-access patterns. Traditional optimizations that
                 rely on purely static analysis fail to generate
                 parallel code in such cases. This article proposes an
                 approach for automatic parallelization for distributed
                 memory environments, using both static and runtime
                 analysis. We formalize the computations that are
                 targeted by this approach and develop algorithms to
                 detect such computations. We also describe algorithms
                 to generate a parallel inspector that performs a
                 runtime analysis of control-flow and array-access
                 patterns, and a parallel executor to take advantage of
                 this information. The effectiveness of the approach is
                 demonstrated on several benchmarks that were
                 automatically transformed using a prototype compiler.
                 For these, the inspector overheads and performance of
                 the executor code were measured. The benefit on
                 real-world applications was also demonstrated through
                 similar manual transformations of an atmospheric
                 modeling software.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Shun:2014:SPC,
  author =       "Julian Shun and Guy E. Blelloch",
  title =        "A simple parallel {Cartesian} tree algorithm and its
                 application to parallel suffix tree construction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "1",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2661653",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Oct 17 12:28:03 MDT 2014",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present a simple linear work and space, and
                 polylogarithmic time parallel algorithm for generating
                 multiway Cartesian trees. We show that bottom-up
                 traversals of the multiway Cartesian tree on the
                 interleaved suffix array and longest common prefix
                 array of a string can be used to answer certain string
                 queries. By adding downward pointers in the tree (e.g.
                 using a hash table), we can also generate suffix trees
                 from suffix arrays on arbitrary alphabets in the same
                 bounds. In conjunction with parallel suffix array
                 algorithms, such as the skew algorithm, this gives a
                 rather simple linear work parallel, $ O(n \epsilon) $
                 time $ (0 < \epsilon < 1) $, algorithm for generating
                 suffix trees over an integer alphabet $ \Sigma \{ 1,
                 \ldots {}, n \} $, where $n$ is the length of the input
                 string. It also gives a linear work parallel algorithm
                 requiring $ O(\log_2 n) $ time with high probability
                 for constant-sized alphabets. More generally, given a
                 sorted sequence of strings and the longest common
                 prefix lengths between adjacent elements, the algorithm
                 will generate a patricia tree (compacted trie) over the
                 strings. Of independent interest, we describe a
                 work-efficient parallel algorithm for solving the all
                 nearest smaller values problem using Cartesian trees,
                 which is much simpler than the work-efficient parallel
                 algorithm described in previous work. We also present
                 experimental results comparing the performance of the
                 algorithm to existing sequential implementations and a
                 second parallel algorithm that we implement. We present
                 comparisons for the Cartesian tree algorithm on its own
                 and for constructing a suffix tree. The results show
                 that on a variety of strings our algorithm is
                 competitive with the sequential version on a single
                 processor and achieves good speedup on multiple
                 processors. We present experiments for three
                 applications that require only the Cartesian tree, and
                 also for searching using the suffix tree.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Pingali:2015:ISI,
  author =       "Keshav Pingali and J. Ramanujam and P. Sadayappan",
  title =        "Introduction to the Special Issue on {PPoPP'12}",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2716343",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bouteiller:2015:ABF,
  author =       "Aurelien Bouteiller and Thomas Herault and George
                 Bosilca and Peng Du and Jack Dongarra",
  title =        "Algorithm-Based Fault Tolerance for Dense Matrix
                 Factorizations, Multiple Failures and Accuracy",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686892",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/bibnet/authors/d/dongarra-jack-j.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Dense matrix factorizations, such as LU, Cholesky and
                 QR, are widely used for scientific applications that
                 require solving systems of linear equations,
                 eigenvalues and linear least squares problems. Such
                 computations are normally carried out on
                 supercomputers, whose ever-growing scale induces a fast
                 decline of the Mean Time To Failure (MTTF). This
                 article proposes a new hybrid approach, based on
                 Algorithm-Based Fault Tolerance (ABFT), to help matrix
                 factorizations algorithms survive fail-stop failures.
                 We consider extreme conditions, such as the absence of
                 any reliable node and the possibility of losing both
                 data and checksum from a single failure. We will
                 present a generic solution for protecting the right
                 factor, where the updates are applied, of all above
                 mentioned factorizations. For the left factor, where
                 the panel has been applied, we propose a scalable
                 checkpointing algorithm. This algorithm features high
                 degree of checkpointing parallelism and cooperatively
                 utilizes the checksum storage leftover from the right
                 factor protection. The fault-tolerant algorithms
                 derived from this hybrid solution is applicable to a
                 wide range of dense matrix factorizations, with minor
                 modifications. Theoretical analysis shows that the
                 fault tolerance overhead decreases inversely to the
                 scaling in the number of computing units and the
                 problem size. Experimental results of LU and QR
                 factorization on the Kraken (Cray XT5) supercomputer
                 validate the theoretical evaluation and confirm
                 negligible overhead, with- and without-errors.
                 Applicability to tolerate multiple failures and
                 accuracy after multiple recovery is also considered.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ballard:2015:ACS,
  author =       "Grey Ballard and James Demmel and Nicholas Knight",
  title =        "Avoiding Communication in Successive Band Reduction",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686877",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The running time of an algorithm depends on both
                 arithmetic and communication (i.e., data movement)
                 costs, and the relative costs of communication are
                 growing over time. In this work, we present sequential
                 and distributed-memory parallel algorithms for
                 tridiagonalizing full symmetric and symmetric band
                 matrices that asymptotically reduce communication
                 compared to previous approaches. The tridiagonalization
                 of a symmetric band matrix is a key kernel in solving
                 the symmetric eigenvalue problem for both full and band
                 matrices. In order to preserve structure,
                 tridiagonalization routines use annihilate-and-chase
                 procedures that previously have suffered from poor data
                 locality and high parallel latency cost. We improve
                 both by reorganizing the computation and obtain
                 asymptotic improvements. We also propose new algorithms
                 for reducing a full symmetric matrix to band form in a
                 communication-efficient manner. In this article, we
                 consider the cases of computing eigenvalues only and of
                 computing eigenvalues and all eigenvectors.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sack:2015:CAM,
  author =       "Paul Sack and William Gropp",
  title =        "Collective Algorithms for Multiported Torus Networks",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686882",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Modern supercomputers with torus networks allow each
                 node to simultaneously pass messages on all of its
                 links. However, most collective algorithms are designed
                 to only use one link at a time. In this work, we
                 present novel multiported algorithms for the scatter,
                 gather, all-gather, and reduce-scatter operations. Our
                 algorithms can be combined to create multiported
                 reduce, all-reduce, and broadcast algorithms. Several
                 of these algorithms involve a new technique where we
                 relax the MPI message-ordering constraints to achieve
                 high performance and restore the correct ordering using
                 an additional stage of redundant communication.
                 According to our models, on an $n$-dimensional torus,
                 our algorithms should allow for nearly a $ 2 n$-fold
                 improvement in communication performance compared to
                 known, single-ported torus algorithms. In practice, we
                 have achieved nearly $ 6 \times $ better performance on
                 a 32k-node 3-dimensional torus.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dice:2015:LCG,
  author =       "David Dice and Virendra J. Marathe and Nir Shavit",
  title =        "Lock Cohorting: a General Technique for Designing
                 {NUMA} Locks",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686884",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Multicore machines are quickly shifting to NUMA and
                 CC-NUMA architectures, making scalable NUMA-aware
                 locking algorithms, ones that take into account the
                 machine's nonuniform memory and caching hierarchy, ever
                 more important. This article presents lock cohorting, a
                 general new technique for designing NUMA-aware locks
                 that is as simple as it is powerful. Lock cohorting
                 allows one to transform any spin-lock algorithm, with
                 minimal nonintrusive changes,into a scalable NUMA-aware
                 spin-lock. Our new cohorting technique allows us to
                 easily create NUMA-aware versions of the TATAS-Backoff,
                 CLH, MCS, and ticket locks, to name a few. Moreover, it
                 allows us to derive a CLH-based cohort abortable lock,
                 the first NUMA-aware queue lock to support
                 abortability. We empirically compared the performance
                 of cohort locks with prior NUMA-aware and classic
                 NUMA-oblivious locks on a synthetic micro-benchmark, a
                 real world key-value store application memcached, as
                 well as the libc memory allocator. Our results
                 demonstrate that cohort locks perform as well or better
                 than known locks when the load is low and significantly
                 out-perform them as the load increases.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Merrill:2015:HPS,
  author =       "Duane Merrill and Michael Garland and Andrew
                 Grimshaw",
  title =        "High-Performance and Scalable {GPU} Graph Traversal",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "14:1--14:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2717511",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Breadth-First Search (BFS) is a core primitive for
                 graph traversal and a basis for many higher-level graph
                 analysis algorithms. It is also representative of a
                 class of parallel computations whose memory accesses
                 and work distribution are both irregular and data
                 dependent. Recent work has demonstrated the
                 plausibility of GPU sparse graph traversal, but has
                 tended to focus on asymptotically inefficient
                 algorithms that perform poorly on graphs with
                 nontrivial diameter. We present a BFS parallelization
                 focused on fine-grained task management constructed
                 from efficient prefix sum computations that achieves an
                 asymptotically optimal O(|V| + |E|) gd work complexity.
                 Our implementation delivers excellent performance on
                 diverse graphs, achieving traversal rates in excess of
                 3.3 billion and 8.3 billion traversed edges per second
                 using single- and quad-GPU configurations,
                 respectively. This level of performance is several
                 times faster than state-of-the-art implementations on
                 both CPU and GPU platforms.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kramer:2015:SET,
  author =       "Stephan C. Kramer and Johannes Hagemann",
  title =        "{SciPAL}: Expression Templates and Composition Closure
                 Objects for High Performance Computational Physics with
                 {CUDA} and {OpenMP}",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "15:1--15:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2686886",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present SciPAL (scientific parallel algorithms
                 library), a C ++-based, hardware-independent
                 open-source library. Its core is a domain-specific
                 embedded language for numerical linear algebra. The
                 main fields of application are finite element
                 simulations, coherent optics and the solution of
                 inverse problems. Using SciPAL algorithms can be stated
                 in a mathematically intuitive way in terms of matrix
                 and vector operations. Existing algorithms can easily
                 be adapted to GPU-based computing by proper template
                 specialization. Our library is compatible with the
                 finite element library deal.II and provides a port of
                 deal.II's most frequently used linear algebra classes
                 to CUDA (NVidia's extension of the programming
                 languages C and C ++ for programming their GPUs).
                 SciPAL 's operator-based API for BLAS operations
                 particularly aims at simplifying the usage of NVidia's
                 CUBLAS. For non-BLAS array arithmetic SciPAL 's
                 expression templates are able to generate CUDA kernels
                 at compile time. We demonstrate the benefits of SciPAL
                 using the iterative principal component analysis as
                 example which is the core algorithm for the
                 spike-sorting problem in neuroscience.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Totoni:2015:PME,
  author =       "Ehsan Totoni and Nikhil Jain and Laxmikant V. Kale",
  title =        "Power Management of Extreme-Scale Networks with
                 On\slash Off Links in Runtime Systems",
  journal =      j-TOPC,
  volume =       "1",
  number =       "2",
  pages =        "16:1--16:??",
  month =        jan,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2687001",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Feb 18 16:46:00 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Networks are among major power consumers in
                 large-scale parallel systems. During execution of
                 common parallel applications, a sizeable fraction of
                 the links in the high-radix interconnects are either
                 never used or are underutilized. We propose a runtime
                 system based adaptive approach to turn off unused
                 links, which has various advantages over the previously
                 proposed hardware and compiler based approaches. We
                 discuss why the runtime system is the best system
                 component to accomplish this task, and test the
                 effectiveness of our approach using real applications
                 (including NAMD, MILC), and application benchmarks
                 (including NAS Parallel Benchmarks, Stencil). These
                 codes are simulated on representative topologies such
                 as 6-D Torus and multilevel directly connected network
                 (similar to IBM PERCS in Power 775 and Dragonfly in
                 Cray Aries). For common applications with near-neighbor
                 communication pattern, our approach can save up to 20\%
                 of total machine's power and energy, without any
                 performance penalty.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Herlihy:2015:GEI,
  author =       "Maurice Herlihy",
  title =        "{Guest Editor} Introduction",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "1:1--1:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2716306",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Degener:2015:LCS,
  author =       "Bastian Degener and Barbara Kempkes and Peter Kling
                 and Friedhelm {Meyer Auf Der Heide}",
  title =        "Linear and Competitive Strategies for Continuous Robot
                 Formation Problems",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "2:1--2:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742341",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study a scenario in which n mobile robots with a
                 limited viewing range are distributed in the Euclidean
                 plane and have to solve a formation problem. The
                 formation problems we consider are the Gathering
                 problem and the Chain-Formation problem. In the
                 Gathering problem, the robots have to gather in one
                 (not predefined) point, while in the Chain-Formation
                 problem they have to form a connected communication
                 chain of minimal length between two stationary base
                 stations. Each robot may base its decisions where to
                 move only on the current relative positions of
                 neighboring robots (that are within its viewing range);
                 that is, besides having a limited viewing range, the
                 robots are oblivious (they do not use information from
                 the past), have none or only very limited identities,
                 and they do not have a common sense of direction.
                 Variants of these problems (especially for the
                 Gathering problem) have been studied extensively in
                 different discrete time models. In contrast, our work
                 focuses on a continuous time model; that is, the robots
                 continuously sense the positions of other robots within
                 their viewing range and continuously adapt their speed
                 and direction according to some simple, local rules.
                 Hereby, we assume that the robots have a maximum
                 movement speed of one. We show that this idealized idea
                 of continuous sensing allows us to solve the mentioned
                 formation problems in linear time $ O(n) $ (which,
                 given the maximum speed of one, immediately yields a
                 maximum traveled distance of $ O(n)$). Note that in the
                 more classical discrete time models, the best known
                 strategies need at least $ \Theta (n^2)$ or even $
                 \Theta (n^2 \log n)$ timesteps to solve these problems.
                 For the Gathering problem, our analysis solves a
                 problem left open by Gordon et al. [2004], where the
                 authors could prove that gathering in a continuous
                 model is possible in finite time, but were not able to
                 give runtime bounds. Apart from these linear bounds, we
                 also provide runtime bounds for both formation problems
                 that relate the runtime of our strategies to the
                 runtime of an optimal, global algorithm. Specifically,
                 we show that our strategy for the Gathering problem is
                 log OPT-competitive and the strategy for the
                 Chain-Formation problem is $ \log n$ competitive. Here,
                 by $c$-competitive, we mean that our (local) strategy
                 is asymptotically by at most a factor of $c$ slower
                 than an optimal, global strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jain:2015:NOS,
  author =       "Navendu Jain and Ishai Menache and Joseph (Seffi) Naor
                 and Jonathan Yaniv",
  title =        "Near-Optimal Scheduling Mechanisms for
                 Deadline-Sensitive Jobs in Large Computing Clusters",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "3:1--3:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742343",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We consider a market-based resource allocation model
                 for batch jobs in cloud computing clusters. In our
                 model, we incorporate the importance of the due date of
                 a job rather than the number of servers allocated to it
                 at any given time. Each batch job is characterized by
                 the work volume of total computing units (e.g., CPU
                 hours) along with a bound on maximum degree of
                 parallelism. Users specify, along with these job
                 characteristics, their desired due date and a value for
                 finishing the job by its deadline. Given this
                 specification, the primary goal is to determine the
                 scheduling of cloud computing instances under capacity
                 constraints in order to maximize the social welfare
                 (i.e., sum of values gained by allocated users). Our
                 main result is a new (CC-kcss-1)-approximation
                 algorithm for this objective, where $C$ denotes cloud
                 capacity, $k$ is the maximal bound on parallelized
                 execution (in practical settings, $ k < C$) and $s$ is
                 the slackness on the job completion time, that is, the
                 minimal ratio between a specified deadline and the
                 earliest finish time of a job. Our algorithm is based
                 on utilizing dual fitting arguments over a strengthened
                 linear program to the problem. Based on the new
                 approximation algorithm, we construct truthful
                 allocation and pricing mechanisms, in which reporting
                 the true value and other properties of the job
                 (deadline, work volume, and the parallelism bound) is a
                 dominant strategy for all users. To that end, we extend
                 known results for single-value settings to provide a
                 general framework for transforming allocation
                 algorithms into truthful mechanisms in domains of
                 single-value and multi-properties. We then show that
                 the basic mechanism can be extended under proper
                 Bayesian assumptions to the objective of maximizing
                 revenues, which is important for public clouds. We
                 empirically evaluate the benefits of our approach
                 through simulations on data-center job traces, and show
                 that the revenues obtained under our mechanism are
                 comparable with an ideal fixed-price mechanism, which
                 sets an on-demand price using oracle knowledge of
                 users' valuations. Finally, we discuss how our model
                 can be extended to accommodate uncertainties in job
                 work volumes, which is a practical challenge in cloud
                 settings.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Feldman:2015:HCG,
  author =       "Moran Feldman and Liane Lewin-Eytan and Joseph (Seffi)
                 Naor",
  title =        "Hedonic Clustering Games",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "4:1--4:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742345",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Clustering, the partitioning of objects with respect
                 to a similarity measure, has been extensively studied
                 as a global optimization problem. We investigate
                 clustering from a game-theoretic approach, and consider
                 the class of hedonic clustering games. Here, a
                 self-organized clustering is obtained via decisions
                 made by independent players, corresponding to the
                 elements clustered. Being a hedonic setting, the
                 utility of each player is determined by the identity of
                 the other members of her cluster. This class of games
                 seems to be quite robust, as it fits with rather
                 different, yet commonly used, clustering criteria.
                 Specifically, we investigate hedonic clustering games
                 in two different models: fixed clustering, which
                 subdivides into $k$-median and $k$-center, and
                 correlation clustering. We provide a thorough analysis
                 of these games, characterizing Nash equilibria, and
                 proving upper and lower bounds on the price of anarchy
                 and price of stability. For fixed clustering we focus
                 on the existence of a Nash equilibrium, as it is a
                 rather nontrivial issue in this setting. We study it
                 both for general metrics and special cases, such as
                 line and tree metrics. In the correlation clustering
                 model, we study both minimization and maximization
                 variants, and provide almost tight bounds on both the
                 price of anarchy and price of stability.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jahn:2015:RRA,
  author =       "Janmartin Jahn and Santiago Pagani and Sebastian Kobbe
                 and Jian-Jia Chen and J{\"o}rg Henkel",
  title =        "Runtime Resource Allocation for Software Pipelines",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "5:1--5:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742347",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Efficiently allocating the computational resources of
                 many-core systems is one of the most prominent
                 challenges, especially when resource requirements may
                 vary unpredictably at runtime. This is even more
                 challenging when facing unreliable cores --- a scenario
                 that becomes common as the number of cores increases
                 and integration sizes shrink. To address this
                 challenge, this article presents an optimal method for
                 the allocation of the resources to software-pipelined
                 applications. Here we show how runtime observations of
                 the resource requirements of tasks can be used to adapt
                 resource allocations. Furthermore, we show how the
                 optimum can be traded for a high degree of scalability
                 by clustering applications in a distributed,
                 hierarchical manner. To diminish the negative effects
                 of unreliable cores, this article shows how
                 self-organization can effectively restore the integrity
                 of such a hierarchy when it is corrupted by a failing
                 core. Experiments on Intel's 48-core Single-Chip Cloud
                 Computer and in a many-core simulator show that a
                 significant improvement in system throughput can be
                 achieved over the current state of the art.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Xu:2015:SVC,
  author =       "Yi Xu and Bo Zhao and Youtao Zhang and Jun Yang",
  title =        "Simple Virtual Channel Allocation for High-Throughput
                 and High-Frequency On-Chip Routers",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "6:1--6:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742349",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Packet-switched network-on-chip (NoC) has provided a
                 scalable solution to the communications for tiled
                 multicore processors. However, the virtual channel (VC)
                 buffers in the NoC consume significant dynamic and
                 leakage power. To improve the energy efficiency of the
                 router design, it is advantageous to use small buffer
                 sizes while still maintaining throughput of the
                 network. This article proposes two new virtual channel
                 allocation (VA) mechanisms, termed fixed VC assignment
                 with dynamic VC allocation (FVADA) and adjustable VC
                 assignment with dynamic VC allocation (AVADA). VCs are
                 designated to output ports and allocated to packets
                 according to such assignment. This can help to reduce
                 the head-of-line blocking. Such VC-output port
                 assignment can also be adjusted dynamically to
                 accommodate traffic changes. Simulation results show
                 that both mechanisms can improve network throughput by
                 41\% on average. Real traffic evaluation shows a
                 network latency reduction of up to 66\%. In addition,
                 AVADA can outperform the baseline in throughput with
                 only half of the buffer size. Finally, we are able to
                 achieve comparable or better throughput than a previous
                 dynamic VC allocator while reducing its critical path
                 delay by 57\%. Hence, the proposed VA mechanisms are
                 suitable for low-power, high-throughput, and
                 high-frequency NoC designs.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Hammouda:2015:NTE,
  author =       "Adam Hammouda and Andrew R. Siegel and Stephen F.
                 Siegel",
  title =        "Noise-Tolerant Explicit Stencil Computations for
                 Nonuniform Process Execution Rates",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "7:1--7:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742351",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Next-generation HPC computing platforms are likely to
                 be characterized by significant, unpredictable
                 nonuniformities in execution time among compute nodes
                 and cores. The resulting load imbalances from this
                 nonuniformity are expected to arise from a variety of
                 sources-manufacturing discrepancies, dynamic power
                 management, runtime component failure, OS jitter,
                 software-mediated resiliency, and TLB/- cache
                 performance variations, for example. It is well
                 understood that existing algorithms with frequent
                 points of bulk synchronization will perform relatively
                 poorly in the presence of these sources of process
                 nonuniformity. Thus, recasting classic bulk synchronous
                 algorithms into more asynchronous, coarse-grained
                 parallelism is a critical area of research for
                 next-generation computing. We propose a class of
                 parallel algorithms for explicit stencil computations
                 that can tolerate these nonuniformities by decoupling
                 per process communication and computation in order for
                 each process to progress asynchronously while
                 maintaining solution correctness. These algorithms are
                 benchmarked with a $1$D domain decomposed (``slabbed'')
                 implementation of the $2$D heat equation as a model
                 problem, and are tested in the presence of simulated
                 nonuniform process execution rates. The resulting
                 performance is compared to a classic bulk synchronous
                 implementation of the model problem. Results show that
                 the runtime of this article's algorithm on a machine
                 with simulated process nonuniformities is 5--99\%
                 slower than the runtime of its classic counterpart on a
                 machine free of nonuniformities. However, when both
                 algorithms are run on a machine with comparable
                 synthetic process nonuniformities, this article's
                 algorithm is 1--37 times faster than its classic
                 counterpart.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{McCreesh:2015:SST,
  author =       "Ciaran McCreesh and Patrick Prosser",
  title =        "The Shape of the Search Tree for the Maximum Clique
                 Problem and the Implications for Parallel Branch and
                 Bound",
  journal =      j-TOPC,
  volume =       "2",
  number =       "1",
  pages =        "8:1--8:??",
  month =        may,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2742359",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu May 21 16:27:00 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Finding a maximum clique in a given graph is one of
                 the fundamental NP-hard problems. We compare two
                 multicore thread-parallel adaptations of a
                 state-of-the-art branch-and-bound algorithm for the
                 maximum clique problem and provide a novel explanation
                 as to why they are successful. We show that load
                 balance is sometimes a problem but that the interaction
                 of parallel search order and the most likely location
                 of solutions within the search space is often the
                 dominating consideration. We use this explanation to
                 propose a new low-overhead, scalable work-splitting
                 mechanism. Our approach uses explicit early diversity
                 to avoid strong commitment to the weakest heuristic
                 advice and late resplitting for balance. More
                 generally, we argue that, for branch-and-bound,
                 parallel algorithm design should not be performed
                 independently of the underlying sequential algorithm.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Hoefler:2015:RMA,
  author =       "Torsten Hoefler and James Dinan and Rajeev Thakur and
                 Brian Barrett and Pavan Balaji and William Gropp and
                 Keith Underwood",
  title =        "Remote Memory Access Programming in {MPI-3}",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2780584",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The Message Passing Interface (MPI) 3.0 standard,
                 introduced in September 2012, includes a significant
                 update to the one-sided communication interface, also
                 known as remote memory access (RMA). In particular, the
                 interface has been extended to better support popular
                 one-sided and global-address-space parallel programming
                 models to provide better access to hardware performance
                 features and enable new data-access modes. We present
                 the new RMA interface and specify formal axiomatic
                 models for data consistency and access semantics. Such
                 models can help users reason about details of the
                 semantics that are hard to extract from the English
                 prose in the standard. It also fosters the development
                 of tools and compilers, enabling them to automatically
                 analyze, optimize, and debug RMA programs.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Maldonado:2015:STB,
  author =       "Walther Maldonado and Patrick Marlier and Pascal
                 Felber and Julia Lawall and Gilles Muller and Etienne
                 Rivi{\`e}re",
  title =        "Supporting Time-Based {QoS} Requirements in Software
                 Transactional Memory",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2779621",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Software transactional memory (STM) is an optimistic
                 concurrency control mechanism that simplifies parallel
                 programming. However, there has been little interest in
                 its applicability to reactive applications in which
                 there is a required response time for certain
                 operations. We propose supporting such applications by
                 allowing programmers to associate time with atomic
                 blocks in the form of deadlines and quality-of-service
                 (QoS) requirements. Based on statistics of past
                 executions, we adjust the execution mode of
                 transactions by decreasing the level of optimism as the
                 deadline approaches. In the presence of concurrent
                 deadlines, we propose different conflict resolution
                 policies. Execution mode switching mechanisms allow the
                 meeting of multiple deadlines in a consistent manner,
                 with potential QoS degradations being split fairly
                 among several threads as contention increases, and
                 avoiding starvation. Our implementation consists of
                 extensions to an STM runtime that allow gathering
                 statistics and switching execution modes. We also
                 propose novel contention managers adapted to
                 transactional workloads subject to deadlines. The
                 experimental evaluation shows that our approaches
                 significantly improve the likelihood of a transaction
                 meeting its deadline and QoS requirement, even in cases
                 where progress is hampered by conflicts and other
                 concurrent transactions with deadlines.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kestor:2015:TPD,
  author =       "Gokcen Kestor and Osman S. Unsal and Adrian Cristal
                 and Serdar Tasiran",
  title =        "{TRADE}: Precise Dynamic Race Detection for Scalable
                 Transactional Memory Systems",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2786021",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As other multithreaded programs, transactional memory
                 (TM) programs are prone to race conditions. Previous
                 work focuses on extending existing definitions of data
                 race for lock-based applications to TM applications,
                 which requires all transactions to be totally ordered
                 ``as if'' serialized by a global lock. This approach
                 poses implementation constraints on the STM that
                 severely limits TM applications' performance. This
                 article shows that forcing total ordering among all
                 running transactions, while sufficient, is not
                 necessary. We introduce an alternative data race
                 definition, relaxed transactional data race, that
                 requires ordering of only conflicting transactions. The
                 advantages of our relaxed definition are twofold:
                 First, unlike the previous definition, this definition
                 can be applied to a wide range of TMs, including those
                 that do not enforce transaction total ordering. Second,
                 within a single execution, it exposes a higher number
                 of data races, which considerably reduces debugging
                 time. Based on this definition, we propose a novel and
                 precise race detection tool for C/C++ TM applications
                 (TRADE), which detects data races by tracking
                 happens-before edges among conflicting transactions.
                 Our experiments reveal that TRADE precisely detects
                 data races for STAMP applications running on modern
                 STMs with overhead comparable to state-of-the-art race
                 detectors for lock-based applications. Our experiments
                 also show that in a single run, TRADE identifies
                 several races not discovered by 10 separate runs of a
                 race detection tool based on the previous data race
                 definition.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Diegues:2015:TWE,
  author =       "Nuno Diegues and Paolo Romano",
  title =        "{Time-Warp}: Efficient Abort Reduction in
                 Transactional Memory",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2775435",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The multicore revolution that took place one decade
                 ago has turned parallel programming into a major
                 concern for the mainstream software development
                 industry. In this context, Transactional Memory (TM)
                 has emerged as a simpler and attractive alternative to
                 that of lock-based synchronization, whose complexity
                 and error-proneness are widely recognized. The notion
                 of permissiveness in TM translates to only aborting a
                 transaction when it cannot be accepted in any history
                 that guarantees a target correctness criterion. This
                 theoretically powerful property is often neglected by
                 state-of-the-art TMs because it imposes considerable
                 algorithmic costs. Instead, these TMs opt to maximize
                 their implementation's efficiency by aborting
                 transactions under overly conservative conditions. As a
                 result, they risk rejecting a significant number of
                 safe executions. In this article, we seek to identify a
                 sweet spot between permissiveness and efficiency by
                 introducing the Time-Warp Multiversion (TWM) algorithm.
                 TWM is based on the key idea of allowing an update
                 transaction that has performed stale reads (i.e.,
                 missed the writes of concurrently committed
                 transactions) to be serialized by ``committing it in
                 the past,'' which we call a time-warp commit. At its
                 core, TWM uses a novel, lightweight validation
                 mechanism with little computational overhead. TWM also
                 guarantees that read-only transactions can never be
                 aborted. Further, TWM guarantees Virtual World
                 Consistency, a safety property that is deemed as
                 particularly relevant in the context of TM. We
                 demonstrate the practicality of this approach through
                 an extensive experimental study: we compare TWM with
                 five other TMs, representative of typical alternative
                 design choices, and on a wide variety of benchmarks.
                 This study shows an average performance improvement
                 across all considered workloads and TMs of 65\% in high
                 concurrency scenarios, with gains extending up to $ 9
                 \times $ with the most favorable benchmarks. These
                 results are a consequence of TWM's ability to achieve
                 drastic reduction of aborts in scenarios of nonminimal
                 contention, while introducing little overhead
                 (approximately 10\%) in worst-case, synthetically
                 designed scenarios (i.e., no contention or contention
                 patterns that cannot be optimized using TWM).",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Eyraud-Dubois:2015:PST,
  author =       "Lionel Eyraud-Dubois and Loris Marchal and Oliver
                 Sinnen and Fr{\'e}d{\'e}ric Vivien",
  title =        "Parallel Scheduling of Task Trees with Limited
                 Memory",
  journal =      j-TOPC,
  volume =       "2",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jul,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2779052",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Aug 7 10:22:35 MDT 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "This article investigates the execution of tree-shaped
                 task graphs using multiple processors. Each edge of
                 such a tree represents some large data. A task can only
                 be executed if all input and output data fit into
                 memory, and a data can only be removed from memory
                 after the completion of the task that uses it as an
                 input data. Such trees arise in the multifrontal method
                 of sparse matrix factorization. The peak memory needed
                 for the processing of the entire tree depends on the
                 execution order of the tasks. With one processor, the
                 objective of the tree traversal is to minimize the
                 required memory. This problem was well studied, and
                 optimal polynomial algorithms were proposed. Here, we
                 extend the problem by considering multiple processors,
                 which is of obvious interest in the application area of
                 matrix factorization. With multiple processors comes
                 the additional objective to minimize the time needed to
                 traverse the tree-that is, to minimize the makespan.
                 Not surprisingly, this problem proves to be much harder
                 than the sequential one. We study the computational
                 complexity of this problem and provide
                 inapproximability results even for unit weight trees.
                 We design a series of practical heuristics achieving
                 different trade-offs between the minimization of peak
                 memory usage and makespan. Some of these heuristics are
                 able to process a tree while keeping the memory usage
                 under a given memory limit. The different heuristics
                 are evaluated in an extensive experimental evaluation
                 using realistic trees.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dinitz:2015:ISI,
  author =       "Michael Dinitz and Torsten Hoefler",
  title =        "Introduction to the Special Issue on {SPAA 2013}",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809923",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "14e",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kumar:2015:FGA,
  author =       "Ravi Kumar and Benjamin Moseley and Sergei
                 Vassilvitskii and Andrea Vattani",
  title =        "Fast Greedy Algorithms in {MapReduce} and Streaming",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809814",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Greedy algorithms are practitioners' best
                 friends---they are intuitive, are simple to implement,
                 and often lead to very good solutions. However,
                 implementing greedy algorithms in a distributed setting
                 is challenging since the greedy choice is inherently
                 sequential, and it is not clear how to take advantage
                 of the extra processing power. Our main result is a
                 powerful sampling technique that aids in
                 parallelization of sequential algorithms. Armed with
                 this primitive, we then adapt a broad class of greedy
                 algorithms to the MapReduce paradigm; this class
                 includes maximum cover and submodular maximization
                 subject to p -system constraint problems. Our method
                 yields efficient algorithms that run in a logarithmic
                 number of rounds while obtaining solutions that are
                 arbitrarily close to those produced by the standard
                 sequential greedy algorithm. We begin with algorithms
                 for modular maximization subject to a matroid
                 constraint and then extend this approach to obtain
                 approximation algorithms for submodular maximization
                 subject to knapsack or p -system constraints.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sanders:2015:WEM,
  author =       "Peter Sanders and Jochen Speck and Raoul Steffen",
  title =        "Work-Efficient Matrix Inversion in Polylogarithmic
                 Time",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809812",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present an algorithm for inversion of symmetric
                 positive definite matrices that combines the practical
                 requirement of an optimal number of arithmetic
                 operations and the theoretical goal of a
                 polylogarithmic critical path length. The algorithm
                 reduces inversion to matrix multiplication. It uses
                 Strassen's recursion scheme, but on the critical path
                 it breaks the recursion early, switching to an
                 asymptotically inefficient yet fast use of Newton's
                 method. We also show that the algorithm is numerically
                 stable. Overall, we get a candidate for a massively
                 parallel algorithm that scales to exascale systems even
                 on relatively small inputs.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Gilbert:2015:SBO,
  author =       "Seth Gilbert and Chaodong Zheng",
  title =        "{SybilCast}: Broadcast on the Open Airwaves",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809810",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Consider a scenario where many wireless users are
                 attempting to download data from a single base station.
                 While most of the users are honest, some users may be
                 malicious and attempt to obtain more than their fair
                 share of the bandwidth. One possible strategy for
                 attacking the system is to simulate multiple fake
                 identities, each of which is given its own equal share
                 of the bandwidth. Such an attack is often referred to
                 as a sybil attack. To counter such behavior, we propose
                 SybilCast, a protocol for multichannel wireless
                 networks that limits the number of fake identities and,
                 in doing so, ensures that each honest user gets at
                 least a constant fraction of his or her fair share of
                 the bandwidth. As a result, each honest user can
                 complete his or her data download in asymptotically
                 optimal time. A key aspect of this protocol is
                 balancing the rate at which new identities are admitted
                 and the maximum number of fake identities that can
                 coexist while keeping the overhead low. Besides sybil
                 attacks, our protocol can also tolerate spoofing and
                 jamming.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Lee:2015:FPP,
  author =       "I-Ting Angelina Lee and Charles E. Leiserson and Tao
                 B. Schardl and Zhunping Zhang and Jim Sukha",
  title =        "On-the-Fly Pipeline Parallelism",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809808",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Pipeline parallelism organizes a parallel program as a
                 linear sequence of stages. Each stage processes
                 elements of a data stream, passing each processed data
                 element to the next stage, and then taking on a new
                 element before the subsequent stages have necessarily
                 completed their processing. Pipeline parallelism is
                 used especially in streaming applications that perform
                 video, audio, and digital signal processing. Three out
                 of 13 benchmarks in PARSEC, a popular software
                 benchmark suite designed for shared-memory
                 multiprocessors, can be expressed as pipeline
                 parallelism. Whereas most concurrency platforms that
                 support pipeline parallelism use a
                 ``construct-and-run'' approach, this article
                 investigates ``on-the-fly'' pipeline parallelism, where
                 the structure of the pipeline emerges as the program
                 executes rather than being specified a priori.
                 On-the-fly pipeline parallelism allows the number of
                 stages to vary from iteration to iteration and
                 dependencies to be data dependent. We propose simple
                 linguistics for specifying on-the-fly pipeline
                 parallelism and describe a provably efficient
                 scheduling algorithm, the P iper algorithm, which
                 integrates pipeline parallelism into a work-stealing
                 scheduler, allowing pipeline and fork-join parallelism
                 to be arbitrarily nested. The Piper algorithm
                 automatically throttles the parallelism, precluding
                 ``runaway'' pipelines. Given a pipeline computation
                 with $ T_1 $ work and $ T_\infty $ span (critical-path
                 length), Piper executes the computation on $P$
                 processors in $ T_P \leq T_1 / P + O(T \infty + \lg P)$
                 expected time. Piper also limits stack space, ensuring
                 that it does not grow unboundedly with running time. We
                 have incorporated on-the-fly pipeline parallelism into
                 a Cilk-based work-stealing runtime system. Our
                 prototype Cilk-P implementation exploits optimizations
                 such as ``lazy enabling'' and ``dependency folding.''
                 We have ported the three PARSEC benchmarks that exhibit
                 pipeline parallelism to run on Cilk-P. One of these,
                 x264, cannot readily be executed by systems that
                 support only construct-and-run pipeline parallelism.
                 Benchmark results indicate that Cilk-P has low serial
                 overhead and good scalability. On x264, for example,
                 Cilk-P exhibits a speedup of 13.87 over its respective
                 serial counterpart when running on 16 processors.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Eikel:2015:IRI,
  author =       "Martina Eikel and Christian Scheideler",
  title =        "{IRIS}: a Robust Information System Against Insider
                 {DoS} Attacks",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809806",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this work, we present the first scalable
                 distributed information system, that is, a system with
                 low storage overhead, that is provably robust against
                 denial-of-service (DoS) attacks by a current insider.
                 We allow a current insider to have complete knowledge
                 about the information system and to have the power to
                 block any \varepsilon-fraction of its servers by a DoS
                 attack, where \varepsilon can be chosen up to a
                 constant. The task of the system is to serve any
                 collection of lookup requests with at most one per
                 nonblocked server in an efficient way despite this
                 attack. Previously, scalable solutions were only known
                 for DoS attacks of past insiders, where a past insider
                 only has complete knowledge about some past time point
                 $ t_0 $ of the information system. Scheideler et al.
                 [Awerbuch and Scheideler 2007; Baumgart et al. 2009]
                 showed that in this case, it is possible to design an
                 information system so that any information that was
                 inserted or last updated after $ t_0 $ is safe against
                 a DoS attack. But their constructions would not work at
                 all for a current insider. The key idea behind our IRIS
                 system is to make extensive use of coding. More
                 precisely, we present two alternative distributed
                 coding strategies with an at most logarithmic storage
                 overhead that can handle up to a constant fraction of
                 blocked servers.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kling:2015:PSM,
  author =       "Peter Kling and Peter Pietrzyk",
  title =        "Profitable Scheduling on Multiple Speed-Scalable
                 Processors",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "19:1--19:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2809872",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present a new online algorithm for profit-oriented
                 scheduling on multiple speed-scalable processors and
                 provide a tight analysis of the algorithm's
                 competitiveness. Our results generalize and improve
                 upon work by Chan et al. [2010], which considers a
                 single speed-scalable processor. Using significantly
                 different techniques, we can not only extend their
                 model to multiprocessors but also prove an enhanced and
                 tight competitive ratio for our algorithm. In our
                 scheduling problem, jobs arrive over time and are
                 preemptable. They have different workloads, values, and
                 deadlines. The scheduler may decide not to finish a job
                 but instead to suffer a loss equaling the job's value.
                 However, to process a job's workload until its deadline
                 the scheduler must invest a certain amount of energy.
                 The cost of a schedule is the sum of lost values and
                 invested energy. In order to finish a job, the
                 scheduler has to determine which processors to use and
                 set their speeds accordingly. A processor's energy
                 consumption is power $ P_\alpha (s) $ integrated over
                 time, where $ P_\alpha (s) = s^\alpha $ is the power
                 consumption when running at speed $s$. Since we
                 consider the online variant of the problem, the
                 scheduler has no knowledge about future jobs. This
                 problem was introduced by Chan et al. [2010] for the
                 case of a single processor. They presented an online
                 algorithm that is $ \alpha^\alpha + 2 e \alpha
                 $-competitive. We provide an online algorithm for the
                 case of multiple processors with an improved
                 competitive ratio of $ \alpha^\alpha $.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dutta:2015:CBR,
  author =       "Chinmoy Dutta and Gopal Pandurangan and Rajmohan
                 Rajaraman and Scott Roche",
  title =        "Coalescing-Branching Random Walks on Graphs",
  journal =      j-TOPC,
  volume =       "2",
  number =       "3",
  pages =        "20:1--20:??",
  month =        oct,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2817830",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Nov 3 07:30:42 MST 2015",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study a distributed randomized information
                 propagation mechanism in networks we call the
                 coalescing-branching random walk (cobra walk, for
                 short). A cobra walk is a generalization of the
                 well-studied ``standard'' random walk, and is useful in
                 modeling and understanding the Susceptible-Infected-
                 Susceptible (SIS)-type of epidemic processes in
                 networks. It can also be helpful in performing
                 light-weight information dissemination in
                 resource-constrained networks. A cobra walk is
                 parameterized by a branching factor $k$. The process
                 starts from an arbitrary vertex, which is labeled
                 active for step 1. In each step of a cobra walk, each
                 active vertex chooses $k$ random neighbors to become
                 active for the next step (``branching''). A vertex is
                 active for step $ t + 1$ only if it is chosen by an
                 active vertex in step $t$ (``coalescing''). This
                 results in a stochastic process in the underlying
                 network with properties that are quite different from
                 both the standard random walk (which is equivalent to
                 the cobra walk with branching factor 1) as well as
                 other gossip-based rumor spreading mechanisms. We focus
                 on the cover time of the cobra walk, which is the
                 number of steps for the walk to reach all the vertices,
                 and derive almost-tight bounds for various graph
                 classes. We show an $ O(\log^2 n)$ high probability
                 bound for the cover time of cobra walks on expanders,
                 if either the expansion factor or the branching factor
                 is sufficiently large; we also obtain an $ O(\log n)$
                 high probability bound for the partial cover time,
                 which is the number of steps needed for the walk to
                 reach at least a constant fraction of the vertices. We
                 also show that the cover time of the cobra walk is,
                 with high probability, $ O(n \log n)$ on any $n$-vertex
                 tree for $ k \geq 2$, $ {\~ O}(n^{1 / d})$ on a
                 $d$-dimensional grid for $ k \geq 2$, and $ O(\log n)$
                 on the complete graph.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Larus:2016:ISI,
  author =       "James Larus and Sandhya Dwarkadas and Jos{\'e} Moreira
                 and Andrew Lumsdaine",
  title =        "Introduction to the Special Issue on {PPoPP'14}",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2856513",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Herlihy:2016:WSF,
  author =       "Maurice Herlihy and Zhiyu Liu",
  title =        "Well-Structured Futures and Cache Locality",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "22:1--22:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858650",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In fork-join parallelism, a sequential program is
                 split into a directed acyclic graph of tasks linked by
                 directed dependency edges, and the tasks are executed,
                 possibly in parallel, in an order consistent with their
                 dependencies. A popular and effective way to extend
                 fork-join parallelism is to allow threads to create
                 futures. A thread creates a future to hold the results
                 of a computation, which may or may not be executed in
                 parallel. That result is returned when some thread
                 touches that future, blocking if necessary until the
                 result is ready. Recent research has shown that
                 although futures can, of course, enhance parallelism in
                 a structured way, they can have a deleterious effect on
                 cache locality. In the worst case, futures can incur $
                 \Omega (P T_\infty + t T_\infty) $ deviations, which
                 implies $ \Omega (C P T_\infty + C t T_\infty) $
                 additional cache misses, where $C$ is the number of
                 cache lines, $P$ is the number of processors, $t$ is
                 the number of touches, and $ T_\infty $ is the
                 computation span. Since cache locality has a large
                 impact on software performance on modern multicores,
                 this result is troubling. In this article, we show that
                 if futures are used in a simple, disciplined way, then
                 the situation is much better: if each future is touched
                 only once, either by the thread that created it or by a
                 later descendant of the thread that created it, then
                 parallel executions with work stealing can incur at
                 most $ O(C P T^2_\infty)$ additional cache misses-a
                 substantial improvement. This structured use of futures
                 is characteristic of many (but not all) parallel
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Thomson:2016:CTU,
  author =       "Paul Thomson and Alastair F. Donaldson and Adam
                 Betts",
  title =        "Concurrency Testing Using Controlled Schedulers: an
                 Empirical Study",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "23:1--23:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858651",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present an independent empirical study on
                 concurrency testing using controlled schedulers. We
                 have gathered 49 buggy concurrent software benchmarks,
                 drawn from public code bases, which we call SCTBench.
                 We applied a modified version of an existing
                 concurrency testing tool to SCTBench, testing five
                 controlled scheduling techniques: depth-first search,
                 preemption bounding, delay bounding, a controlled
                 random scheduler, and probabilistic concurrency testing
                 (PCT). We attempt to answer several research questions:
                 Which technique performs the best, in terms of bug
                 finding ability? How effective are the two main
                 schedule bounding techniques-preemption bounding and
                 delay bounding-at finding bugs? What challenges are
                 associated with applying concurrency testing techniques
                 to existing code? Can we classify certain benchmarks as
                 trivial or nontrivial? Overall, we found that PCT (with
                 parameter d = 3) was the most effective technique in
                 terms of bug finding; it found all bugs found by the
                 other techniques, plus an additional three, and it
                 missed only one bug. Surprisingly, we found that the
                 naive controlled random scheduler, which randomly
                 chooses one thread to execute at each scheduling point,
                 performs well, finding more bugs than preemption
                 bounding and just two fewer bugs than delay bounding.
                 Our findings confirm that delay bounding is superior to
                 preemption bounding and that schedule bounding is
                 superior to an unbounded depth-first search. The
                 majority of bugs in SCTBench can be exposed using a
                 small schedule bound (1--2), supporting previous
                 claims, although one benchmark requires five
                 preemptions. We found that the need to remove
                 nondeterminism and control all synchronization (as is
                 required for systematic concurrency testing) can be
                 nontrivial. There were eight distinct programs that
                 could not easily be included in out study, such as
                 those that perform network and interprocess
                 communication. We report various properties about the
                 benchmarks tested, such as the fact that the bugs in 18
                 benchmarks were exposed 50\% of the time when using
                 random scheduling. We note that future work should not
                 use the benchmarks that we classify as trivial when
                 presenting new techniques, other than as a minimum
                 baseline. We have made SCTBench and our tools publicly
                 available for reproducibility and use in future work.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Petrovic:2016:LHM,
  author =       "Darko Petrovi{\'c} and Thomas Ropars and Andr{\'e}
                 Schiper",
  title =        "Leveraging Hardware Message Passing for Efficient
                 Thread Synchronization",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "24:1--24:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858652",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As the level of parallelism in manycore processors
                 keeps increasing, providing efficient mechanisms for
                 thread synchronization in concurrent programs is
                 becoming a major concern. On cache-coherent
                 shared-memory processors, synchronization efficiency is
                 ultimately limited by the performance of the underlying
                 cache coherence protocol. This article studies how
                 hardware support for message passing can improve
                 synchronization performance. Considering the ubiquitous
                 problem of mutual exclusion, we devise novel algorithms
                 for (i) classic locking, where application threads
                 obtain exclusive access to a shared resource prior to
                 executing their critical sections (CSes), and (ii)
                 delegation, where CSes are executed by special threads.
                 For classic locking, our HybLock algorithm uses a mix
                 of shared memory and hardware message passing, which
                 introduces the idea of hybrid synchronization
                 algorithms. For delegation, we propose mp-server and
                 HybComb: the former is a straightforward adaptation of
                 the server approach to hardware message passing,
                 whereas the latter is a novel hybrid combining
                 algorithm. Evaluation on Tilera's TILE-Gx processor
                 shows that HybLock outperforms the best known classic
                 locks. Furthermore, mp-server can execute contended
                 CSes with unprecedented throughput, as stalls related
                 to cache coherence are removed from the critical path.
                 HybComb can achieve comparable performance while
                 avoiding the need to dedicate server cores.
                 Consequently, our queue and stack implementations,
                 based on the new synchronization algorithms, largely
                 outperform their most efficient shared-memory-only
                 counterparts.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Tardieu:2016:XAP,
  author =       "Olivier Tardieu and Benjamin Herta and David
                 Cunningham and David Grove and Prabhanjan Kambadur and
                 Vijay Saraswat and Avraham Shinnar and Mikio Takeuchi
                 and Mandana Vaziri and Wei Zhang",
  title =        "{X10} and {APGAS} at Petascale",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "25:1--25:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2894746",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "X10 is a high-performance, high-productivity
                 programming language aimed at large-scale distributed
                 and shared-memory parallel applications. It is based on
                 the Asynchronous Partitioned Global Address Space
                 (APGAS) programming model, supporting the same
                 fine-grained concurrency mechanisms within and across
                 shared-memory nodes. We demonstrate that X10 delivers
                 solid performance at petascale by running (weak
                 scaling) eight application kernels on an IBM Power--775
                 supercomputer utilizing up to 55,680 Power7 cores (for
                 1.7Pflop/s of theoretical peak performance). For the
                 four HPC Class 2 Challenge benchmarks, X10 achieves
                 41\% to 87\% of the system's potential at scale (as
                 measured by IBM's HPCC Class 1 optimized runs). We also
                 implement K-Means, Smith-Waterman, Betweenness
                 Centrality, and Unbalanced Tree Search (UTS) for
                 geometric trees. Our UTS implementation is the first to
                 scale to petaflop systems. We describe the advances in
                 distributed termination detection, distributed load
                 balancing, and use of high-performance interconnects
                 that enable X10 to scale out to tens of thousands of
                 cores. We discuss how this work is driving the
                 evolution of the X10 language, core class libraries,
                 and runtime systems.",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Maleki:2016:LRM,
  author =       "Saeed Maleki and Madanlal Musuvathi and Todd
                 Mytkowicz",
  title =        "Low-Rank Methods for Parallelizing Dynamic Programming
                 Algorithms",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "26:1--26:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2884065",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "This article proposes efficient parallel methods for
                 an important class of dynamic programming problems that
                 includes Viterbi, Needleman--Wunsch, Smith--Waterman,
                 and Longest Common Subsequence. In dynamic programming,
                 the subproblems that do not depend on each other, and
                 thus can be computed in parallel, form stages or
                 wavefronts. The methods presented in this article
                 provide additional parallelism allowing multiple stages
                 to be computed in parallel despite dependencies among
                 them. The correctness and the performance of the
                 algorithm relies on rank convergence properties of
                 matrix multiplication in the tropical semiring, formed
                 with plus as the multiplicative operation and max as
                 the additive operation. This article demonstrates the
                 efficiency of the parallel algorithm by showing
                 significant speedups on a variety of important dynamic
                 programming problems. In particular, the parallel
                 Viterbi decoder is up to $ 24 \times $ faster (with 64
                 processors) than a highly optimized commercial
                 baseline.",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Yuan:2016:FCN,
  author =       "Xin Yuan and Wickus Nienaber and Santosh Mahapatra",
  title =        "On Folded-{Clos} Networks with Deterministic
                 Single-Path Routing",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "27:1--27:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858654",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Folded-Clos networks, also known as fat-trees, have
                 been widely used as interconnects in large-scale
                 high-performance computing clusters. Although users
                 often treat such interconnects as replacements of
                 nonblocking crossbar switches that can carry out any
                 permutation communication without contention, the
                 networking capability of such interconnects without a
                 centralized controller in computer communication
                 environments is not well understood. In this article,
                 we investigate nonblocking two-level folded-Clos
                 networks with deterministic single-path routing, but no
                 centralized controller, and establish the nonblocking
                 condition. The results indicate that nonblocking
                 two-level folded-Clos networks without a centralized
                 controller are much more expensive to construct than
                 the traditional nonblocking networks in the
                 telecommunication environment. Practical two-level
                 folded-Clos based interconnects are blocking. For such
                 interconnects, we establish the lower bound for
                 worst-case contention for permutations with any
                 deterministic single-path routing scheme, show that
                 existing routing schemes perform poorly in terms of
                 worst-case contention for permutations, present a
                 routing scheme that achieves the theoretical optimal,
                 and empirically compare the performance of existing
                 schemes with the optimal routing scheme. The techniques
                 developed for two-level folded-Clos networks are
                 further extended for the general fat-trees of any
                 heights.",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{Sandes:2016:MMA,
  author =       "Edans F. De O. Sandes and Guillermo Miranda and Xavier
                 Martorell and Eduard Ayguade and George Teodoro and
                 Alba C. M. A. {De Melo}",
  title =        "{MASA}: a Multiplatform Architecture for Sequence
                 Aligners with Block Pruning",
  journal =      j-TOPC,
  volume =       "2",
  number =       "4",
  pages =        "28:1--28:??",
  month =        mar,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2858656",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 19 08:11:13 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Biological sequence alignment is a very popular
                 application in Bioinformatics, used routinely
                 worldwide. Many implementations of biological sequence
                 alignment algorithms have been proposed for multicores,
                 GPUs, FPGAs and CellBEs. These implementations are
                 platform-specific; porting them to other systems
                 requires considerable programming effort. This article
                 proposes and evaluates MASA, a flexible and
                 customizable software architecture that enables the
                 execution of biological sequence alignment applications
                 with three variants (local, global, and semiglobal) in
                 multiple hardware/software platforms with block
                 pruning, which is able to reduce significantly the
                 amount of data processed. To attain our flexibility
                 goals, we also propose a generic version of block
                 pruning and developed multiple parallelization
                 strategies as building blocks, including a new
                 asynchronous dataflow-based parallelization, which may
                 be combined to implement efficient aligners in
                 different platforms. We provide four MASA aligner
                 implementations for multicores (OmpSs and OpenMP), GPU
                 (CUDA), and Intel Phi (OpenMP), showing that MASA is
                 very flexible. The evaluation of our generic block
                 pruning strategy shows that it significantly
                 outperforms the previously proposed block pruning,
                 being able to prune up to 66.5\% of the cells when
                 using the new dataflow-based parallelization
                 strategy.",
  acknowledgement = ack-nhfb,
  articleno =    "28",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
  remark =       "Special Issue on PPoPP'14 conference.",
}

@Article{MeyeraufderHeide:2016:ISI,
  author =       "Friedhelm {Meyer auf der Heide} and Peter Sanders and
                 Nodari Sitchinava",
  title =        "Introduction to the Special Issue on {SPAA 2014}",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "1:1--1:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2936716",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kaler:2016:EDD,
  author =       "Tim Kaler and William Hasenplaugh and Tao B. Schardl
                 and Charles E. Leiserson",
  title =        "Executing Dynamic Data-Graph Computations
                 Deterministically Using Chromatic Scheduling",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "2:1--2:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2896850",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "A data-graph computation-popularized by such
                 programming systems as Galois, Pregel, GraphLab,
                 PowerGraph, and GraphChi-is an algorithm that performs
                 local updates on the vertices of a graph. During each
                 round of a data-graph computation, an update function
                 atomically modifies the data associated with a vertex
                 as a function of the vertex's prior data and that of
                 adjacent vertices. A dynamic data-graph computation
                 updates only an active subset of the vertices during a
                 round, and those updates determine the set of active
                 vertices for the next round. This article introduces
                 Prism, a chromatic-scheduling algorithm for executing
                 dynamic data-graph computations. Prism uses a vertex
                 coloring of the graph to coordinate updates performed
                 in a round, precluding the need for mutual-exclusion
                 locks or other nondeterministic data synchronization. A
                 multibag data structure is used by Prism to maintain a
                 dynamic set of active vertices as an unordered set
                 partitioned by color. We analyze Prism using work-span
                 analysis. Let $ G = (V, E) $ be a degree-$ \Delta $
                 graph colored with \chi colors, and suppose that $ Q
                 \subseteq V $ is the set of active vertices in a round.
                 Define $ {\rm size} (Q) = | Q | + \Sigma_{v \in Q} {\rm
                 deg}(v) $, which is proportional to the space required
                 to store the vertices of $Q$ using a sparse-graph
                 layout. We show that a $P$-processor execution of Prism
                 performs updates in $Q$ using $ O(\chi (l g (Q / \chi)
                 + l g \Delta)) + l g P$ span and $ \Theta (s i z e (Q)
                 + P)$ work. These theoretical guarantees are matched by
                 good empirical performance. To isolate the effect of
                 the scheduling algorithm on performance, we modified
                 GraphLab to incorporate Prism and studied seven
                 application benchmarks on a 12-core multicore machine.
                 Prism executes the benchmarks 1.2 to 2.1 times faster
                 than GraphLab's nondeterministic lock-based scheduler
                 while providing deterministic behavior. This article
                 also presents Prism-R, a variation of Prism that
                 executes dynamic data-graph computations
                 deterministically even when updates modify global
                 variables with associative operations. Prism-R
                 satisfies the same theoretical bounds as Prism, but its
                 implementation is more involved, incorporating a
                 multivector data structure to maintain a
                 deterministically ordered set of vertices partitioned
                 by color. Despite its additional complexity, Prism-R is
                 only marginally slower than Prism. On the seven
                 application benchmarks studied, Prism-R incurs a 7\%
                 geometric mean overhead relative to Prism.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Im:2016:CST,
  author =       "Sungjin Im and Benjamin Moseley and Kirk Pruhs and
                 Eric Torng",
  title =        "Competitively Scheduling Tasks with Intermediate
                 Parallelizability",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "4:1--4:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938378",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We introduce a scheduling algorithm Intermediate-SRPT,
                 and show that it is $ O (\log P)$-competitive with
                 respect to average flow time when scheduling jobs whose
                 parallelizability is intermediate between being fully
                 parallelizable and sequential. Here, the parameter P
                 denotes the ratio between the maximum job size to the
                 minimum. We also show a general matching lower bound on
                 the competitive ratio. Our analysis builds on an
                 interesting combination of potential function and local
                 competitiveness arguments.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bercea:2016:CMI,
  author =       "Ioana O. Bercea and Navin Goyal and David G. Harris
                 and Aravind Srinivasan",
  title =        "On Computing Maximal Independent Sets of Hypergraphs
                 in Parallel",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "5:1--5:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938436",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Whether or not the problem of finding maximal
                 independent sets (MIS) in hypergraphs is in (R)NC is
                 one of the fundamental problems in the theory of
                 parallel computing. Essentially, the challenge is to
                 design (randomized) algorithms in which the number of
                 processors used is polynomial and the (expected)
                 runtime is polylogarithmic in the size of the input.
                 Unlike the well-understood case of MIS in graphs, for
                 the hypergraph problem, our knowledge is quite limited
                 despite considerable work. It is known that the problem
                 is in RNC when the edges of the hypergraph have
                 constant size. For general hypergraphs with n vertices
                 and m edges, the fastest previously known algorithm
                 works in time $ O(\sqrt n) $ with $ \poly (m, n) $
                 processors. In this article, we give an EREW PRAM
                 randomized algorithm that works in time $ n^{o (1)} $
                 with $ O(n + m \log n) $ processors on general
                 hypergraphs satisfying $ m \leq n^{o (1) \log \log n /
                 \log \log \log n} $. We also give an EREW PRAM
                 deterministic algorithm that runs in time $ n^\epsilon
                 $ on a graph with $ m \leq n^{1 / \delta } $ edges, for
                 any constants $ \delta $, $ \epsilon $; the number of
                 processors is polynomial in $m$, $n$ for a fixed choice
                 of $ \delta $, $ \epsilon $. Our algorithms are based
                 on a sampling idea that reduces the dimension of the
                 hypergraph and employs the algorithm for constant
                 dimension hypergraphs as a subroutine.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bilo:2016:LBN,
  author =       "Davide Bil{\`o} and Luciano Gual{\`a} and Stefano
                 Leucci and Guido Proietti",
  title =        "Locality-Based Network Creation Games",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "6:1--6:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938426",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Network creation games have been extensively studied,
                 both by economists and computer scientists, due to
                 their versatility in modeling individual-based
                 community formation processes. These processes, in
                 turn, are the theoretical counterpart of several
                 economics, social, and computational applications on
                 the Internet. In their several variants, these games
                 model the tension of a player between the player's two
                 antagonistic goals: to be as close as possible to the
                 other players and to activate a cheapest possible set
                 of links. However, the generally adopted assumption is
                 that players have a common and complete information
                 about the ongoing network, which is quite unrealistic
                 in practice. In this article, we consider a more
                 compelling scenario in which players have only limited
                 information about the network in which they are
                 embedded. More precisely, we explore the game-theoretic
                 and computational implications of assuming that players
                 have a complete knowledge of the network structure only
                 up to a given radius k, which is one of the most
                 qualified local-knowledge models used in distributed
                 computing. In this respect, we define a suitable
                 equilibrium concept, and we provide a comprehensive set
                 of upper and lower bounds to the price of anarchy for
                 the entire range of values of k and for the two classic
                 variants of the game, namely, those in which a player's
                 cost-besides the activation cost of the owned
                 links-depends on the maximum/sum of all distances to
                 the other nodes in the network, respectively. These
                 bounds are assessed through an extensive set of
                 experiments.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Jiang:2016:PPA,
  author =       "Jiayang Jiang and Michael Mitzenmacher and Justin
                 Thaler",
  title =        "Parallel Peeling Algorithms",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "7:1--7:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938412",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The analysis of several algorithms and data structures
                 can be framed as a peeling process on a random
                 hypergraph: vertices with degree less than k are
                 removed until there are no vertices of degree less than
                 k left. The remaining hypergraph is known as the k
                 core. In this article, we analyze parallel peeling
                 processes, in which in each round, all vertices of
                 degree less than k are removed. It is known that, below
                 a specific edge-density threshold, the k -core is empty
                 with high probability. We show that, with high
                 probability, below this threshold, only $ 1 / \log ((k
                 - 1) (r - 1)) \log \log n + O (1) $ rounds of peeling
                 are needed to obtain the empty $k$-core for $ 4 b
                 r$-uniform hypergraphs. This bound is tight up to an
                 additive constant. Interestingly, we show that, above
                 this threshold, $ \Omega (\log n)$ rounds of peeling
                 are required to find the nonempty $k$-core. Since most
                 algorithms and data structures aim to peel to an empty
                 $k$-core, this asymmetry appears fortunate. We verify
                 the theoretical results both with simulation and with a
                 parallel implementation using graphics processing units
                 (GPUs). Our implementation provides insights into how
                 to structure parallel peeling algorithms for efficiency
                 in practice.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Simhadri:2016:EAS,
  author =       "Harsha Vardhan Simhadri and Guy E. Blelloch and Jeremy
                 T. Fineman and Phillip B. Gibbons and Aapo Kyrola",
  title =        "Experimental Analysis of Space-Bounded Schedulers",
  journal =      j-TOPC,
  volume =       "3",
  number =       "1",
  pages =        "8:1--8:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2938389",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:51 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The running time of nested parallel programs on
                 shared-memory machines depends in significant part on
                 how well the scheduler mapping the program to the
                 machine is optimized for the organization of caches and
                 processor cores on the machine. Recent work proposed
                 ``space-bounded schedulers'' for scheduling such
                 programs on the multilevel cache hierarchies of current
                 machines. The main benefit of this class of schedulers
                 is that they provably preserve locality of the program
                 at every level in the hierarchy, which can result in
                 fewer cache misses and better use of bandwidth than the
                 popular work-stealing scheduler. On the other hand,
                 compared to work stealing, space-bounded schedulers are
                 inferior at load balancing and may have greater
                 scheduling overheads, raising the question as to the
                 relative effectiveness of the two schedulers in
                 practice. In this article, we provide the first
                 experimental study aimed at addressing this question.
                 To facilitate this study, we built a flexible
                 experimental framework with separate interfaces for
                 programs and schedulers. This enables a head-to-head
                 comparison of the relative strengths of schedulers in
                 terms of running times and cache miss counts across a
                 range of benchmarks. (The framework is validated by
                 comparisons with the Intel{\reg} CilkTM Plus
                 work-stealing scheduler.) We present experimental
                 results on a 32-core Xeon{\reg} 7560 comparing work
                 stealing, hierarchy-minded work stealing, and two
                 variants of space-bounded schedulers on both
                 divide-and-conquer microbenchmarks and some popular
                 algorithmic kernels. Our results indicate that
                 space-bounded schedulers reduce the number of L3 cache
                 misses compared to work-stealing schedulers by 25\% to
                 65\% for most of the benchmarks, but incur up to 27\%
                 additional scheduler and load-imbalance overhead. Only
                 for memory-intensive benchmarks can the reduction in
                 cache misses overcome the added overhead, resulting in
                 up to a 25\% improvement in running time for synthetic
                 benchmarks and about 20\% improvement for algorithmic
                 kernels. We also quantify runtime improvements varying
                 the available bandwidth per core (the ``bandwidth
                 gap'') and show up to 50\% improvements in the running
                 times of kernels as this gap increases fourfold. As
                 part of our study, we generalize prior definitions of
                 space-bounded schedulers to allow for more practical
                 variants (while still preserving their guarantees) and
                 explore implementation tradeoffs.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sheikh:2016:SHJ,
  author =       "Hafiz Fahad Sheikh and Ishfaq Ahmad",
  title =        "Sixteen Heuristics for Joint Optimization of
                 Performance, Energy, and Temperature in Allocating
                 Tasks to Multi-Cores",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "9:1--9:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948973",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Three-way joint optimization of performance ( P ),
                 energy ( E ), and temperature ( T ) in scheduling
                 parallel tasks to multiple cores poses a challenge that
                 is staggering in its computational complexity. The goal
                 of the PET optimized scheduling ( PETOS ) problem is to
                 minimize three quantities: the completion time of a
                 task graph, the total energy consumption, and the peak
                 temperature of the system. Algorithms based on
                 conventional multi-objective optimization techniques
                 can be designed for solving the PETOS problem. But
                 their execution times are exceedingly high and hence
                 their applicability is restricted merely to problems of
                 modest size. Exacerbating the problem is the solution
                 space that is typically a Pareto front since no single
                 solution can be strictly best along all three
                 objectives. Thus, not only is the absolute quality of
                 the solutions important but ``the spread of the
                 solutions'' along each objective and the distribution
                 of solutions within the generated tradeoff front are
                 also desired. A natural alternative is to design
                 efficient heuristic algorithms that can generate good
                 solutions as well as good spreads --- note that most of
                 the prior work in energy-efficient task allocation is
                 predominantly single- or dual-objective oriented. Given
                 a directed acyclic graph (DAG) representing a parallel
                 program, a heuristic encompasses policies as to what
                 tasks should go to what cores and at what frequency
                 should that core operate. Various policies, such as
                 greedy, iterative, and probabilistic, can be employed.
                 However, the choice and usage of these policies can
                 influence a heuristic towards a particular objective
                 and can also profoundly impact its performance. This
                 article proposes 16 heuristics that utilize various
                 methods for task-to-core allocation and frequency
                 selection. This article also presents a methodical
                 classification scheme which not only categorizes the
                 proposed heuristics but can also accommodate additional
                 heuristics. Extensive simulation experiments compare
                 these algorithms while shedding light on their
                 strengths and tradeoffs.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Blanchard:2016:SMO,
  author =       "Jeffrey D. Blanchard and Erik Opavsky and Emircan
                 Uysaler",
  title =        "Selecting Multiple Order Statistics with a Graphics
                 Processing Unit",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "10:1--10:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948974",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Extracting a set of multiple order statistics from a
                 huge data set provides important information about the
                 distribution of the values in the full set of data.
                 This article introduces an algorithm,
                 bucketMultiSelect, for simultaneously selecting
                 multiple order statistics with a graphics processing
                 unit (GPU). Typically, when a large set of order
                 statistics is desired, the vector is sorted. When the
                 sorted version of the vector is not needed,
                 bucketMultiSelect significantly reduces computation
                 time by eliminating a large portion of the unnecessary
                 operations involved in sorting. For large vectors,
                 bucketMultiSelect returns thousands of order statistics
                 in less time than sorting the vector while typically
                 using less memory. For vectors containing $ 2^{28} $
                 values of type double, bucketMultiSelect selects the
                 101 percentile order statistics in less than 95ms and
                 is more than $ 8 \times $ faster than sorting the
                 vector with a GPU optimized merge sort.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bohme:2016:IRC,
  author =       "David B{\"o}hme and Markus Geimer and Lukas Arnold and
                 Felix Voigtlaender and Felix Wolf",
  title =        "Identifying the Root Causes of Wait States in
                 Large-Scale Parallel Applications",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "11:1--11:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2934661",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Driven by growing application requirements and
                 accelerated by current trends in microprocessor design,
                 the number of processor cores on modern supercomputers
                 is increasing from generation to generation. However,
                 load or communication imbalance prevents many codes
                 from taking advantage of the available parallelism, as
                 delays of single processes may spread wait states
                 across the entire machine. Moreover, when employing
                 complex point-to-point communication patterns, wait
                 states may propagate along far-reaching cause-effect
                 chains that are hard to track manually and that
                 complicate an assessment of the actual costs of an
                 imbalance. Building on earlier work by Meira, Jr., et
                 al., we present a scalable approach that identifies
                 program wait states and attributes their costs in terms
                 of resource waste to their original cause. By replaying
                 event traces in parallel both forward and backward, we
                 can identify the processes and call paths responsible
                 for the most severe imbalances, even for runs with
                 hundreds of thousands of processes.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dathathri:2016:CAL,
  author =       "Roshan Dathathri and Ravi Teja Mullapudi and Uday
                 Bondhugula",
  title =        "Compiling Affine Loop Nests for a Dynamic Scheduling
                 Runtime on Shared and Distributed Memory",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "12:1--12:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948975",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Current de-facto parallel programming models like
                 OpenMP and MPI make it difficult to extract task-level
                 dataflow parallelism as opposed to bulk-synchronous
                 parallelism. Task parallel approaches that use
                 point-to-point synchronization between dependent tasks
                 in conjunction with dynamic scheduling dataflow
                 runtimes are thus becoming attractive. Although good
                 performance can be extracted for both shared and
                 distributed memory using these approaches, there is
                 little compiler support for them. In this article, we
                 describe the design of compiler--runtime interaction to
                 automatically extract coarse-grained dataflow
                 parallelism in affine loop nests for both shared and
                 distributed-memory architectures. We use techniques
                 from the polyhedral compiler framework to extract tasks
                 and generate components of the runtime that are used to
                 dynamically schedule the generated tasks. The runtime
                 includes a distributed decentralized scheduler that
                 dynamically schedules tasks on a node. The schedulers
                 on different nodes cooperate with each other through
                 asynchronous point-to-point communication, and all of
                 this is achieved by code automatically generated by the
                 compiler. On a set of six representative affine loop
                 nest benchmarks, while running on 32 nodes with 8
                 threads each, our compiler-assisted runtime yields a
                 geometric mean speedup of $ 143.6 \times $ ($ 70.3
                 \times $ to $ 474.7 \times $) over the sequential
                 version and a geometric mean speedup of $ 1.64 \times $
                 ($ 1.04 \times $ to $ 2.42 \times $) over the
                 state-of-the-art automatic parallelization approach
                 that uses bulk synchronization. We also compare our
                 system with past work that addresses some of these
                 challenges on shared memory, and an emerging runtime
                 (Intel Concurrent Collections) that demands higher
                 programmer input and effort in parallelizing. To the
                 best of our knowledge, ours is also the first automatic
                 scheme that allows for dynamic scheduling of affine
                 loop nests on a cluster of multicores.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Benoit:2016:AGP,
  author =       "Anne Benoit and Aur{\'e}lien Cavelan and Yves Robert
                 and Hongyang Sun",
  title =        "Assessing General-Purpose Algorithms to Cope with
                 Fail-Stop and Silent Errors",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "13:1--13:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2897189",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this article, we combine the traditional
                 checkpointing and rollback recovery strategies with
                 verification mechanisms to cope with both fail-stop and
                 silent errors. The objective is to minimize makespan
                 and/or energy consumption. For divisible load
                 applications, we use first-order approximations to find
                 the optimal checkpointing period to minimize execution
                 time, with an additional verification mechanism to
                 detect silent errors before each checkpoint, hence
                 extending the classical formula by Young and Daly for
                 fail-stop errors only. We further extend the approach
                 to include intermediate verifications, and to consider
                 a bicriteria problem involving both time and energy
                 (linear combination of execution time and energy
                 consumption). Then, we focus on application workflows
                 whose dependence graph is a linear chain of tasks.
                 Here, we determine the optimal checkpointing and
                 verification locations, with or without intermediate
                 verifications, for the bicriteria problem. Rather than
                 using a single speed during the whole execution, we
                 further introduce a new execution scenario, which
                 allows for changing the execution speed via Dynamic
                 Voltage and Frequency Scaling (DVFS). In this latter
                 scenario, we determine the optimal checkpointing and
                 verification locations, as well as the optimal speed
                 pairs for each task segment between any two consecutive
                 checkpoints. Finally, we conduct an extensive set of
                 simulations to support the theoretical study, and to
                 assess the performance of each algorithm, showing that
                 the best overall performance is achieved under the most
                 flexible scenario using intermediate verifications and
                 different speeds.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Koutis:2016:SPD,
  author =       "Ioannis Koutis and Shen Chen Xu",
  title =        "Simple Parallel and Distributed Algorithms for
                 Spectral Graph Sparsification",
  journal =      j-TOPC,
  volume =       "3",
  number =       "2",
  pages =        "14:1--14:??",
  month =        aug,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2948062",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 23 15:24:52 MDT 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We describe simple algorithms for spectral graph
                 sparsification, based on iterative computations of
                 weighted spanners and sampling. Leveraging the
                 algorithms of Baswana and Sen for computing spanners,
                 we obtain the first distributed spectral sparsification
                 algorithm in the CONGEST model. We also obtain a
                 parallel algorithm with improved work and time
                 guarantees, as well as other natural distributed
                 implementations. Combining this algorithm with the
                 parallel framework of Peng and Spielman for solving
                 symmetric diagonally dominant linear systems, we get a
                 parallel solver that is significantly more efficient in
                 terms of the total work.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dorier:2016:DAP,
  author =       "Matthieu Dorier and Gabriel Antoniu and Franck
                 Cappello and Marc Snir and Robert Sisneros and Orcun
                 Yildiz and Shadi Ibrahim and Tom Peterka and Leigh
                 Orf",
  title =        "{Damaris}: Addressing Performance Variability in Data
                 Management for Post-Petascale Simulations",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2987371",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "With exascale computing on the horizon, reducing
                 performance variability in data management tasks
                 (storage, visualization, analysis, etc.) is becoming a
                 key challenge in sustaining high performance. This
                 variability significantly impacts the overall
                 application performance at scale and its predictability
                 over time. In this article, we present Damaris, a
                 system that leverages dedicated cores in multicore
                 nodes to offload data management tasks, including I/O,
                 data compression, scheduling of data movements, in situ
                 analysis, and visualization. We evaluate Damaris with
                 the CM1 atmospheric simulation and the Nek5000
                 computational fluid dynamic simulation on four
                 platforms, including NICS's Kraken and NCSA's Blue
                 Waters. Our results show that (1) Damaris fully hides
                 the I/O variability as well as all I/O-related costs,
                 thus making simulation performance predictable; (2) it
                 increases the sustained write throughput by a factor of
                 up to 15 compared with standard I/O approaches; (3) it
                 allows almost perfect scalability of the simulation up
                 to over 9,000 cores, as opposed to state-of-the-art
                 approaches that fail to scale; and (4) it enables a
                 seamless connection to the VisIt visualization software
                 to perform in situ analysis and visualization in a way
                 that impacts neither the performance of the simulation
                 nor its variability. In addition, we extended our
                 implementation of Damaris to also support the use of
                 dedicated nodes and conducted a thorough comparison of
                 the two approaches-dedicated cores and dedicated
                 nodes-for I/O tasks with the aforementioned
                 applications.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Gao:2016:AOM,
  author =       "Jiaquan Gao and Yu Wang and Jun Wang and Ronghua
                 Liang",
  title =        "Adaptive Optimization Modeling of Preconditioned
                 Conjugate Gradient on Multi-{GPUs}",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/2990849",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The preconditioned conjugate gradient (PCG) algorithm
                 is a well-known iterative method for solving sparse
                 linear systems in scientific computations.
                 GPU-accelerated PCG algorithms for large-sized problems
                 have attracted considerable attention recently.
                 However, on a specific multi-GPU platform, producing a
                 highly parallel PCG implementation for any large-sized
                 problem requires significant time because several
                 manual steps are involved in adjusting the related
                 parameters and selecting an appropriate storage format
                 for the matrix block that is assigned to each GPU. This
                 motivates us to propose adaptive optimization modeling
                 of PCG on multi-GPUs, which mainly involves the
                 following parts: (1) an optimization multi-GPU parallel
                 framework of PCG and (2) the profile-based optimization
                 modeling for each one of the main components of the PCG
                 algorithm, including vector operation, inner product,
                 and sparse matrix-vector multiplication (SpMV). Our
                 model does not construct a new storage format or kernel
                 but automatically and rapidly generates an optimal
                 parallel PCG algorithm for any problem on a specific
                 multi-GPU platform by integrating existing storage
                 formats and kernels. We take a vector operation kernel,
                 an inner-product kernel, and five popular SpMV kernels
                 for an example to present the idea of constructing the
                 model. Given that our model is general, independent of
                 the problems, and dependent on the resources of
                 devices, this model is constructed only once for each
                 type of GPU. The experiments validate the high
                 efficiency of our proposed model.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Creech:2016:TSS,
  author =       "Timothy Creech and Rajeev Barua",
  title =        "Transparently Space Sharing a Multicore Among Multiple
                 Processes",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3001910",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "As hardware becomes increasingly parallel and the
                 availability of scalable parallel software improves,
                 the problem of managing multiple multithreaded
                 applications (processes) becomes important. Malleable
                 processes, which can vary the number of threads used as
                 they run, enable sophisticated and flexible resource
                 management. Although many existing applications
                 parallelized for SMPs with parallel runtimes are in
                 fact already malleable, deployed runtime environments
                 provide no interface nor any strategy for intelligently
                 allocating hardware threads or even preventing
                 oversubscription. Prior research methods either depend
                 on profiling applications ahead of time to make good
                 decisions about allocations or do not account for
                 process efficiency at all, leading to poor performance.
                 None of these prior methods have been adapted widely in
                 practice. This article presents the Scheduling and
                 Allocation with Feedback (SCAF) system: a drop-in
                 runtime solution that supports existing malleable
                 applications in making intelligent allocation decisions
                 based on observed efficiency without any changes to
                 semantics, program modification, offline profiling, or
                 even recompilation. Our existing implementation can
                 control most unmodified OpenMP applications. Other
                 malleable threading libraries can also easily be
                 supported with small modifications without requiring
                 application modification or recompilation. In this
                 work, we present the SCAF daemon and a SCAF-aware port
                 of the GNU OpenMP runtime. We present a new technique
                 for estimating process efficiency purely at runtime
                 using available hardware counters and demonstrate its
                 effectiveness in aiding allocation decisions. We
                 evaluated SCAF using NAS NPB parallel benchmarks on
                 five commodity parallel platforms, enumerating
                 architectural features and their effects on our scheme.
                 We measured the benefit of SCAF in terms of sum of
                 speedups improvement (a common metric for
                 multiprogrammed environments) when running all
                 benchmark pairs concurrently compared to
                 equipartitioning-the best existing competing scheme in
                 the literature. We found that SCAF improves on
                 equipartitioning on four out of five machines, showing
                 a mean improvement factor in sum of speedups of 1.04 to
                 1.11x for benchmark pairs, depending on the machine,
                 and 1.09x on average. Since we are not aware of any
                 widely available tool for equipartitioning, we also
                 compare SCAF against multiprogramming using unmodified
                 OpenMP, which is the only environment available to end
                 users today. SCAF improves on the unmodified OpenMP
                 runtimes for all five machines, with a mean improvement
                 of 1.08 to 2.07x, depending on the machine, and 1.59x
                 on average.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ballard:2016:HPS,
  author =       "Grey Ballard and Alex Druinsky and Nicholas Knight and
                 Oded Schwartz",
  title =        "Hypergraph Partitioning for Sparse Matrix--Matrix
                 Multiplication",
  journal =      j-TOPC,
  volume =       "3",
  number =       "3",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3015144",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Dec 26 17:40:41 MST 2016",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We propose a fine-grained hypergraph model for sparse
                 matrix-matrix multiplication (SpGEMM), a key
                 computational kernel in scientific computing and data
                 analysis whose performance is often communication
                 bound. This model correctly describes both the
                 interprocessor communication volume along a critical
                 path in a parallel computation and also the volume of
                 data moving through the memory hierarchy in a
                 sequential computation. We show that identifying a
                 communication-optimal algorithm for particular input
                 matrices is equivalent to solving a hypergraph
                 partitioning problem. Our approach is nonzero structure
                 dependent, meaning that we seek the best algorithm for
                 the given input matrices. In addition to our
                 three-dimensional fine-grained model, we also propose
                 coarse-grained one-dimensional and two-dimensional
                 models that correspond to simpler SpGEMM algorithms. We
                 explore the relations between our models theoretically,
                 and we study their performance experimentally in the
                 context of three applications that use SpGEMM as a key
                 computation. For each application, we find that at
                 least one coarse-grained model is as communication
                 efficient as the fine-grained model. We also observe
                 that different applications have affinities for
                 different algorithms. Our results demonstrate that
                 hypergraphs are an accurate model for reasoning about
                 the communication costs of SpGEMM as well as a
                 practical tool for exploring the SpGEMM algorithm
                 design space.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Grove:2017:ISS,
  author =       "David Grove",
  title =        "Introduction to the Special Section on {PPoPP'15}",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "19:1--19:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040224",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Majo:2017:LPC,
  author =       "Zoltan Majo and Thomas R. Gross",
  title =        "A Library for Portable and Composable Data Locality
                 Optimizations for {NUMA} Systems",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "20:1--20:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040222",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Many recent multiprocessor systems are realized with a
                 nonuniform memory architecture (NUMA) and accesses to
                 remote memory locations take more time than local
                 memory accesses. Optimizing NUMA memory system
                 performance is difficult and costly for three principal
                 reasons: (1) Today's programming languages/libraries
                 have no explicit support for NUMA systems, (2) NUMA
                 optimizations are not portable, and (3) optimizations
                 are not composable (i.e., they can become ineffective
                 or worsen performance in environments that support
                 composable parallel software). This article presents
                 TBB-NUMA, a parallel programming library based on Intel
                 Threading Building Blocks (TBB) that supports portable
                 and composable NUMA-aware programming. TBB-NUMA
                 provides a model of task affinity that captures a
                 programmer's insights on mapping tasks to resources.
                 NUMA-awareness affects all layers of the library (i.e.,
                 resource management, task scheduling, and high-level
                 parallel algorithm templates) and requires close
                 coupling between all these layers. Optimizations
                 implemented with TBB-NUMA (for a set of standard
                 benchmark programs) result in up to 44\% performance
                 improvement over standard TBB. But more important,
                 optimized programs are portable across different NUMA
                 architectures and preserve data locality also when
                 composed with other parallel computations sharing the
                 same resource management layer.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Golan-Gueta:2017:ASA,
  author =       "Guy Golan-Gueta and G. Ramalingam and Mooly Sagiv and
                 Eran Yahav",
  title =        "Automatic Scalable Atomicity via Semantic Locking",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "21:1--21:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040223",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this article, we consider concurrent programs in
                 which the shared state consists of instances of
                 linearizable abstract data types (ADTs). We present an
                 automated approach to concurrency control that
                 addresses a common need: the need to atomically execute
                 a code fragment, which may contain multiple ADT
                 operations on multiple ADT instances. We present a
                 synthesis algorithm that automatically enforces
                 atomicity of given code fragments (in a client program)
                 by inserting pessimistic synchronization that
                 guarantees atomicity and deadlock-freedom (without
                 using any rollback mechanism). Our algorithm takes a
                 commutativity specification as an extra input. This
                 specification indicates for every pair of ADT
                 operations the conditions under which the operations
                 commute. Our algorithm enables greater parallelism by
                 permitting commuting operations to execute
                 concurrently. We have implemented the synthesis
                 algorithm in a Java compiler and applied it to several
                 Java programs. Our results show that our approach
                 produces efficient and scalable synchronization.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Izraelevitz:2017:GSN,
  author =       "Joseph Izraelevitz and Michael L. Scott",
  title =        "Generality and Speed in Nonblocking Dual Containers",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "22:1--22:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040220",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Nonblocking dual data structures extend traditional
                 notions of nonblocking progress to accommodate partial
                 methods, both by bounding the number of steps that a
                 thread can execute after its preconditions have been
                 satisfied and by ensuring that a waiting thread
                 performs no remote memory accesses that could interfere
                 with the execution of other threads. A nonblocking dual
                 container, in particular, is designed to hold either
                 data or requests. An insert operation either adds data
                 to the container or removes and satisfies a request; a
                 remove operation either takes data out of the container
                 or inserts a request. We present the first
                 general-purpose construction for nonblocking dual
                 containers, allowing any nonblocking container for data
                 to be paired with almost any nonblocking container for
                 requests. We also present new custom algorithms, based
                 on the LCRQ of Morrison and Afek, that outperform the
                 fastest previously known dual containers by factors of
                 four to six.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Cole:2017:ROS,
  author =       "Richard Cole and Vijaya Ramachandran",
  title =        "Resource Oblivious Sorting on Multicores",
  journal =      j-TOPC,
  volume =       "3",
  number =       "4",
  pages =        "23:1--23:??",
  month =        mar,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3040221",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sat Mar 25 07:55:06 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present a deterministic sorting algorithm, Sample,
                 Partition, and Merge Sort (SPMS), that interleaves the
                 partitioning of a sample sort with merging.
                 Sequentially, it sorts $n$ elements in $ O(n \log n)$
                 time cache-obliviously with an optimal number of cache
                 misses. The parallel complexity (or critical path
                 length) of the algorithm is $ O(\log n \log \log n)$,
                 which improves on previous bounds for deterministic
                 sample sort. The algorithm also has low false sharing
                 costs. When scheduled by a work-stealing scheduler in a
                 multicore computing environment with a global shared
                 memory and p cores, each having a cache of size $M$
                 organized in blocks of size $B$, the costs of the
                 additional cache misses and false sharing misses due to
                 this parallel execution are bounded by the cost of $
                 O(S \cdot M / B)$ and $ O(S \cdot B)$ cache misses,
                 respectively, where $S$ is the number of steals
                 performed during the execution. Finally, SPMS is
                 resource oblivious in that the dependence on machine
                 parameters appear only in the analysis of its
                 performance and not within the algorithm itself.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ballard:2017:GEIa,
  author =       "Grey Ballard and Mary Hall and Tim Harris and Brandon
                 Lucia",
  title =        "{Guest Editor} Introduction {PPoPP 2016}, Special
                 Issue 2 of 2",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "1:1--1:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108141",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ashkiani:2017:GME,
  author =       "Saman Ashkiani and Andrew Davidson and Ulrich Meyer
                 and John D. Owens",
  title =        "{GPU Multisplit}: an Extended Study of a Parallel
                 Algorithm",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "2:1--2:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108139",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Multisplit is a broadly useful parallel primitive that
                 permutes its input data into contiguous buckets or
                 bins, where the function that categorizes an element
                 into a bucket is provided by the programmer. Due to the
                 lack of an efficient multisplit on Graphics Processing
                 Units (GPUs), programmers often choose to implement
                 multisplit with a sort. One way is to first generate an
                 auxiliary array of bucket IDs and then sort input data
                 based on it. In case smaller indexed buckets possess
                 smaller valued keys, another way for multisplit is to
                 directly sort input data. Both methods are inefficient
                 and require more work than necessary: The former
                 requires more expensive data movements while the latter
                 spends unnecessary effort in sorting elements within
                 each bucket. In this work, we provide a parallel model
                 and multiple implementations for the multisplit
                 problem. Our principal focus is multisplit for a small
                 (up to 256) number of buckets. We use warp-synchronous
                 programming models and emphasize warpwide
                 communications to avoid branch divergence and reduce
                 memory usage. We also hierarchically reorder input
                 elements to achieve better coalescing of global memory
                 accesses. On a GeForce GTX 1080 GPU, we can reach a
                 peak throughput of 18.93Gkeys/s (or 11.68Gpairs/s) for
                 a key-only (or key-value) multisplit. Finally, we
                 demonstrate how multisplit can be used as a building
                 block for radix sort. In our multisplit-based sort
                 implementation, we achieve comparable performance to
                 the fastest GPU sort routines, sorting 32-bit keys (and
                 key-value pairs) with a throughput of 3.0Gkeys/s (and
                 2.1Gpair/s).",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Wang:2017:GGG,
  author =       "Yangzihao Wang and Yuechao Pan and Andrew Davidson and
                 Yuduo Wu and Carl Yang and Leyuan Wang and Muhammad
                 Osama and Chenshan Yuan and Weitang Liu and Andy T.
                 Riffel and John D. Owens",
  title =        "{Gunrock}: {GPU} Graph Analytics",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "3:1--3:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108140",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "For large-scale graph analytics on the GPU, the
                 irregularity of data access and control flow, and the
                 complexity of programming GPUs, have presented two
                 significant challenges to developing a programmable
                 high-performance graph library. ``Gunrock,'' our
                 graph-processing system designed specifically for the
                 GPU, uses a high-level, bulk-synchronous, data-centric
                 abstraction focused on operations on a vertex or edge
                 frontier. Gunrock achieves a balance between
                 performance and expressiveness by coupling
                 high-performance GPU computing primitives and
                 optimization strategies with a high-level programming
                 model that allows programmers to quickly develop new
                 graph primitives with small code size and minimal GPU
                 programming knowledge. We characterize the performance
                 of various optimization strategies and evaluate
                 Gunrock's overall performance on different GPU
                 architectures on a wide range of graph primitives that
                 span from traversal-based algorithms and ranking
                 algorithms, to triangle counting and
                 bipartite-graph-based algorithms. The results show that
                 on a single GPU, Gunrock has on average at least an
                 order of magnitude speedup over Boost and PowerGraph,
                 comparable performance to the fastest GPU hardwired
                 primitives and CPU shared-memory graph libraries, such
                 as Ligra and Galois, and better performance than any
                 other GPU high-level graph library.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Chowdhury:2017:AAD,
  author =       "Rezaul Chowdhury and Pramod Ganapathi and Stephen
                 Tschudi and Jesmin Jahan Tithi and Charles Bachmeier
                 and Charles E. Leiserson and Armando Solar-Lezama and
                 Bradley C. Kuszmaul and Yuan Tang",
  title =        "{Autogen}: Automatic Discovery of Efficient Recursive
                 Divide-\&-Conquer Algorithms for Solving Dynamic
                 Programming Problems",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "4:1--4:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3125632",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present Autogen --- an algorithm that for a wide
                 class of dynamic programming (DP) problems
                 automatically discovers highly efficient
                 cache-oblivious parallel recursive divide-and-conquer
                 algorithms from inefficient iterative descriptions of
                 DP recurrences. Autogen analyzes the set of DP table
                 locations accessed by the iterative algorithm when run
                 on a DP table of small size and automatically
                 identifies a recursive access pattern and a
                 corresponding provably correct recursive algorithm for
                 solving the DP recurrence. We use Autogen to
                 autodiscover efficient algorithms for several
                 well-known problems. Our experimental results show that
                 several autodiscovered algorithms significantly
                 outperform parallel looping and tiled loop-based
                 algorithms. Also, these algorithms are less sensitive
                 to fluctuations of memory and bandwidth compared with
                 their looping counterparts, and their running times and
                 energy profiles remain relatively more stable. To the
                 best of our knowledge, Autogen is the first algorithm
                 that can automatically discover new nontrivial
                 divide-and-conquer algorithms.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Steele:2017:AAC,
  author =       "Guy L. {Steele Jr.} and Jean-Baptiste Tristan",
  title =        "Adding Approximate Counters",
  journal =      j-TOPC,
  volume =       "4",
  number =       "1",
  pages =        "5:1--5:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132167",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We describe a general framework for adding the values
                 of two approximate counters to produce a new
                 approximate counter value whose expected estimated
                 value is equal to the sum of the expected estimated
                 values of the given approximate counters. (To the best
                 of our knowledge, this is the first published
                 description of any algorithm for adding two approximate
                 counters.) We then work out implementation details for
                 five different kinds of approximate counter and provide
                 optimized pseudocode. For three of them, we present
                 proofs that the variance of a counter value produced by
                 adding two counter values in this way is bounded, and
                 in fact is no worse, or not much worse, than the
                 variance of the value of a single counter to which the
                 same total number of increment operations have been
                 applied. Addition of approximate counters is useful in
                 massively parallel divide-and-conquer algorithms that
                 use a distributed representation for large arrays of
                 counters. We describe two machine-learning algorithms
                 for topic modeling that use millions of integer
                 counters and confirm that replacing the integer
                 counters with approximate counters is effective,
                 speeding up a GPU-based implementation by over 65\% and
                 a CPU-based implementation by nearly 50\%, as well as
                 reducing memory requirements, without degrading their
                 statistical effectiveness.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ballard:2017:GEIb,
  author =       "Grey Ballard and Mary Hall and Tim Harris and Brandon
                 Lucia",
  title =        "{Guest Editor} Introduction {PPoPP 2016}, Special
                 Issue 2 of 2",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "6:1--6:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108142",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kalikar:2017:DNM,
  author =       "Saurabh Kalikar and Rupesh Nasre",
  title =        "{DomLock}: a New Multi-Granularity Locking Technique
                 for Hierarchies",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "7:1--7:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3127584",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We present efficient locking mechanisms for
                 hierarchical data structures. Several applications work
                 on an abstract hierarchy of objects, and a parallel
                 execution on this hierarchy necessitates
                 synchronization across workers operating on different
                 parts of the hierarchy. Existing synchronization
                 mechanisms are too coarse, too inefficient, or too ad
                 hoc, resulting in reduced or unpredictable amount of
                 concurrency. We propose a new locking approach based on
                 the structural properties of the underlying hierarchy.
                 We show that the developed techniques are efficient
                 even when the hierarchy is an arbitrary graph.
                 Theoretically, we present our approach as a
                 locking-cost-minimizing instance of a generic algebraic
                 model of synchronization for hierarchies. Using
                 STMBench7, we illustrate considerable reduction in the
                 locking cost, resulting in an average throughput
                 improvement of 42\%.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Haider:2017:LRA,
  author =       "Syed Kamran Haider and William Hasenplaugh and Dan
                 Alistarh",
  title =        "{Lease\slash Release}: Architectural Support for
                 Scaling Contended Data Structures",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "8:1--8:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3132168",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "High memory contention is generally agreed to be a
                 worst-case scenario for concurrent data structures.
                 There has been a significant amount of research effort
                 spent investigating designs that minimize contention,
                 and several programming techniques have been proposed
                 to mitigate its effects. However, there are currently
                 few architectural mechanisms to allow scaling contended
                 data structures at high thread counts. In this article,
                 we investigate hardware support for scalable contended
                 data structures. We propose Lease/Release, a simple
                 addition to standard directory-based MESI cache
                 coherence protocols, allowing participants to lease
                 memory, at the granularity of cache lines, by delaying
                 coherence messages for a short, bounded period of time.
                 Our analysis shows that Lease/Release can significantly
                 reduce the overheads of contention for both
                 non-blocking (lock-free) and lock-based data structure
                 implementations while ensuring that no deadlocks are
                 introduced. We validate Lease/Release empirically on
                 the Graphite multiprocessor simulator on a range of
                 data structures, including queue, stack, and priority
                 queue implementations, as well as on transactional
                 applications. Results show that Lease/Release
                 consistently improves both throughput and energy usage,
                 by up to 5x, both for lock-free and lock-based data
                 structure designs.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Cao:2017:HRD,
  author =       "Man Cao and Minjia Zhang and Aritra Sengupta and
                 Swarnendu Biswas and Michael D. Bond",
  title =        "Hybridizing and Relaxing Dependence Tracking for
                 Efficient Parallel Runtime Support",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "9:1--9:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108138",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "It is notoriously challenging to develop parallel
                 software systems that are both scalable and correct.
                 Runtime support for parallelism-such as multithreaded
                 record and replay, data race detectors, transactional
                 memory, and enforcement of stronger memory models-helps
                 achieve these goals, but existing commodity solutions
                 slow programs substantially to track (i.e., detect or
                 control) an execution's cross-thread dependencies
                 accurately. Prior work tracks cross-thread dependencies
                 either ``pessimistically,'' slowing every program
                 access, or ``optimistically,'' allowing for lightweight
                 instrumentation of most accesses but dramatically
                 slowing accesses that are conflicting (i.e., involved
                 in cross-thread dependencies). This article presents
                 two novel approaches that seek to improve the
                 performance of dependence tracking. Hybrid tracking
                 (HT) hybridizes pessimistic and optimistic tracking by
                 overcoming a fundamental mismatch between these two
                 kinds of tracking. HT uses an adaptive, profile-based
                 policy to make runtime decisions about switching
                 between pessimistic and optimistic tracking. Relaxed
                 tracking (RT) attempts to reduce optimistic tracking's
                 overhead on conflicting accesses by tracking
                 dependencies in a ``relaxed'' way-meaning that not all
                 dependencies are tracked accurately-while still
                 preserving both program semantics and runtime support's
                 correctness. To demonstrate the usefulness and
                 potential of HT and RT, we build runtime support based
                 on the two approaches. Our evaluation shows that both
                 approaches offer performance advantages over existing
                 approaches, but there exist challenges and
                 opportunities for further improvement. HT and RT are
                 distinct solutions to the same problem. It is easier to
                 build runtime support based on HT than on RT, although
                 RT does not incur the overhead of online profiling.
                 This article presents the two approaches together to
                 inform and inspire future designs for efficient
                 parallel runtime support.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Chatzopoulos:2017:EES,
  author =       "Georgios Chatzopoulos and Aleksandar Dragojevi{\'c}
                 and Rachid Guerraoui",
  title =        "{ESTIMA}: Extrapolating {ScalabiliTy} of In-Memory
                 Applications",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3108137",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "This article presents estima, an easy-to-use tool for
                 extrapolating the scalability of in-memory
                 applications. estima is designed to perform a simple
                 yet important task: Given the performance of an
                 application on a small machine with a handful of cores,
                 estima extrapolates its scalability to a larger machine
                 with more cores, while requiring minimum input from the
                 user. The key idea underlying estima is the use of
                 stalled cycles (e.g., cycles that the processor spends
                 waiting for missed cache line fetches or busy locks).
                 estima measures stalled cycles on a few cores and
                 extrapolates them to more cores, estimating the amount
                 of waiting in the system. estima can be effectively
                 used to predict the scalability of in-memory
                 applications for bigger execution machines. For
                 instance, using measurements of memcached and SQLite on
                 a desktop machine, we obtain accurate predictions of
                 their scalability on a server. Our extensive evaluation
                 shows the effectiveness of estima on a large number of
                 in-memory benchmarks.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Gulisano:2017:EDS,
  author =       "Vincenzo Gulisano and Yiannis Nikolakopoulos and
                 Daniel Cederman and Marina Papatriantafilou and
                 Philippas Tsigas",
  title =        "Efficient Data Streaming Multiway Aggregation through
                 Concurrent Algorithmic Designs and New Abstract Data
                 Types",
  journal =      j-TOPC,
  volume =       "4",
  number =       "2",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3131272",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Oct 10 17:42:07 MDT 2017",
  bibsource =    "http://topc.acm.org/;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Data streaming relies on continuous queries to process
                 unbounded streams of data in a real-time fashion. It is
                 commonly demanding in computation capacity, given that
                 the relevant applications involve very large volumes of
                 data. Data structures act as articulation points and
                 maintain the state of data streaming operators,
                 potentially supporting high parallelism and balancing
                 the work among them. Prompted by this fact, in this
                 work we study and analyze parallelization needs of
                 these articulation points, focusing on the problem of
                 streaming multiway aggregation, where large data
                 volumes are received from multiple input streams. The
                 analysis of the parallelization needs, as well as of
                 the use and limitations of existing aggregate designs
                 and their data structures, leads us to identify needs
                 for appropriate shared objects that can achieve
                 low-latency and high-throughput multiway aggregation.
                 We present the requirements of such objects as abstract
                 data types and we provide efficient lock-free
                 linearizable algorithmic implementations of them, along
                 with new multiway aggregate algorithmic designs that
                 leverage them, supporting both deterministic
                 order-sensitive and order-insensitive aggregate
                 functions. Furthermore, we point out future directions
                 that open through these contributions. The article
                 includes an extensive experimental study, based on a
                 variety of continuous aggregation queries on two large
                 datasets extracted from SoundCloud, a music social
                 network, and from a Smart Grid network. In all the
                 experiments, the proposed data structures and the
                 enhanced aggregate operators improved the processing
                 performance significantly, up to one order of
                 magnitude, in terms of both throughput and latency,
                 over the commonly used techniques based on queues.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Malas:2018:MIP,
  author =       "Tareq M. Malas and Georg Hager and Hatem Ltaief and
                 David E. Keyes",
  title =        "Multidimensional Intratile Parallelization for
                 Memory-Starved Stencil Computations",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "12:1--12:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155290",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Optimizing the performance of stencil algorithms has
                 been the subject of intense research over the last two
                 decades. Since many stencil schemes have low arithmetic
                 intensity, most optimizations focus on increasing the
                 temporal data access locality, thus reducing the data
                 traffic through the main memory interface with the
                 ultimate goal of decoupling from this bottleneck. There
                 are, however, only a few approaches that explicitly
                 leverage the shared cache feature of modern multicore
                 chips. If every thread works on its private, separate
                 cache block, the available cache space can become too
                 small, and sufficient temporal locality may not be
                 achieved. We propose a flexible multidimensional
                 intratile parallelization method for stencil algorithms
                 on multicore CPUs with a shared outer-level cache. This
                 method leads to a significant reduction in the required
                 cache space without adverse effects from hardware
                 prefetching or TLB shortage. Our Girih framework
                 includes an autotuner to select optimal parameter
                 configurations on the target hardware. We conduct
                 performance experiments on two contemporary Intel
                 processors and compare with the state-of-the-art
                 stencil frameworks Pluto and Pochoir, using four
                 corner-case stencil schemes and a wide range of problem
                 sizes. Girih shows substantial performance advantages
                 and best arithmetic intensity at almost all problem
                 sizes, especially on low-intensity stencils with
                 variable coefficients. We study in detail the
                 performance behavior at varying grid sizes using
                 phenomenological performance modeling. Our analysis of
                 energy consumption reveals that our method can save
                 energy through reduced DRAM bandwidth usage even at a
                 marginal performance gain. It is thus well suited for
                 future architectures that will be strongly challenged
                 by the cost of data movement, be it in terms of
                 performance or energy consumption.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Akbudak:2018:PMS,
  author =       "Kadir Akbudak and Oguz Selvitopi and Cevdet Aykanat",
  title =        "Partitioning Models for Scaling Parallel Sparse
                 Matrix--Matrix Multiplication",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "13:1--13:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3155292",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We investigate outer-product--parallel,
                 inner-product--parallel, and
                 row-by-row-product--parallel formulations of sparse
                 matrix-matrix multiplication (SpGEMM) on distributed
                 memory architectures. For each of these three
                 formulations, we propose a hypergraph model and a
                 bipartite graph model for distributing SpGEMM
                 computations based on one-dimensional (1D) partitioning
                 of input matrices. We also propose a communication
                 hypergraph model for each formulation for distributing
                 communication operations. The computational graph and
                 hypergraph models adopted in the first phase aim at
                 minimizing the total message volume and balancing the
                 computational loads of processors, whereas the
                 communication hypergraph models adopted in the second
                 phase aim at minimizing the total message count and
                 balancing the message volume loads of processors. That
                 is, the computational partitioning models reduce the
                 bandwidth cost and the communication hypergraph models
                 reduce the latency cost. Our extensive parallel
                 experiments on up to 2048 processors for a wide range
                 of realistic SpGEMM instances show that although the
                 outer-product--parallel formulation scales better, the
                 row-by-row-product--parallel formulation is more viable
                 due to its significantly lower partitioning overhead
                 and competitive scalability. For computational
                 partitioning models, our experimental findings indicate
                 that the proposed bipartite graph models are attractive
                 alternatives to their hypergraph counterparts because
                 of their lower partitioning overhead. Finally, we show
                 that by reducing the latency cost besides the bandwidth
                 cost through using the communication hypergraph models,
                 the parallel SpGEMM time can be further improved up to
                 32\%.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bilardi:2018:LBT,
  author =       "Gianfranco Bilardi and Michele Scquizzato and
                 Francesco Silvestri",
  title =        "A Lower Bound Technique for Communication in {BSP}",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "14:1--14:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3181776",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Communication is a major factor determining the
                 performance of algorithms on current computing systems;
                 it is therefore valuable to provide tight lower bounds
                 on the communication complexity of computations. This
                 article presents a lower bound technique for the
                 communication complexity in the bulk-synchronous
                 parallel (BSP) model of a given class of DAG
                 computations. The derived bound is expressed in terms
                 of the switching potential of a DAG, that is, the
                 number of permutations that the DAG can realize when
                 viewed as a switching network. The proposed technique
                 yields tight lower bounds for the fast Fourier
                 transform (FFT), and for any sorting and permutation
                 network. A stronger bound is also derived for the
                 periodic balanced sorting network, by applying this
                 technique to suitable subnetworks. Finally, we
                 demonstrate that the switching potential captures
                 communication requirements even in computational models
                 different from BSP, such as the I/O model and the
                 LPRAM.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sahin:2018:CSC,
  author =       "Semih Sahin and Bugra Gedik",
  title =        "{C-Stream}: a Co-routine-Based Elastic Stream
                 Processing Engine",
  journal =      j-TOPC,
  volume =       "4",
  number =       "3",
  pages =        "15:1--15:??",
  month =        apr,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3184120",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Stream processing is a computational paradigm for
                 on-the-fly processing of live data. This paradigm lends
                 itself to implementations that can provide high
                 throughput and low latency by taking advantage of
                 various forms of parallelism that are naturally
                 captured by the stream processing model of computation,
                 such as pipeline, task, and data parallelism. In this
                 article, we describe the design and implementation of
                 C-Stream, which is an elastic stream processing engine.
                 C-Stream encompasses three unique properties. First, in
                 contrast to the widely adopted event-based interface
                 for developing streaming operators, C-Stream provides
                 an interface wherein each operator has its own driver
                 loop and relies on data availability application
                 programming interfaces (APIs) to decide when to perform
                 its computations. This self-control-based model
                 significantly simplifies the development of operators
                 that require multiport synchronization. Second,
                 C-Stream contains a dynamic scheduler that manages the
                 multithreaded execution of the operators. The
                 scheduler, which is customizable via plug-ins, enables
                 the execution of the operators as co-routines, using
                 any number of threads. The base scheduler implements
                 back-pressure, provides data availability APIs, and
                 manages preemption and termination handling. Last,
                 C-Stream varies the degree of parallelism to resolve
                 bottlenecks by both dynamically changing the number of
                 threads used to execute an application and adjusting
                 the number of replicas of data-parallel operators. We
                 provide an experimental evaluation of C-Stream. The
                 results show that C-Stream is scalable, highly
                 customizable, and can resolve bottlenecks by
                 dynamically adjusting the level of data parallelism
                 used.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Agrawal:2018:ISI,
  author =       "Kunal Agrawal and I-Ting Angelina Lee and Michael
                 Spear",
  title =        "Introduction to Special Issue on {SPAA'15}",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3226041",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ahn:2018:ADN,
  author =       "Kook Jin Ahn and Sudipto Guha",
  title =        "Access to Data and Number of Iterations: Dual Primal
                 Algorithms for Maximum Matching under Resource
                 Constraints",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "17:1--17:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3154855",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "In this article, we consider graph algorithms in
                 models of computation where the space usage (random
                 accessible storage, in addition to the read-only input)
                 is sublinear in the number of edges $m$ and the access
                 to input is constrained. These questions arise in many
                 natural settings, and in particular in the analysis of
                 streaming algorithms, MapReduce or similar algorithms,
                 or message passing distributed computing that model
                 constrained parallelism with sublinear central
                 processing. We focus on weighted nonbipartite maximum
                 matching in this article. For any constant $ p > 1$, we
                 provide an iterative sampling-based algorithm for
                 computing a $ (1 - \epsilon)$-approximation of the
                 weighted nonbipartite maximum matching that uses $ O(p
                 / \epsilon)$ rounds of sampling, and $ O(n^{1 + 1 /
                 p})$ space. The results extend to $b$-Matching with
                 small changes. This article combines adaptive sketching
                 literature and fast primal-dual algorithms based on
                 relaxed Dantzig--Wolfe decision procedures. Each round
                 of sampling is implemented through linear sketches and
                 can be executed in a single round of streaming or two
                 rounds of MapReduce. The article also proves that
                 nonstandard linear relaxations of a problem, in
                 particular penalty-based formulations, are helpful in
                 reducing the adaptive dependence of the iterations.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Alistarh:2018:TAS,
  author =       "Dan Alistarh and William Leiserson and Alexander
                 Matveev and Nir Shavit",
  title =        "{ThreadScan}: Automatic and Scalable Memory
                 Reclamation",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "18:1--18:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3201897",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The concurrent memory reclamation problem is that of
                 devising a way for a deallocating thread to verify that
                 no other concurrent threads hold references to a memory
                 block being deallocated. To date, in the absence of
                 automatic garbage collection, there is no satisfactory
                 solution to this problem; existing tracking methods
                 like hazard pointers, reference counters, or
                 epoch-based techniques like RCU are either
                 prohibitively expensive or require significant
                 programming expertise to the extent that implementing
                 them efficiently can be worthy of a publication. None
                 of the existing techniques are automatic or even
                 semi-automated. In this article, we take a new approach
                 to concurrent memory reclamation. Instead of manually
                 tracking access to memory locations as done in
                 techniques like hazard pointers, or restricting shared
                 accesses to specific epoch boundaries as in RCU, our
                 algorithm, called ThreadScan, leverages operating
                 system signaling to automatically detect which memory
                 locations are being accessed by concurrent threads.
                 Initial empirical evidence shows that ThreadScan scales
                 surprisingly well and requires negligible programming
                 effort beyond the standard use of Malloc and Free.",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Dimitrov:2018:RDT,
  author =       "Dimitar Dimitrov and Martin Vechev and Vivek Sarkar",
  title =        "Race Detection in Two Dimensions",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "19:1--19:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3264618",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Dynamic race detection is a program analysis technique
                 for detecting errors caused by undesired interleavings
                 of concurrent tasks. A primary challenge when designing
                 efficient race detection algorithms is to achieve
                 manageable space requirements. State-of-the-art
                 algorithms for unstructured parallelism require $
                 \Theta (n) $ space per monitored memory location, where
                 n is the total number of tasks. This is a serious
                 drawback when analyzing programs with many tasks. In
                 contrast, algorithms for programs with a
                 series-parallel (SP) structure require only $ \Theta
                 (1) $ space. Unfortunately, it is currently not well
                 understood if there are classes of parallelism beyond
                 SP that can also benefit from and be analyzed with $
                 \Theta (1) $ space complexity. In this work, we show
                 that structures richer than SP graphs, namely, that of
                 two-dimensional (2D) lattices, can also be analyzed in
                 $ \Theta (1) $ space. Toward that (a) we extend
                 Tarjan's algorithm for finding lowest common ancestors
                 to handle 2D lattices; (b) from that extension we
                 derive a serial algorithm for race detection that can
                 analyze arbitrary task graphs with a 2D lattice
                 structure; (c) we present a restriction to fork-join
                 that admits precisely the 2D lattices as task graphs
                 (e.g., it can express pipeline parallelism). Our work
                 generalizes prior work on structured race detection and
                 aims to provide a deeper understanding of the interplay
                 between structured parallelism and program analysis
                 efficiency.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Lee:2018:ERD,
  author =       "I-Ting Angelina Lee and Tao B. Schardl",
  title =        "Efficient Race Detection for Reducer Hyperobjects",
  journal =      j-TOPC,
  volume =       "4",
  number =       "4",
  pages =        "20:1--20:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3205914",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:25 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "A multithreaded Cilk program that is ostensibly
                 deterministic may nevertheless behave
                 nondeterministically due to programming errors in the
                 code. For a Cilk program that uses reducers-a general
                 reduction mechanism supported in various Cilk
                 dialects-such programming errors are especially
                 challenging to debug, because the errors can expose the
                 nondeterminism in how the Cilk runtime system manages
                 reducers. We identify two unique types of races that
                 arise from incorrect use of reducers in a Cilk program,
                 and we present two algorithms to catch these races. The
                 first algorithm, called the Peer-Set algorithm, detects
                 view-read races, which occur when the program attempts
                 to retrieve a value out of a reducer when the read may
                 result in a nondeterministic value, such as before all
                 previously spawned subcomputations that might update
                 the reducer have necessarily returned. The second
                 algorithm, called the SP+ algorithm, detects
                 determinacy races-instances where a write to a memory
                 location occurs logically in parallel with another
                 access to that location-even when the raced-on memory
                 locations relate to reducers. Both algorithms are
                 provably correct, asymptotically efficient, and can be
                 implemented efficiently in practice. We have
                 implemented both algorithms in our prototype race
                 detector, Rader. When running Peer-Set, Rader incurs a
                 geometric-mean multiplicative overhead of 2.56 over
                 running the benchmark without instrumentation. When
                 running SP+, Rader incurs a geometric-mean
                 multiplicative overhead of 16.94.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Gilbert:2018:ISI,
  author =       "Seth Gilbert",
  title =        "Introduction to the Special Issue for {SPAA 2016}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "1:1--1:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3230677",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Mitzenmacher:2018:BBC,
  author =       "Michael Mitzenmacher and Rajmohan Rajaraman and Scott
                 Roche",
  title =        "Better Bounds for Coalescing-Branching Random Walks",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "2:1--2:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209688",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Coalescing-branching random walks, or cobra walks for
                 short, are a natural variant of random walks on graphs
                 that can model the spread of disease through contacts
                 or the spread of information in networks. In a k -cobra
                 walk, at each timestep, a subset of the vertices are
                 active; each active vertex chooses k random neighbors
                 (sampled independently and uniformly with replacement)
                 that become active at the next step, and these are the
                 only active vertices at the next step. A natural
                 quantity to study for cobra walks is the cover time,
                 which corresponds to the expected time when all nodes
                 have become infected or received the disseminated
                 information. In this article, we extend previous
                 results for cobra walks in multiple ways. We show that
                 the cover time for the 2-cobra walk on $ [0, n]^d $ is
                 $ O(n) $ (where the order notation hides constant
                 factors that depend on $d$); previous work had shown
                 the cover time was $ O(n \cdot \polylog (n))$. We show
                 that the cover time for a 2-cobra walk on an $n$-vertex
                 $d$ regular graph with conductance $ \phi_G$ is $ O(d^4
                 \phis^{-2}_G \log^2 n)$, significantly generalizing a
                 previous result that held only for expander graphs with
                 sufficiently high expansion. And, finally, we show that
                 the cover time for a 2-cobra walk on a graph with n
                 vertices and m edges is always $ O(m n^{3 / 4} \log
                 n)$; this is the first result showing that the bound of
                 $ \Theta (n^3)$ for the worst-case cover time for
                 random walks can be beaten using 2-cobra walks.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Liu:2018:RAN,
  author =       "Mingmou Liu and Xiaoyin Pan and Yitong Yin",
  title =        "Randomized Approximate Nearest Neighbor Search with
                 Limited Adaptivity",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "3:1--3:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209884",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study the complexity of parallel data structures
                 for approximate nearest neighbor search in
                 $d$-dimensional Hamming space $ \{ 0, 1 \}^d$. A
                 classic model for static data structures is the
                 cell-probe model [27]. We consider a cell-probe model
                 with limited adaptivity, where given a $ k \geq 1$, a
                 query is resolved by making at most $k$ rounds of
                 parallel memory accesses to the data structure. We give
                 two randomized algorithms that solve the approximate
                 nearest neighbor search using $k$ rounds of parallel
                 memory accesses: --- a simple algorithm with $ O(k
                 (\log d)^{1 / k})$ total number of memory accesses for
                 all $ k \geq 1$ --- an algorithm with $ O(k + (1 / k
                 \log d)^{O(1 / k)})$ total number of memory accesses
                 for all sufficiently large $k$. Both algorithms use
                 data structures of polynomial size. We prove an $
                 \Omega (1 / k (\log d)^{1 / k})$ lower bound for the
                 total number of memory accesses for any randomized
                 algorithm solving the approximate nearest neighbor
                 search within $ k \leq \log \log d / 2 \log \log \log
                 d$ rounds of parallel memory accesses on any data
                 structures of polynomial size. This lower bound shows
                 that our first algorithm is asymptotically optimal when
                 $ k = O(1)$. And our second algorithm achieves the
                 asymptotically optimal tradeoff between number of
                 rounds and total number of memory accesses. In the
                 extremal case, when $ k = O(\log \log d / \log \log
                 \log d)$ is big enough, our second algorithm matches
                 the $ \Theta (\log \log d / \log \log \log d)$ tight
                 bound for fully adaptive algorithms for approximate
                 nearest neighbor search in [11].",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Pandurangan:2018:FDA,
  author =       "Gopal Pandurangan and Peter Robinson and Michele
                 Scquizzato",
  title =        "Fast Distributed Algorithms for Connectivity and {MST}
                 in Large Graphs",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "4:1--4:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209689",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Motivated by the increasing need to understand the
                 algorithmic foundations of distributed large-scale
                 graph computations, we study a number of fundamental
                 graph problems in a message-passing model for
                 distributed computing where $ k \geq 2 $ machines
                 jointly perform computations on graphs with $n$ nodes
                 (typically, $ n \gg k$). The input graph is assumed to
                 be initially randomly partitioned among the $k$
                 machines, a common implementation in many real-world
                 systems. Communication is point-to-point, and the goal
                 is to minimize the number of communication rounds of
                 the computation. Our main result is an (almost) optimal
                 distributed randomized algorithm for graph
                 connectivity. Our algorithm runs in $ {\tilde O}(n /
                 k^2)$ rounds ($ {\tilde O}$ notation hides a $ \polylog
                 (n)$ factor and an additive $ \polylog (n)$ term). This
                 improves over the best previously known bound of $
                 {\tilde O}(n / k)$ [Klauck et al., SODA 2015] and is
                 optimal (up to a polylogarithmic factor) in light of an
                 existing lower bound of $ \tilde \Omega (n / k^2)$. Our
                 improved algorithm uses a bunch of techniques,
                 including linear graph sketching, that prove useful in
                 the design of efficient distributed graph algorithms.
                 Using the connectivity algorithm as a building block,
                 we then present fast randomized algorithms for
                 computing minimum spanning trees, (approximate)
                 min-cuts, and for many graph verification problems. All
                 these algorithms take $ {\tilde O}(n / k^2)$ rounds and
                 are optimal up to polylogarithmic factors. We also show
                 an almost matching lower bound of $ \tilde \Omega (n /
                 k^2)$ rounds for many graph verification problems by
                 leveraging lower bounds in random-partition
                 communication complexity.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Korupolu:2018:RPF,
  author =       "Madhukar Korupolu and Rajmohan Rajaraman",
  title =        "Robust and Probabilistic Failure-Aware Placement",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "5:1--5:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3210367",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Motivated by the growing complexity and heterogeneity
                 of modern data centers, and the prevalence of commodity
                 component failures, this article studies the
                 failure-aware placement problem of placing tasks of a
                 parallel job on machines in the data center with the
                 goal of increasing availability. We consider two models
                 of failures: adversarial and probabilistic. In the
                 adversarial model, each node has a weight (higher
                 weight implying higher reliability) and the adversary
                 can remove any subset of nodes of total weight at most
                 a given bound W and our goal is to find a placement
                 that incurs the least disruption against such an
                 adversary. In the probabilistic model, each node has a
                 probability of failure and we need to find a placement
                 that maximizes the probability that at least K out of N
                 tasks survive at any time. For adversarial failures, we
                 first show that (i) the problems are in $ \Sigma_2 $,
                 the second level of the polynomial hierarchy; (ii) a
                 variant of the problem that we call RobustFap (for
                 Robust Failure-Aware Placement) is co-NP-hard; and
                 (iii) an all-or-nothing version of RobustFap is $
                 \Sigma_2$-complete. We then give a polynomial-time
                 approximation scheme (PTAS) for RobustFap, a key
                 ingredient of which is a solution that we design for a
                 fractional version of RobustFap. We then study
                 HierRobustFap, which is the fractional RobustFap
                 problem over a hierarchical network, in which failures
                 can occur at any subset of nodes in the hierarchy, and
                 a failure at a node can adversely impact all of its
                 descendants in the hierarchy. To solve HierRobustFap,
                 we introduce a notion of hierarchical max-min fairness
                 and a novel Generalized Spreading algorithm, which is
                 simultaneously optimal for every upper bound W on the
                 total weight of nodes that an adversary can fail. These
                 generalize the classical notion of max-min fairness to
                 work with nodes of differing capacities, differing
                 reliability weights, and hierarchical structures. Using
                 randomized rounding, we extend this to give an
                 algorithm for integral HierRobustFap. For the
                 probabilistic version, we first give an algorithm that
                 achieves an additive $ \epsilon $ approximation in the
                 failure probability for the single level version,
                 called ProbFap, while giving up a $ (1 + \epsilon)$
                 multiplicative factor in the number of failures. We
                 then extend the result to the hierarchical version,
                 HierProbFap, achieving an \epsilon additive
                 approximation in failure probability while giving up an
                 $ (L + \epsilon)$ multiplicative factor in the number
                 of failures, where $L$ is the number of levels in the
                 hierarchy.",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Zhang:2018:LFT,
  author =       "Deli Zhang and Pierre Laborde and Lance Lebanoff and
                 Damian Dechev",
  title =        "Lock-Free Transactional Transformation for Linked Data
                 Structures",
  journal =      j-TOPC,
  volume =       "5",
  number =       "1",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3209690",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Nonblocking data structures allow scalable and
                 thread-safe access to shared data. They provide
                 individual operations that appear to execute
                 atomically. However, it is often desirable to execute
                 multiple operations atomically in a transactional
                 manner. Previous solutions, such as Software
                 Transactional Memory (STM) and transactional boosting,
                 manage transaction synchronization separately from the
                 underlying data structure's thread synchronization.
                 Although this reduces programming effort, it leads to
                 overhead associated with additional synchronization and
                 the need to rollback aborted transactions. In this
                 work, we present a new methodology for transforming
                 high-performance lock-free linked data structures into
                 high-performance lock-free transactional linked data
                 structures without revamping the data structures'
                 original synchronization design. Our approach leverages
                 the semantic knowledge of the data structure to
                 eliminate the overhead of false conflicts and
                 rollbacks. We encapsulate all operations, operands, and
                 transaction status in a transaction descriptor, which
                 is shared among the nodes accessed by the same
                 transaction. We coordinate threads to help finish the
                 remaining operations of delayed transactions based on
                 their transaction descriptors. When a transaction
                 fails, we recover the correct abstract state by
                 reversely interpreting the logical status of a node. We
                 also present an obstruction-free version of our
                 algorithm that can be applied to dynamic execution
                 scenarios and an example of our approach applied to a
                 hash map. In our experimental evaluation using
                 transactions with randomly generated operations, our
                 lock-free transactional data structures outperform the
                 transactional boosted ones by 70\% on average. They
                 also outperform the alternative STM-based approaches by
                 a factor of 2 to 13 across all scenarios. More
                 importantly, we achieve 4,700 to 915,000 times fewer
                 spurious aborts than the alternatives.",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Muller:2018:NHP,
  author =       "Michel M{\"u}ller and Takayuki Aoki",
  title =        "New High Performance {GPGPU} Code Transformation
                 Framework Applied to Large Production Weather
                 Prediction Code",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291523",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We introduce ``Hybrid Fortran,'' a new approach that
                 allows a high-performance GPGPU port for structured
                 grid Fortran codes. This technique only requires
                 minimal changes for a CPU targeted codebase, which is a
                 significant advancement in terms of productivity. It
                 has been successfully applied to both dynamical core
                 and physical processes of ASUCA, a Japanese mesoscale
                 weather prediction model with more than 150k lines of
                 code. By means of a minimal weather application that
                 resembles ASUCA's code structure, Hybrid Fortran is
                 compared to both a performance model as well as today's
                 commonly used method, OpenACC. As a result, the Hybrid
                 Fortran implementation is shown to deliver the same or
                 better performance than OpenACC, and its performance
                 agrees with the model both on CPU and GPU. In a
                 full-scale production run, using an ASUCA grid with
                 1581 $ \times $ 1301 $ \times $ 58 cells and real-world
                 weather data in 2km resolution, 24 NVIDIA Tesla P100
                 running the Hybrid Fortran-based GPU port are shown to
                 replace more than fifty 18-core Intel Xeon Broadwell
                 E5-2695 v4 running the reference implementation-an
                 achievement comparable to more invasive GPGPU rewrites
                 of other weather models.",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Burtscher:2018:HQF,
  author =       "Martin Burtscher and Sindhu Devale and Sahar Azimi and
                 Jayadharini Jaiganesh and Evan Powers",
  title =        "A High-Quality and Fast Maximal Independent Set
                 Implementation for {GPUs}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291525",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Computing a maximal independent set is an important
                 step in many parallel graph algorithms. This article
                 introduces ECL-MIS, a maximal independent set
                 implementation that works well on GPUs. It includes key
                 optimizations to speed up computation, reduce the
                 memory footprint, and increase the set size. Its CUDA
                 implementation requires fewer than 30 kernel
                 statements, runs asynchronously, and produces a
                 deterministic result. It outperforms the maximal
                 independent set implementations of Pannotia, CUSP, and
                 IrGL on each of the 16 tested graphs of various types
                 and sizes. On a Titan X GPU, ECL-MIS is between 3.9 and
                 100 times faster (11.5 times, on average). ECL-MIS
                 running on the GPU is also faster than the parallel CPU
                 codes Ligra, Ligra+, and PBBS running on 20 Xeon cores,
                 which it outperforms by 4.1 times, on average. At the
                 same time, ECL-MIS produces maximal independent sets
                 that are up to 52\% larger (over 10\%, on average)
                 compared to these preexisting CPU and GPU
                 implementations. Whereas these codes produce maximal
                 independent sets that are, on average, about 15\%
                 smaller than the largest possible such sets, ECL-MIS
                 sets are less than 6\% smaller than the maximum
                 independent sets.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Liu:2018:APR,
  author =       "Junhong Liu and Guangming Tan and Yulong Luo and
                 Jiajia Li and Zeyao Mo and Ninghui Sun",
  title =        "An Autotuning Protocol to Rapidly Build Autotuners",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291527",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Automatic performance tuning (Autotuning) is an
                 increasingly critical tuning technique for the high
                 portable performance of Exascale applications. However,
                 constructing an autotuner from scratch remains a
                 challenge, even for domain experts. In this work, we
                 propose a performance tuning and knowledge management
                 suite (PAK) to help rapidly build autotuners. In order
                 to accommodate existing autotuning techniques, we
                 present an autotuning protocol that is composed of an
                 extractor, producer, optimizer, evaluator, and learner.
                 To achieve modularity and reusability, we also define
                 programming interfaces for each protocol component as
                 the fundamental infrastructure, which provides a
                 customizable mechanism to deploy knowledge mining in
                 the performance database. PAK's usability is
                 demonstrated by studying two important computational
                 kernels: stencil computation and sparse matrix-vector
                 multiplication (SpMV). Our proposed autotuner based on
                 PAK shows comparable performance and higher
                 productivity than traditional autotuners by writing
                 just a few tens of code using our autotuning
                 protocol.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Anta:2018:SDP,
  author =       "Antonio Fern{\'a}ndez Anta and Dariusz R. Kowalski and
                 Miguel A. Mosteiro and Prudence W. H. Wong",
  title =        "Scheduling Dynamic Parallel Workload of Mobile Devices
                 with Access Guarantees",
  journal =      j-TOPC,
  volume =       "5",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3291529",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We study a dynamic resource-allocation problem that
                 arises in various parallel computing scenarios, such as
                 mobile cloud computing, cloud computing systems,
                 Internet of Things systems, and others. Generically, we
                 model the architecture as client mobile devices and
                 static base stations. Each client ``arrives'' to the
                 system to upload data to base stations by radio
                 transmissions and then ``leaves.'' The problem, called
                 Station Assignment, is to assign clients to stations so
                 that every client uploads their data under some
                 restrictions, including a target subset of stations, a
                 maximum delay between transmissions, a volume of data
                 to upload, and a maximum bandwidth for each station. We
                 study the solvability of Station Assignment under an
                 adversary that controls the arrival and departure of
                 clients, limited to maximum rate and burstiness of such
                 arrivals. We show upper and lower bounds on the rate
                 and burstiness for various client arrival schedules and
                 protocol classes. To the best of our knowledge, this is
                 the first time that Station Assignment is studied under
                 adversarial arrivals and departures.",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Mirhosseini:2018:BBA,
  author =       "Amirhossein Mirhosseini and Mohammad Sadrosadati and
                 Fatemeh Aghamohammadi and Mehdi Modarressi and Hamid
                 Sarbazi-Azad",
  title =        "{BARAN}: Bimodal Adaptive Reconfigurable-Allocator
                 Network-on-Chip",
  journal =      j-TOPC,
  volume =       "5",
  number =       "3",
  pages =        "11:1--11:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3294049",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Virtual channels are employed to improve the
                 throughput under high traffic loads in
                 Networks-on-Chips (NoCs). However, they can impose
                 non-negligible overheads on performance by prolonging
                 clock cycle time, especially under low traffic loads
                 where the impact of virtual channels on performance is
                 trivial. In this article, we propose a novel
                 architecture, called BARAN, that can either improve
                 on-chip network performance or reduce its power
                 consumption (depending on the specific implementation
                 chosen), not both at the same time, when virtual
                 channels are underutilized; that is, the average number
                 of virtual channel allocation requests per cycle is
                 lower than the number of total virtual channels. We
                 also introduce a reconfigurable arbitration logic
                 within the BARAN architecture that can be configured to
                 have multiple latencies and, hence, multiple slack
                 times. The increased slack times are then used to
                 reduce the supply voltage of the routers or increase
                 their clock frequency in order to reduce power
                 consumption or improve the performance of the whole NoC
                 system. The power-centric design of BARAN reduces NoC
                 power consumption by 43.4\% and 40.6\% under CMP and
                 GPU workloads, on average, respectively, compared to a
                 baseline architecture while imposing negligible area
                 and performance overheads. The performance-centric
                 design of BARAN reduces the average packet latency by
                 45.4\% and 42.1\%, on average, under CMP and GPU
                 workloads, respectively, compared to the baseline
                 architecture while increasing power consumption by
                 39.7\% and 43.7\%, on average. Moreover, the
                 performance-centric BARAN postpones the network
                 saturation rate by 11.5\% under uniform random traffic
                 compared to the baseline architecture.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Amer:2018:LCM,
  author =       "Abdelhalim Amer and Huiwei Lu and Pavan Balaji and
                 Milind Chabbi and Yanjie Wei and Jeff Hammond and
                 Satoshi Matsuoka",
  title =        "Lock Contention Management in Multithreaded {MPI}",
  journal =      j-TOPC,
  volume =       "5",
  number =       "3",
  pages =        "12:1--12:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3275443",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3275443",
  abstract =     "In this article, we investigate contention management
                 in lock-based thread-safe MPI libraries. Specifically,
                 we make two assumptions: (1) locks are the only form of
                 synchronization when protecting communication paths;
                 and (2) contention occurs, and thus serialization is
                 unavoidable. Our work distinguishes between lock
                 acquisitions with respect to work being performed
                 inside a critical section; productive vs. unproductive.
                 Waiting for message reception without doing anything
                 else inside a critical section is an example of
                 unproductive lock acquisition. We show that the
                 high-throughput nature of modern scalable locking
                 protocols translates into better communication progress
                 for throughput-intensive MPI communication but
                 negatively impacts latency-sensitive communication
                 because of overzealous unproductive lock acquisition.
                 To reduce unproductive lock acquisitions, we devised a
                 method that promotes threads with productive work using
                 a generic two-level priority locking protocol. Our
                 results show that using a high-throughput protocol for
                 productive work and a fair protocol for less productive
                 code paths ensures the best tradeoff for fine-grained
                 communication, whereas a fair protocol is sufficient
                 for more coarse-grained communication. Although these
                 efforts have been rewarding, scalability degradation
                 remains significant. We discuss techniques that diverge
                 from the pure locking model and offer the potential to
                 further improve scalability.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Chen:2018:PDG,
  author =       "Rong Chen and Jiaxin Shi and Yanzhe Chen and Binyu
                 Zang and Haibing Guan and Haibo Chen",
  title =        "{PowerLyra}: Differentiated Graph Computation and
                 Partitioning on Skewed Graphs",
  journal =      j-TOPC,
  volume =       "5",
  number =       "3",
  pages =        "13:1--13:??",
  month =        jan,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3298989",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Jan 23 16:12:26 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Natural graphs with skewed distributions raise unique
                 challenges to distributed graph computation and
                 partitioning. Existing graph-parallel systems usually
                 use a ``one-size-fits-all'' design that uniformly
                 processes all vertices, which either suffer from
                 notable load imbalance and high contention for
                 high-degree vertices (e.g., Pregel and GraphLab) or
                 incur high communication cost and memory consumption
                 even for low-degree vertices (e.g., PowerGraph and
                 GraphX). In this article, we argue that skewed
                 distributions in natural graphs also necessitate
                 differentiated processing on high-degree and low-degree
                 vertices. We then introduce PowerLyra, a new
                 distributed graph processing system that embraces the
                 best of both worlds of existing graph-parallel systems.
                 Specifically, PowerLyra uses centralized computation
                 for low-degree vertices to avoid frequent
                 communications and distributes the computation for
                 high-degree vertices to balance workloads. PowerLyra
                 further provides an efficient hybrid graph partitioning
                 algorithm (i.e., hybrid-cut) that combines edge-cut
                 (for low-degree vertices) and vertex-cut (for
                 high-degree vertices) with heuristics. To improve cache
                 locality of inter-node graph accesses, PowerLyra
                 further provides a locality-conscious data layout
                 optimization. PowerLyra is implemented based on the
                 latest GraphLab and can seamlessly support various
                 graph algorithms running in both synchronous and
                 asynchronous execution modes. A detailed evaluation on
                 three clusters using various graph-analytics and MLDM
                 (Machine Learning and Data Mining) applications shows
                 that PowerLyra outperforms PowerGraph by up to 5.53X
                 (from 1.24X) and 3.26X (from 1.49X) for real-world and
                 synthetic graphs, respectively, and is much faster than
                 other systems like GraphX and Giraph, yet with much
                 less memory consumption. A porting of hybrid-cut to
                 GraphX further confirms the efficiency and generality
                 of PowerLyra.",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Aravind:2019:GME,
  author =       "Alex Aravind and Wim H. Hesselink",
  title =        "Group Mutual Exclusion by Fetch-and-increment",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "14:1--14:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309202",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309202",
  abstract =     "The group mutual exclusion (GME) problem (also called
                 the room synchronization problem) arises in various
                 practical applications that require concurrent data
                 sharing. Group mutual exclusion aims to achieve
                 exclusive access to a shared resource (a shared room)
                 while facilitating concurrency among non-conflicting
                 requests. The problem is that threads with distinct
                 interests are not allowed to access the shared resource
                 concurrently, but multiple threads with same interest
                 can. In Blelloch et al. (2003), the authors presented a
                 simple solution to the room synchronization problem
                 using fetch8add (F8A) and test-and-set (T8S) atomic
                 operations. This algorithm has $ O(m) $ remote memory
                 references (RMRs) in the cache coherent (CC) model,
                 where $m$ is the number of forums. In Bhatt and Huang
                 (2010), an open problem was posed: `` Is it possible to
                 design a GME algorithm with constant RMR for the CC
                 model using fetch8add instructions? '' This question is
                 partially answered in this article by presenting a
                 group mutual exclusion algorithm using
                 fetch-and-increment instructions. The algorithm is
                 simple and scalable.",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Behzad:2019:OPH,
  author =       "Babak Behzad and Surendra Byna and Prabhat and Marc
                 Snir",
  title =        "Optimizing {I/O} Performance of {HPC} Applications
                 with Autotuning",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "15:1--15:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309205",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309205",
  abstract =     "Parallel Input output is an essential component of
                 modern high-performance computing (HPC). Obtaining good
                 I/O performance for a broad range of applications on
                 diverse HPC platforms is a major challenge, in part,
                 because of complex inter dependencies between I/O
                 middleware and hardware. The parallel file system and
                 I/O middleware layers all offer optimization parameters
                 that can, in theory, result in better I/O performance.
                 Unfortunately, the right combination of parameters is
                 highly dependent on the application, HPC platform,
                 problem size, and concurrency. Scientific application
                 developers do not have the time or expertise to take on
                 the substantial burden of identifying good parameters
                 for each problem configuration. They resort to using
                 system defaults, a choice that frequently results in
                 poor I/O performance. We expect this problem to be
                 compounded on exascale-class machines, which will
                 likely have a deeper software stack with hierarchically
                 arranged hardware resources. We present as a solution
                 to this problem an autotuning system for optimizing I/O
                 performance, I/O performance modeling, I/O tuning, and
                 I/O patterns. We demonstrate the value of this
                 framework across several HPC platforms and applications
                 at scale.",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Maier:2019:CHT,
  author =       "Tobias Maier and Peter Sanders and Roman Dementiev",
  title =        "Concurrent Hash Tables: Fast and General(?)!",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "16:1--16:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309206",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309206",
  abstract =     "Concurrent hash tables are one of the most important
                 concurrent data structures, which are used in numerous
                 applications. For some applications, it is common that
                 hash table accesses dominate the execution time. To
                 efficiently solve these problems in parallel, we need
                 implementations that achieve speedups in highly
                 concurrent scenarios. Unfortunately, currently
                 available concurrent hashing libraries are far away
                 from this requirement, in particular, when adaptively
                 sized tables are necessary or contention on some
                 elements occurs. Our starting point for better
                 performing data structures is a fast and simple
                 lock-free concurrent hash table based on linear probing
                 that is, however, limited to word-sized key-value types
                 and does not support dynamic size adaptation. We
                 explain how to lift these limitations in a provably
                 scalable way and demonstrate that dynamic growing has a
                 performance overhead comparable to the same
                 generalization in sequential hash tables. We perform
                 extensive experiments comparing the performance of our
                 implementations with six of the most widely used
                 concurrent hash tables. Ours are considerably faster
                 than the best algorithms with similar restrictions and
                 an order of magnitude faster than the best more general
                 tables. In some extreme cases, the difference even
                 approaches four orders of magnitude. All our
                 implementations discussed in this publication can be
                 found on github [17].",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Cruz:2019:ETM,
  author =       "Eduardo H. M. Cruz and Matthias Diener and La{\'e}rcio
                 L. Pilla and Philippe O. A. Navaux",
  title =        "{EagerMap}: a Task Mapping Algorithm to Improve
                 Communication and Load Balancing in Clusters of
                 Multicore Systems",
  journal =      j-TOPC,
  volume =       "5",
  number =       "4",
  pages =        "17:1--17:??",
  month =        mar,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3309711",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Mar 11 18:54:51 MDT 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3309711",
  abstract =     "Communication between tasks and load imbalance have
                 been identified as a major challenge for the
                 performance and energy efficiency of parallel
                 applications. A common way to improve communication is
                 to increase its locality, that is, to reduce the
                 distances of data transfers, prioritizing the usage of
                 faster and more efficient local interconnections over
                 remote ones. Regarding load imbalance, cores should
                 execute a similar amount of work. An important problem
                 to be solved in this context is how to determine an
                 optimized mapping of tasks to cluster nodes and cores
                 that increases the overall locality and load balancing.
                 In this article, we propose the EagerMap algorithm to
                 determine task mappings, which is based on a greedy
                 heuristic to match application communication patterns
                 to hardware hierarchies and which can also consider the
                 task load. Compared to previous algorithms, EagerMap is
                 faster, scales better, and supports more types of
                 computer systems, while maintaining the same or better
                 quality of the determined task mapping. EagerMap is
                 therefore an interesting choice for task mapping on a
                 variety of modern parallel architectures.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bader:2019:EEC,
  author =       "David A. Bader",
  title =        "Editorial from the {Editor-in-Chief}",
  journal =      j-TOPC,
  volume =       "6",
  number =       "1",
  pages =        "1:1--1:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3325883",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:58 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3325883",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kronbichler:2019:MMF,
  author =       "Martin Kronbichler and Karl Ljungkvist",
  title =        "Multigrid for Matrix-Free High-Order Finite Element
                 Computations on Graphics Processors",
  journal =      j-TOPC,
  volume =       "6",
  number =       "1",
  pages =        "2:1--2:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3322813",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:58 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3322813",
  abstract =     "This article presents matrix-free finite-element
                 techniques for efficiently solving partial differential
                 equations on modern many-core processors, such as
                 graphics cards. We develop a GPU parallelization of a
                 matrix-free geometric multigrid iterative solver
                 targeting moderate and high polynomial degrees, with
                 support for general curved and adaptively refined
                 hexahedral meshes with hanging nodes. The central
                 algorithmic component is the matrix-free operator
                 evaluation with sum factorization. We compare the
                 node-level performance of our implementation running on
                 an Nvidia Pascal P100 GPU to a highly optimized
                 multicore implementation running on comparable Intel
                 Broadwell CPUs and an Intel Xeon Phi. Our experiments
                 show that the GPU implementation is approximately 1.5
                 to 2 times faster across four different scenarios of
                 the Poisson equation and a variety of element degrees
                 in 2D and 3D. The lowest time to solution per degree of
                 freedom is recorded for moderate polynomial degrees
                 between 3 and 5. A detailed performance analysis
                 highlights the capabilities of the GPU architecture and
                 the chosen execution model with threading within the
                 element, particularly with respect to the evaluation of
                 the matrix-vector product. Atomic intrinsics are shown
                 to provide a fast way for avoiding the possible race
                 conditions in summing the elemental residuals into the
                 global vector associated to shared vertices, edges, and
                 surfaces. In addition, the solver infrastructure allows
                 for using mixed-precision arithmetic that performs the
                 multigrid V-cycle in single precision with an outer
                 correction in double precision, increasing throughput
                 by up to 83",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bonifaci:2019:GPT,
  author =       "Vincenzo Bonifaci and Andreas Wiese and Sanjoy K.
                 Baruah and Alberto Marchetti-Spaccamela and Sebastian
                 Stiller and Leen Stougie",
  title =        "A Generalized Parallel Task Model for Recurrent
                 Real-Time Processes",
  journal =      j-TOPC,
  volume =       "6",
  number =       "1",
  pages =        "3:1--3:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3322809",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:58 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3322809",
  abstract =     "A model is considered for representing recurrent
                 precedence-constrained tasks that are to execute on
                 multiprocessor platforms. A recurrent task is specified
                 as a directed acyclic graph (DAG), a period, and a
                 relative deadline. Each vertex of the DAG represents a
                 sequential job, while the edges of the DAG represent
                 precedence constraints between these jobs. All the jobs
                 of the DAG are released simultaneously and need to
                 complete execution within the specified relative
                 deadline of their release. Each task may release jobs
                 in this manner an unbounded number of times, with
                 successive releases occurring at least the specified
                 period apart. Conditional control structures are also
                 allowed. The scheduling problem is to determine whether
                 a set of such recurrent tasks can be scheduled to
                 always meet all deadlines upon a specified number of
                 identical processors. This problem is shown to be
                 computationally intractable, but amenable to efficient
                 approximate solutions. Earliest Deadline First (EDF)
                 and Deadline Monotonic (DM) are shown to be good
                 approximate global scheduling algorithms. Polynomial
                 and pseudo-polynomial time schedulability tests, of
                 differing effectiveness, are presented for determining
                 whether a given task set can be scheduled by EDF or DM
                 to always meet deadlines on a specified number of
                 processors.",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ltaief:2019:MPP,
  author =       "Hatem Ltaief and Dalal Sukkari and Aniello Esposito
                 and Yuji Nakatsukasa and David Keyes",
  title =        "Massively Parallel Polar Decomposition on
                 Distributed-memory Systems",
  journal =      j-TOPC,
  volume =       "6",
  number =       "1",
  pages =        "4:1--4:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3328723",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:58 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3328723",
  abstract =     "We present a high-performance implementation of the
                 Polar Decomposition (PD) on distributed-memory systems.
                 Building upon on the QR-based Dynamically Weighted
                 Halley (QDWH) algorithm, the key idea lies in finding
                 the best rational approximation for the scalar sign
                 function, which also corresponds to the polar factor
                 for symmetric matrices, to further accelerate the QDWH
                 convergence. Based on the Zolotarev rational
                 functions-introduced by Zolotarev (ZOLO) in 1877-this
                 new PD algorithm ZOLO-PD converges within two
                 iterations even for ill-conditioned matrices, instead
                 of the original six iterations needed for QDWH. ZOLO-PD
                 uses the property of Zolotarev functions that
                 optimality is maintained when two functions are
                 composed in an appropriate manner. The resulting
                 ZOLO-PD has a convergence rate up to 17, in contrast to
                 the cubic convergence rate for QDWH. This comes at the
                 price of higher arithmetic costs and memory footprint.
                 These extra floating-point operations can, however, be
                 processed in an embarrassingly parallel fashion. We
                 demonstrate performance using up to 102,400 cores on
                 two supercomputers. We demonstrate that, in the
                 presence of a large number of processing units, ZOLO-PD
                 is able to outperform QDWH by up to 2.3$ \times $
                 speedup, especially in situations where QDWH runs out
                 of work, for instance, in the strong scaling mode of
                 operation.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Saha:2019:OSA,
  author =       "Dibakar Saha and Koushik Sinha",
  title =        "Optimal Schedule for All-to-All Personalized
                 Communication in Multiprocessor Systems",
  journal =      j-TOPC,
  volume =       "6",
  number =       "1",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3329867",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:58 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3329867",
  abstract =     "In this article, we address the problem of finding an
                 optimal schedule for all-to-all personalized message
                 communication among the processors in a multiprocessor
                 system where every processor has a unique message for
                 every other processor. When there are n processors and
                 \lfloor n /2 \rfloor parallel databus or channels for
                 message communications, there exist algorithms that
                 require O ( n$^2$ ) time for assigning the
                 databus/channels to the processor-pairs to obtain a
                 schedule with minimum number of time slots. However, in
                 recent massively parallel processing systems with a
                 huge number of processors, the number k of available
                 databus/channels is usually much smaller than \lfloor n
                 /2",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Pumma:2019:SDL,
  author =       "Sarunya Pumma and Min Si and Wu-Chun Feng and Pavan
                 Balaji",
  title =        "Scalable Deep Learning via {I/O} Analysis and
                 Optimization",
  journal =      j-TOPC,
  volume =       "6",
  number =       "2",
  pages =        "6:1--6:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3331526",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3331526",
  abstract =     "Scalable deep neural network training has been gaining
                 prominence because of the increasing importance of deep
                 learning in a multitude of scientific and commercial
                 domains. Consequently, a number of researchers have
                 investigated techniques to optimize deep learning
                 systems. Much of the prior work has focused on runtime
                 and algorithmic enhancements to optimize the
                 computation and communication. Despite these
                 enhancements, however, deep learning systems still
                 suffer from scalability limitations, particularly with
                 respect to data I/O. This situation is especially true
                 for training models where the computation can be
                 effectively parallelized, leaving I/O as the major
                 bottleneck. In fact, our analysis shows that I/O can
                 take up to 90",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Aupy:2019:SSP,
  author =       "Guilllaume Aupy and Ana Gainaru and Valentin {Le
                 F{\`e}vre}",
  title =        "{I/O} Scheduling Strategy for Periodic Applications",
  journal =      j-TOPC,
  volume =       "6",
  number =       "2",
  pages =        "7:1--7:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3338510",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3338510",
  abstract =     "With the ever-growing need of data in HPC
                 applications, the congestion at the I/O level becomes
                 critical in supercomputers. Architectural enhancement
                 such as burst buffers and pre-fetching are added to
                 machines but are not sufficient to prevent congestion.
                 Recent online I/O scheduling strategies have been put
                 in place, but they add an additional congestion point
                 and overheads in the computation of applications. In
                 this work, we show how to take advantage of the
                 periodic nature of HPC applications to develop
                 efficient periodic scheduling strategies for their I/O
                 transfers. Our strategy computes once during the job
                 scheduling phase a pattern that defines the I/O
                 behavior for each application, after which the
                 applications run independently, performing their I/O at
                 the specified times. Our strategy limits the amount of
                 congestion at the I/O node level and can be easily
                 integrated into current job schedulers. We validate
                 this model through extensive simulations and
                 experiments on an HPC cluster by comparing it to
                 state-of-the-art online solutions, showing that not
                 only does our scheduler have the advantage of being
                 de-centralized and thus overcoming the overhead of
                 online schedulers, but also that it performs better
                 than the other solutions, improving the application
                 dilation up to 16",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Kagaris:2019:SME,
  author =       "Dimitri Kagaris and Sourav Dutta",
  title =        "Scheduling Mutual Exclusion Accesses in Equal-Length
                 Jobs",
  journal =      j-TOPC,
  volume =       "6",
  number =       "2",
  pages =        "8:1--8:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3342562",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3342562",
  abstract =     "A fundamental problem in parallel and distributed
                 processing is the partial serialization that is imposed
                 due to the need for mutually exclusive access to common
                 resources. In this article, we investigate the problem
                 of optimally scheduling (in terms of makespan) a set of
                 jobs, where each job consists of the same number L of
                 unit-duration tasks, and each task either accesses
                 exclusively one resource from a given set of resources
                 or accesses a fully shareable resource. We develop and
                 establish the optimality of a fast polynomial-time
                 algorithm to find a schedule with the shortest makespan
                 for any number of jobs and for any number of resources
                 for the case of L = 2. In the notation commonly used
                 for job-shop scheduling problems, this result means
                 that the problem J | d$_{ij}$ =1, n$_j$ =2| C$_{max}$
                 is polynomially solvable, adding to the polynomial
                 solutions known for the problems J 2 | n$_j$ {$<$}= 2 |
                 C$_{max}$ and J 2 | d$_{ij}$ = 1 | C$_{max}$ (whereas
                 other closely related versions such as J 2 | n$_j$ \leq
                 3 | C$_{max}$, J 2 | d$_{ij}$ ) \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Mollah:2019:MUG,
  author =       "Md Atiqul Mollah and Wenqi Wang and Peyman Faizian and
                 MD Shafayat Rahman and Xin Yuan and Scott Pakin and
                 Michael Lang",
  title =        "Modeling Universal Globally Adaptive Load-Balanced
                 Routing",
  journal =      j-TOPC,
  volume =       "6",
  number =       "2",
  pages =        "9:1--9:??",
  month =        sep,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3349620",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3349620",
  abstract =     "Universal globally adaptive load-balanced (UGAL)
                 routing has been proposed for various interconnection
                 networks and has been deployed in a number of
                 current-generation supercomputers. Although UGAL-based
                 schemes have been extensively studied, most existing
                 results are based on either simulation or measurement.
                 Without a theoretical understanding of UGAL, multiple
                 questions remain: For which traffic patterns is UGAL
                 most suited? In addition, what determines the
                 performance of the UGAL-based scheme on a particular
                 network configuration? In this work, we develop a set
                 of throughput models for UGALbased on linear
                 programming. We show that the throughput models are
                 valid across the torus, Dragonfly, and Slim Fly network
                 topologies. Finally, we identify a robust model that
                 can accurately and efficiently predict UGAL throughput
                 for a set of representative traffic patterns across
                 different topologies. Our models not only provide a
                 mechanism to predict UGAL performance on large-scale
                 interconnection networks but also reveal the inner
                 working of UGAL and further our understanding of this
                 type of routing.",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Bateni:2019:ISI,
  author =       "Mohammed Hossein Bateni and Mohammad T. Hajiaghayi and
                 Silvio Lattanzi",
  title =        "Introduction to the Special Issue for {SPAA'17}",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "10:1--10:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3363417",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3363417",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Guha:2019:DPC,
  author =       "Sudipto Guha and Yi Li and Qin Zhang",
  title =        "Distributed Partial Clustering",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "11:1--11:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3322808",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3322808",
  abstract =     "Recent years have witnessed an increasing popularity
                 of algorithm design for distributed data, largely due
                 to the fact that massive datasets are often collected
                 and stored in different locations. In the distributed
                 setting, communication typically dominates the query
                 processing time. Thus, it becomes crucial to design
                 communication-efficient algorithms for queries on
                 distributed data. Simultaneously, it has been widely
                 recognized that partial optimizations, where we are
                 allowed to disregard a small part of the data, provide
                 us significantly better solutions. The motivation for
                 disregarded points often arises from noise and other
                 phenomena that are pervasive in large data scenarios.
                 In this article, we focus on partial clustering
                 problems, k -center, k -median, and k -means objectives
                 in the distributed model, and provide algorithms with
                 communication sublinear of the input size. As a
                 consequence, we develop the first algorithms for the
                 partial k -median and means objectives that run in
                 subquadratic running time. We also initiate the study
                 of distributed algorithms for clustering uncertain
                 data, where each data point can possibly fall into
                 multiple locations under certain probability
                 distribution.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Fraigniaud:2019:DDC,
  author =       "Pierre Fraigniaud and Dennis Olivetti",
  title =        "Distributed Detection of Cycles",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "12:1--12:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3322811",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3322811",
  abstract =     "Distributed property testing in networks has been
                 introduced by Brakerski and Patt-Shamir [6], with the
                 objective of detecting the presence of large dense
                 sub-networks in a distributed manner. Recently,
                 Censor-Hillel et al. [7] have revisited this notion and
                 formalized it in a broader context. In particular, they
                 have shown how to detect 3-cycles in a constant number
                 of rounds by a distributed algorithm. In a follow-up
                 work, Fraigniaud et al. [21] have shown how to detect
                 4-cycles in a constant number of rounds as well.
                 However, the techniques in these latter works were
                 shown not to generalize to larger cycles C$_k$ with k
                 {$>$}= 5. In this article, we completely settle the
                 problem of cycle detection by establishing the
                 following result: For every k {$>$}= 3, there exists a
                 distributed property testing algorithm for C$_k$
                 -freeness, performing in a constant number of rounds.
                 All these results hold in the classical congest model
                 for distributed network computing. Our algorithm is
                 1-sided error. Its round-complexity is O (1 \epsilon )
                 where \epsilon \in (0,1) is the property-testing
                 parameter measuring the gap between legal and illegal
                 instances.",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Albers:2019:ECD,
  author =       "Susanne Albers",
  title =        "On Energy Conservation in Data Centers",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "13:1--13:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364210",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3364210",
  abstract =     "We formulate and study an optimization problem that
                 arises in the energy management of data centers and,
                 more generally, multiprocessor environments. Data
                 centers host a large number of heterogeneous servers.
                 Each server has an active state and several
                 standby/sleep states with individual power consumption
                 rates. The demand for computing capacity varies over
                 time. Idle servers may be transitioned to low-power
                 modes so as to rightsize the pool of active servers.
                 The goal is to find a state transition schedule for the
                 servers that minimizes the total energy consumed. On a
                 small scale, the same problem arises in multicore
                 architectures with heterogeneous processors on a chip.
                 One has to determine active and idle periods for the
                 cores so as to guarantee a certain service and minimize
                 the consumed energy. For this power/capacity management
                 problem, we develop two main results. We use the
                 terminology of the data center setting. First, we
                 investigate the scenario that each server has two
                 states: an active state and a sleep state. We show that
                 an optimal solution, minimizing energy consumption, can
                 be computed in polynomial time by a combinatorial
                 algorithm. The algorithm resorts to a single-commodity
                 minimum-cost flow computation. Second, we study the
                 general scenario that each server has an active state
                 and multiple standby/sleep states. We devise a",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Feldkord:2019:MSP,
  author =       "Bj{\"o}rn Feldkord and Friedhelm Meyer Auf Der Heide",
  title =        "The Mobile Server Problem",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "14:1--14:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364204",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3364204",
  abstract =     "We introduce the Mobile Server problem, inspired by
                 current trends to move computational tasks from cloud
                 structures to multiple devices close to the end user.
                 An example of this is embedded systems in autonomous
                 cars that communicate to coordinate their actions. Our
                 model is a variant of the classical Page Migration
                 problem. More formally, we consider a mobile server
                 holding a data page. The server can move in the
                 Euclidean space (of arbitrary dimension). In every
                 round, requests for data items from the page pop up at
                 arbitrary points in the space. The requests are served,
                 each at a cost of the distance from the requesting
                 point and the server, and the mobile server may move,
                 at a cost D times the distance traveled for some
                 constant D. We assume a maximum distance m that the
                 server is allowed to move per round. We show that no
                 online algorithm can achieve a competitive ratio
                 independent of the length of the input sequence in this
                 setting. Hence, we augment the maximum movement
                 distance of the online algorithms to \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Azar:2019:TBC,
  author =       "Yossi Azar and Danny Vainstein",
  title =        "Tight Bounds for Clairvoyant Dynamic Bin Packing",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "15:1--15:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364214",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3364214",
  abstract =     "In this article, we focus on the Clairvoyant Dynamic
                 Bin Packing (DBP) problem, which extends the Classical
                 Online Bin Packing problem in that items arrive and
                 depart over time and the departure time of an item is
                 known upon its arrival. The problem naturally arises
                 when handling cloud-based networks. We focus
                 specifically on the MinUsageTime objective function,
                 which aims to minimize the overall usage time of all
                 bins that are opened during the packing process.
                 Earlier work has shown a O (log \mu / log log \mu)
                 upper bound on the algorithm's competitiveness, where
                 \mu is defined as the ratio between the maximal and
                 minimal durations of all items. We improve the upper
                 bound by giving a \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Cooper:2019:NCT,
  author =       "Colin Cooper and Tomasz Radzik and Nicolas Rivera",
  title =        "New Cover Time Bounds for the Coalescing-Branching
                 Random Walk on Graphs",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "16:1--16:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364206",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3364206",
  abstract =     "We present new bounds on the cover time of the
                 coalescing-branching random walk process COBRA. The
                 COBRA process, introduced in Dutta et al. [9], can be
                 viewed as spreading a single item of information
                 throughout an undirected graph in synchronised rounds.
                 In each round, each vertex that has received the
                 information in the previous round (possibly
                 simultaneously from more than one neighbour and
                 possibly not for the first time), ``pushes'' the
                 information to k randomly selected neighbours. The
                 COBRA process is typically studied for integer
                 branching rates k {$>$}= 2 (with the case k =1
                 corresponding to a random walk). The aim of the process
                 is to propagate the information quickly, but with a
                 limited number of transmissions per vertex per round.
                 The COBRA cover time is the expected number of rounds
                 until all vertices have received the information at
                 least once. Our main results are bounds of O ( m + (
                 d$_{max}$ )$^2$ log n ) and O ( m log n ) on the COBRA
                 cover time for arbitrary connected graphs with n
                 vertices, m edges and maximum graph degree d$_{max}$,
                 and bounds of \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Sun:2019:DGC,
  author =       "He Sun and Luca Zanetti",
  title =        "Distributed Graph Clustering and Sparsification",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "17:1--17:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364208",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3364208",
  abstract =     "Graph clustering is a fundamental computational
                 problem with a number of applications in algorithm
                 design, machine learning, data mining, and analysis of
                 social networks. Over the past decades, researchers
                 have proposed a number of algorithmic design methods
                 for graph clustering. Most of these methods, however,
                 are based on complicated spectral techniques or convex
                 optimisation and cannot be directly applied for
                 clustering many networks that occur in practice, whose
                 information is often collected on different sites.
                 Designing a simple and distributed clustering algorithm
                 is of great interest and has comprehensive applications
                 for processing big datasets. In this article, we
                 present a simple and distributed algorithm for graph
                 clustering: For a wide class of graphs that are
                 characterised by a strong cluster-structure, our
                 algorithm finishes in a poly-logarithmic number of
                 rounds and recovers a partition of the graph close to
                 optimal. One of the main procedures behind our
                 algorithm is a sampling scheme that, given a dense
                 graph as input, produces a sparse subgraph that
                 provably preserves the cluster-structure of the input.
                 Compared with previous sparsification algorithms that
                 require Laplacian solvers or involve combinatorial
                 constructions, this procedure is easy to implement in a
                 distributed setting and runs fast in practice.",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Khan:2019:NOP,
  author =       "Shahbaz Khan",
  title =        "Near Optimal Parallel Algorithms for Dynamic {DFS} in
                 Undirected Graphs",
  journal =      j-TOPC,
  volume =       "6",
  number =       "3",
  pages =        "18:1--18:??",
  month =        oct,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3364212",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3364212",
  abstract =     "Depth first search (DFS) tree is a fundamental data
                 structure for solving various graph problems. The
                 classical algorithm [54] for building a DFS tree
                 requires O ( m + n ) time for a given undirected graph
                 G having n vertices and m edges. Recently, Baswana et
                 al. [5] presented a simple algorithm for updating the
                 DFS tree of an undirected graph after an edge/vertex
                 update in {\tilde O}( n )$^1$ time. However, their
                 algorithm is strictly sequential. \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Rauchwerger:2019:ISI,
  author =       "Lawrence Rauchwerger and Jaejin Lee and Armando
                 Solar-Lezama and Guy Steele",
  title =        "Introduction to the Special Issue on {PPoPP} 2017
                 (Part 1)",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3373151",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 27 16:13:12 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  acknowledgement = ack-nhfb,
  articleno =    "19e",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Schardl:2019:TER,
  author =       "Tao B. Schardl and William S. Moses and Charles E.
                 Leiserson",
  title =        "{Tapir}: Embedding Recursive Fork-join Parallelism
                 into {LLVM}'s Intermediate Representation",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365655",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 27 16:13:12 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3365655",
  abstract =     "Tapir (pronounced TAY-per) is a compiler intermediate
                 representation (IR) that embeds recursive fork-join
                 parallelism, as supported by task-parallel programming
                 platforms such as Cilk and OpenMP, into a mainstream
                 compiler's IR. Mainstream compilers typically treat
                 parallel linguistic constructs as syntactic sugar for
                 function calls into a parallel runtime. These calls
                 prevent the compiler from performing optimizations on
                 and across parallel control constructs. Remedying this
                 situation has generally been thought to require an
                 extensive reworking of compiler analyses and code
                 transformations to handle parallel semantics. Tapir
                 leverages the ``serial-projection property,'' which is
                 commonly satisfied by task-parallel programs, to handle
                 the semantics of these programs without an extensive
                 rework of the compiler. For recursive fork-join
                 programs that satisfy the serial-projection property,
                 Tapir enables effective compiler optimization of
                 parallel programs with only minor changes to existing
                 compiler analyses and code transformations. Tapir uses
                 the serial-projection property to order logically
                 parallel fine-grained tasks in the program's
                 control-flow graph. This ordered representation of
                 parallel tasks allows the compiler to optimize parallel
                 codes effectively with only minor modifications. For
                 example, to implement Tapir/LLVM, a prototype of Tapir
                 in the LLVM compiler, we added or modified less than
                 3,000 lines of LLVM's half-million-line core middle-end
                 functionality. These changes sufficed to enable LLVM's
                 existing compiler optimizations for serial
                 code-including loop-invariant-code motion,
                 common-subexpression elimination, and tail-recursion
                 elimination-to work with parallel control constructs
                 such as parallel loops and Cilk's Cilk\_Spawn keyword.
                 Tapir also supports parallel optimizations, such as
                 loop scheduling, which restructure the parallel control
                 flow of the program. By making use of existing LLVM
                 optimizations and new parallel optimizations,
                 Tapir/LLVM can optimize recursive fork-join programs
                 more effectively than traditional compilation methods.
                 On a suite of 35 Cilk application benchmarks,
                 Tapir/LLVM produces more efficient executables for 30
                 benchmarks, with faster 18-core running times for 26 of
                 them, compared to a nearly identical compiler that
                 compiles parallel linguistic constructs the traditional
                 way.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Utterback:2019:POR,
  author =       "Robert Utterback and Kunal Agrawal and I-Ting Angelina
                 Lee and Milind Kulkarni",
  title =        "Processor-Oblivious Record and Replay",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365659",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 27 16:13:12 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/ft_gateway.cfm?id=3365659",
  abstract =     "Record-and-replay systems are useful tools for
                 debugging non-deterministic parallel programs by first
                 recording an execution and then replaying that
                 execution to produce the same access pattern. Existing
                 record-and-replay systems generally target thread-based
                 execution models, and record the behaviors and
                 interleavings of individual threads. Dynamic
                 multithreaded languages and libraries, such as the Cilk
                 family, OpenMP, TBB, and the like, do not have a notion
                 of threads. Instead, these languages provide a
                 processor-oblivious model of programming, where
                 programs expose task parallelism using high-level
                 constructs such as spawn/sync without regard to the
                 number of threads/cores available to run the program.
                 Thread-based record-and-replay would violate the
                 processor-oblivious nature of these programs, as they
                 incorporate the number of threads into the recorded
                 information, constraining the replayed execution to the
                 same number of threads. In this article, we present a
                 processor-oblivious record-and-replay scheme for
                 dynamic multithreaded languages where record and replay
                 can use different number of processors and both are
                 scheduled using work stealing. We provide theoretical
                 guarantees for our record and replay scheme-namely that
                 record is optimal for programs with one lock and replay
                 is near-optimal for all cases. In addition, we
                 implemented this scheme in the Cilk Plus runtime system
                 and our evaluation indicates that
                 processor-obliviousness does not cause substantial
                 overheads.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Yeh:2019:PGR,
  author =       "Tsung Tai Yeh and Amit Sabne and Putt Sakdhnagool and
                 Rudolf Eigenmann and Timothy G. Rogers",
  title =        "{Pagoda}: a {GPU} Runtime System for Narrow Tasks",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "21:1--21:??",
  month =        nov,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365657",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "Massively multithreaded GPUs achieve high throughput
                 by running thousands of threads in parallel. To fully
                 utilize the their hardware, contemporary workloads
                 spawn work to the GPU in bulk by launching large tasks,
                 where each task is a kernel that contains thousands of
                 threads that occupy the entire GPU. GPUs face severe
                 underutilization and their performance benefits vanish
                 if the tasks are narrow, i.e., they contain less than
                 512 threads. Latency-sensitive applications in network,
                 signal, and image processing that generate a large
                 number of tasks with relatively small inputs are
                 examples of such limited parallelism. This article
                 presents Pagoda, a runtime system that virtualizes GPU
                 resources, using an OS-like daemon kernel called
                 MasterKernel. Tasks are spawned from the CPU onto
                 Pagoda as they become available, and are scheduled by
                 the MasterKernel at the warp granularity. This level of
                 control enables the GPU to keep scheduling and
                 executing tasks as long as free warps are found,
                 dramatically reducing underutilization. Experimental
                 results on real hardware demonstrate that Pagoda
                 achieves a geometric mean speedup of 5.52X over
                 PThreads running on a 20-core CPU, 1.76X over
                 CUDA-HyperQ, and 1.44X over GeMTC, the state-of-the-art
                 runtime GPU task scheduling system.",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Steele:2019:UBP,
  author =       "Guy L. {Steele Jr.} and Jean-Baptiste Tristan",
  title =        "Using Butterfly-patterned Partial Sums to Draw from
                 Discrete Distributions",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "22:1--22:??",
  month =        nov,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365662",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/prng.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "We describe a SIMD technique for drawing values from
                 multiple discrete distributions, such as sampling from
                 the random variables of a mixture model, that avoids
                 computing a complete table of partial sums of the
                 relative probabilities. A table of alternate
                 (``butterfly-patterned'') form is faster to compute,
                 making better use of coalesced memory accesses; from
                 this table, complete partial sums are computed on the
                 fly during a binary search. Measurements using Cuda 7.5
                 on an NVIDIA Titan Black GPU show that this technique
                 makes an entire machine-learning application that uses
                 a Latent Dirichlet Allocation topic model with 1,024
                 topics about 13\% faster (when using single-precision
                 floating-point data) or about 35\% faster (when using
                 double-precision floating-point data) than doing a
                 straightforward matrix transposition after using
                 coalesced accesses.",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Vandierendonck:2019:HDI,
  author =       "Hans Vandierendonck and Dimitrios S. Nikolopoulos",
  title =        "Hyperqueues: Design and Implementation of
                 Deterministic Concurrent Queues",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "23:1--23:??",
  month =        nov,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365660",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Nov 20 07:59:59 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The hyperqueue is a programming abstraction for queues
                 that results in deterministic and scale-free parallel
                 programs. Hyperqueues extend the concept of Cilk++
                 hyperobjects to provide thread-local views on a shared
                 data structure. While hyperobjects are organized around
                 private local views, hyperqueues provide a shared view
                 on a queue data structure. Hereby, hyperqueues
                 guarantee determinism for programs using concurrent
                 queues. We define the programming API and semantics of
                 two instances of the hyperqueue concept. These
                 hyperqueues differ in their API and the degree of
                 concurrency that is extracted. We describe the
                 implementation of the hyperqueues in a work-stealing
                 scheduler and demonstrate scalable performance on
                 pipeline-parallel benchmarks from PARSEC and
                 StreamIt.",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Ren:2019:ESP,
  author =       "Bin Ren and Shruthi Balakrishna and Youngjoon Jo and
                 Sriram Krishnamoorthy and Kunal Agrawal and Milind
                 Kulkarni",
  title =        "Extracting {SIMD} Parallelism from Recursive
                 Task-Parallel Programs",
  journal =      j-TOPC,
  volume =       "6",
  number =       "4",
  pages =        "24:1--24:??",
  month =        dec,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3365663",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 27 16:13:12 MST 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  abstract =     "The pursuit of computational efficiency has led to the
                 proliferation of throughput-oriented hardware, from
                 GPUs to increasingly wide vector units on commodity
                 processors and accelerators. This hardware is designed
                 to execute data-parallel computations in a vectorized
                 manner efficiently. However, many algorithms are more
                 naturally expressed as divide-and-conquer, recursive,
                 task-parallel computations. In the absence of data
                 parallelism, it seems that such algorithms are not well
                 suited to throughput-oriented architectures. This
                 article presents a set of novel code transformations
                 that expose the data parallelism latent in recursive,
                 task-parallel programs. These transformations
                 facilitate straightforward vectorization of
                 task-parallel programs on commodity hardware. We also
                 present scheduling policies that maintain high
                 utilization of vector resources while limiting space
                 usage. Across several task-parallel benchmarks, we
                 demonstrate both efficient vector resource utilization
                 and substantial speedup on chips using Intel's SSE4.2
                 vector units, as well as accelerators using Intel's
                 AVX512 units. We then show through rigorous sampling
                 that, in practice, our vectorization techniques are
                 effective for a much larger class of programs.",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "http://dl.acm.org/citation.cfm?id=2632163",
}

@Article{Tumeo:2020:ITSa,
  author =       "Antonino Tumeo and Fabrizio Petrini and John Feo and
                 Mahantesh Halappanavar",
  title =        "Introduction to the {TOPC} Special Issue on
                 Innovations in Systems for Irregular Applications, {Part
                 1}",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "1:1--1:2",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3383318",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3383318",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Anzt:2020:LBS,
  author =       "Hartwig Anzt and Terry Cojean and Chen Yen-Chen and
                 Jack Dongarra and Goran Flegar and Pratik Nayak and
                 Stanimire Tomov and Yuhsiang M. Tsai and Weichung
                 Wang",
  title =        "Load-balancing Sparse Matrix Vector Product Kernels on
                 {GPUs}",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "2:1--2:26",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3380930",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380930",
  abstract =     "Efficient processing of Irregular Matrices on Single
                 Instruction, Multiple Data (SIMD)-type architectures is
                 a persistent challenge. Resolving it requires
                 innovations in the development of data formats,
                 computational techniques, and implementations that
                 strike a balance between thread divergence, which is
                 inherent for Irregular Matrices, and padding, which
                 alleviates the performance-detrimental thread
                 divergence but introduces artificial overheads. To this
                 end, in this article, we address the challenge of
                 designing high performance sparse matrix-vector product
                 (SpMV) kernels designed for Nvidia Graphics Processing
                 Units (GPUs). We present a compressed sparse row (CSR)
                 format suitable for unbalanced matrices. We also
                 provide a load-balancing kernel for the coordinate
                 (COO) matrix format and extend it to a hybrid algorithm
                 that stores part of the matrix in SIMD-friendly Ellpack
                 format (ELL) format. The ratio between the ELL- and the
                 COO-part is determined using a theoretical analysis of
                 the nonzeros-per-row distribution. For the over 2,800
                 test matrices available in the Suite Sparse matrix
                 collection, we compare the performance against SpMV
                 kernels provided by NVIDIA's cuSPARSE library and a
                 heavily-tuned sliced ELL (SELL-P) kernel that prevents
                 unnecessary padding by considering the irregular
                 matrices as a combination of matrix blocks stored in
                 ELL format.",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Hadade:2020:SPU,
  author =       "Ioan Hadade and Timothy M. Jones and Feng Wang and
                 Luca di Mare",
  title =        "Software Prefetching for Unstructured Mesh
                 Applications",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "3:1--3:23",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3380932",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380932",
  abstract =     "This article demonstrates the utility and
                 implementation of software prefetching in an
                 unstructured finite volume computational fluid dynamics
                 code of representative size and complexity to an
                 industrial application and across a number of modern
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Grutzmacher:2020:APC,
  author =       "Thomas Gr{\"u}tzmacher and Terry Cojean and Goran
                 Flegar and Hartwig Anzt and Enrique S.
                 Quintana-Ort{\'\i}",
  title =        "Acceleration of {PageRank} with Customized Precision
                 Based on Mantissa Segmentation",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "4:1--4:19",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3380934",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/pagerank.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380934",
  abstract =     "We describe the application of a
                 communication-reduction technique for the PageRank
                 algorithm that dynamically adapts the precision of the
                 data access to the numerical requirements of the
                 algorithm as the iteration converges. Our
                 variable-precision strategy, using a customized
                 precision format based on mantissa segmentation (CPMS),
                 abandons the IEEE 754 single- and double-precision
                 number representation formats employed in the standard
                 implementation of PageRank, and instead handles the
                 data in memory using a customized floating-point
                 format. The customized format enables fast data access
                 in different accuracy, prevents overflow/underflow by
                 preserving the IEEE 754 double-precision exponent, and
                 efficiently avoids data duplication, since all bits of
                 the original IEEE 754 double-precision mantissa are
                 preserved in memory, but re-organized for efficient
                 reduced precision access. With this approach, the
                 truncated values (omitting significand bits), as well
                 as the original IEEE double-precision values, can be
                 retrieved without duplicating the data in different
                 formats.\par

                 Our numerical experiments on an NVIDIA V100 GPU (Volta
                 architecture) and a server equipped with two Intel Xeon
                 Platinum 8168 CPUs (48 cores in total) expose that,
                 compared with a standard IEEE double-precision
                 implementation, the CPMS-based PageRank completes about
                 10\% faster if high-accuracy output is needed, and
                 about 30\% faster if reduced output accuracy is
                 acceptable.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Das:2020:SMP,
  author =       "Apurba Das and Seyed-Vahid Sanei-Mehri and Srikanta
                 Tirthapura",
  title =        "Shared-memory Parallel Maximal Clique Enumeration from
                 Static and Dynamic Graphs",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "5:1--5:28",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3380936",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380936",
  abstract =     "Maximal Clique Enumeration (MCE) is a fundamental
                 graph mining problem and is useful as a primitive in
                 identifying dense structures in a graph. Due to the
                 high computational cost of MCE, parallel methods are
                 imperative for dealing with large graphs. We \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Hamilton:2020:ASC,
  author =       "Kathleen E. Hamilton and Catherine D. Schuman and
                 Steven R. Young and Ryan S. Bennink and Neena Imam and
                 Travis S. Humble",
  title =        "Accelerating Scientific Computing in the
                 Post-{Moore}'s Era",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "6:1--6:31",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3380940",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380940",
  abstract =     "Novel uses of graphical processing units for
                 accelerated computation revolutionized the field of
                 high-performance scientific computing by providing
                 specialized workflows tailored to algorithmic
                 requirements. As the era of Moore's law draws to a
                 close, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Lakhotia:2020:GSC,
  author =       "Kartik Lakhotia and Rajgopal Kannan and Sourav Pati
                 and Viktor Prasanna",
  title =        "{GPOP}: a Scalable Cache- and Memory-efficient
                 Framework for Graph Processing over Parts",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "7:1--7:24",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3380942",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380942",
  abstract =     "The past decade has seen the development of many
                 shared-memory graph processing frameworks intended to
                 reduce the effort of developing high-performance
                 parallel applications. However, many of these
                 frameworks, based on Vertex-centric or Edge-centric
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Anderson:2020:RRO,
  author =       "Jeff Anderson and Engin Kayraklioglu and Shuai Sun and
                 Joseph Crandall and Yousra Alkabani and Vikram Narayana
                 and Volker Sorger and Tarek El-Ghazawi",
  title =        "{ROC}: a Reconfigurable Optical Computer for
                 Simulating Physical Processes",
  journal =      j-TOPC,
  volume =       "7",
  number =       "1",
  pages =        "8:1--8:29",
  month =        apr,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3380944",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Apr 6 08:56:55 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3380944",
  abstract =     "Due to the end of Moore's law and Dennard scaling, we
                 are entering a new era of processors. Computing systems
                 are increasingly facing power and performance
                 challenges due to both device- and circuit-related
                 challenges with resistive and capacitive \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Monemi:2020:EDW,
  author =       "Alireza Monemi and Farshad Khunjush and Maurizio
                 Palesi and Hamid Sarbazi-Azad",
  title =        "An Enhanced Dynamic Weighted Incremental Technique for
                 {QoS} Support in {NoC}",
  journal =      j-TOPC,
  volume =       "7",
  number =       "2",
  pages =        "9:1--9:31",
  month =        may,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391442",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Jun 1 09:19:25 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3391442",
  abstract =     "Providing Quality-of-Service (QoS) in many-core
                 network-on-chip (NoC) platforms is critical due to the
                 high level of resource sharing in such systems. This
                 article presents a hard-built Equality-of-Service (EoS)
                 and Differential-Service (DS) as subsets \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Natarajan:2020:FLL,
  author =       "Aravind Natarajan and Arunmoezhi Ramachandran and
                 Neeraj Mittal",
  title =        "{FEAST}: a Lightweight Lock-free Concurrent Binary
                 Search Tree",
  journal =      j-TOPC,
  volume =       "7",
  number =       "2",
  pages =        "10:1--10:64",
  month =        may,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391438",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Jun 1 09:19:25 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3391438",
  abstract =     "We present a lock-free algorithm for concurrent
                 manipulation of a binary search tree (BST) in an
                 asynchronous shared memory system that supports search,
                 insert, and delete operations. In addition to read and
                 write instructions, our algorithm uses \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Salah:2020:TSE,
  author =       "Ahmad Salah and Kenli Li and Qing Liao and Mervat
                 Hashem and Zhiyong Li and Anthony T. Chronopoulos and
                 Albert Y. Zomaya",
  title =        "A Time-space Efficient Algorithm for Parallel $k$-way
                 In-place Merging based on Sequence Partitioning and
                 Perfect Shuffle",
  journal =      j-TOPC,
  volume =       "7",
  number =       "2",
  pages =        "11:1--11:23",
  month =        may,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391443",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Jun 1 09:19:25 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3391443",
  abstract =     "The huge data volumes, big data, and the emergence of
                 new parallel architectures lead to revisiting classic
                 computer science topics. The motivation of the proposed
                 work for revisiting the parallel k -way in-place
                 merging is primarily related to the \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Duan:2020:CSR,
  author =       "Shaohua Duan and Pradeep Subedi and Philip Davis and
                 Keita Teranishi and Hemanth Kolla and Marc Gamell and
                 Manish Parashar",
  title =        "{CoREC}: Scalable and Resilient In-memory Data Staging
                 for In-situ Workflows",
  journal =      j-TOPC,
  volume =       "7",
  number =       "2",
  pages =        "12:1--12:29",
  month =        may,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391448",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Jun 1 09:19:25 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3391448",
  abstract =     "The dramatic increase in the scale of current and
                 planned high-end HPC systems is leading new challenges,
                 such as the growing costs of data movement and IO, and
                 the reduced mean time between failures (MTBF) of system
                 components. In-situ workflows, i.e., \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Alam:2020:GMS,
  author =       "Maksudul Alam and Maleq Khan and Kalyan S. Perumalla
                 and Madhav Marathe",
  title =        "Generating Massive Scale-free Networks: Novel Parallel
                 Algorithms using the Preferential Attachment Model",
  journal =      j-TOPC,
  volume =       "7",
  number =       "2",
  pages =        "13:1--13:35",
  month =        may,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3391446",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Mon Jun 1 09:19:25 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3391446",
  abstract =     "Recently, there has been substantial interest in the
                 study of various random networks as mathematical models
                 of complex systems. As real-life complex systems grow
                 larger, the ability to generate progressively large
                 random networks becomes all the more \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Lee:2020:ISI,
  author =       "Jaejin Lee and Lawrence Rauchwerger and Armando
                 Solar-Lezama and Guy Steele",
  title =        "Introduction to the Special Issue on {PPoPP 2017}
                 (Part 2)",
  journal =      j-TOPC,
  volume =       "7",
  number =       "3",
  pages =        "14:1--14:2",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3407185",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Aug 6 08:56:07 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3407185",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Jiang:2020:CSM,
  author =       "Peng Jiang and Yang Xia and Gagan Agrawal",
  title =        "Combining {SIMD} and Many\slash Multi-core Parallelism
                 for Finite-state Machines with Enumerative
                 Speculation",
  journal =      j-TOPC,
  volume =       "7",
  number =       "3",
  pages =        "15:1--15:26",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399714",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Aug 6 08:56:07 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3399714",
  abstract =     "Finite-state Machine (FSM) is the key kernel behind
                 many popular applications, including regular expression
                 matching, text tokenization, and Huffman decoding.
                 Parallelizing FSMs is extremely difficult because of
                 the strong dependencies and unpredictable \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Basin:2020:KKV,
  author =       "Dmitry Basin and Edward Bortnikov and Anastasia
                 Braginsky and Guy Golan-Gueta and Eshcar Hillel and
                 Idit Keidar and Moshe Sulamy",
  title =        "{KiWi}: a Key--value Map for Scalable Real-time
                 Analytics",
  journal =      j-TOPC,
  volume =       "7",
  number =       "3",
  pages =        "16:1--16:28",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399718",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Aug 6 08:56:07 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/java2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3399718",
  abstract =     "We present KiWi, the first atomic KV-map to
                 efficiently support simultaneous large scans and
                 real-time access. The key to achieving this is treating
                 scans as first class citizens and organizing the data
                 structure around them. KiWi provides wait-free scans,
                 whereas its put operations are lightweight and
                 lock-free. It optimizes memory management jointly with
                 data structure access. We implement KiWi and compare it
                 to state-of-the-art solutions. Compared to other
                 KV-maps providing atomic scans, KiWi performs either
                 long scans or concurrent puts an order of magnitude
                 faster. Its scans are twice as fast as non-atomic ones
                 implemented via iterators in the Java skiplist.",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Chabbi:2020:EAL,
  author =       "Milind Chabbi and Abdelhalim Amer and Xu Liu",
  title =        "Efficient Abortable-locking Protocol for Multi-level
                 {NUMA} Systems: Design and Correctness",
  journal =      j-TOPC,
  volume =       "7",
  number =       "3",
  pages =        "17:1--17:32",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399728",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Aug 6 08:56:07 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3399728",
  abstract =     "The popularity of Non-Uniform Memory Access (NUMA)
                 architectures has led to numerous locality-preserving
                 hierarchical lock designs, such as HCLH, HMCS, and
                 cohort locks. Locality-preserving locks trade fairness
                 for higher throughput. Hence, some \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Ben-Nun:2020:GAM,
  author =       "Tal Ben-Nun and Michael Sutton and Sreepathi Pai and
                 Keshav Pingali",
  title =        "{Groute}: Asynchronous Multi-{GPU} Programming Model
                 with Applications to Large-scale Graph Processing",
  journal =      j-TOPC,
  volume =       "7",
  number =       "3",
  pages =        "18:1--18:27",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399730",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Aug 6 08:56:07 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3399730",
  abstract =     "Nodes with multiple GPUs are becoming the platform of
                 choice for high-performance computing. However, most
                 applications are written using bulk-synchronous
                 programming models, which may not be optimal for
                 irregular algorithms that benefit from low-. \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Alappat:2020:RAC,
  author =       "Christie Alappat and Achim Basermann and Alan R.
                 Bishop and Holger Fehske and Georg Hager and Olaf
                 Schenk and Jonas Thies and Gerhard Wellein",
  title =        "A Recursive Algebraic Coloring Technique for
                 Hardware-efficient Symmetric Sparse Matrix--vector
                 Multiplication",
  journal =      j-TOPC,
  volume =       "7",
  number =       "3",
  pages =        "19:1--19:37",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399732",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Aug 6 08:56:07 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3399732",
  abstract =     "The symmetric sparse matrix-vector multiplication
                 (SymmSpMV) is an important building block for many
                 numerical linear algebra kernel operations or graph
                 traversal applications. Parallelizing SymmSpMV on
                 today's multicore platforms with up to 100 cores is
                 difficult due to the need to manage conflicting updates
                 on the result vector. Coloring approaches can be used
                 to solve this problem without data duplication, but
                 existing coloring algorithms do not take load balancing
                 and deep memory hierarchies into account, hampering
                 scalability and full-chip performance. In this work, we
                 propose the recursive algebraic coloring engine (RACE),
                 a novel coloring algorithm and open-source library
                 implementation that eliminates the shortcomings of
                 previous coloring methods in terms of hardware
                 efficiency and parallelization overhead. We describe
                 the level construction, distance-$k$ coloring, and load
                 balancing steps in RACE, use it to parallelize
                 SymmSpMV, and compare its performance on 31 sparse
                 matrices with other state-of-the-art coloring
                 techniques and Intel MKL on two modern multicore
                 processors. RACE outperforms all other approaches
                 substantially. By means of a parameterized roofline
                 model, we analyze the SymmSpMV performance in detail
                 and discuss outliers. While we focus on SymmSpMV in
                 this article, our algorithm and software are applicable
                 to any sparse matrix operation with data dependencies
                 that can be resolved by distance-$k$ coloring.",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Davydov:2020:ADS,
  author =       "Denis Davydov and Martin Kronbichler",
  title =        "Algorithms and Data Structures for Matrix-Free Finite
                 Element Operators with {MPI}-Parallel Sparse
                 Multi-Vectors",
  journal =      j-TOPC,
  volume =       "7",
  number =       "3",
  pages =        "20:1--20:30",
  month =        aug,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3399736",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Aug 6 08:56:07 MDT 2020",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/abs/10.1145/3399736",
  abstract =     "Traditional solution approaches for problems in
                 quantum mechanics scale as $ O(M^3) $, where $M$ is the
                 number of electrons. Various methods have been proposed
                 to address this issue and obtain a linear scaling $
                 O(M)$. One promising formulation is the direct
                 minimization of energy. Such methods take advantage of
                 physical localization of the solution, allowing users
                 to seek it in terms of non-orthogonal orbitals with
                 local support.\par

                 This work proposes a numerically efficient
                 implementation of sparse parallel vectors within the
                 open-source finite element library deal.II. The main
                 algorithmic ingredient is the matrix-free evaluation of
                 the Hamiltonian operator by cell-wise quadrature. Based
                 on an a-priori chosen support for each vector, we
                 develop algorithms and data structures to perform (i)
                 matrix-free sparse matrix multivector products (SpMM),
                 (ii) the projection of an operator onto a sparse
                 sub-space (inner products), and (iii)
                 post-multiplication of a sparse multivector with a
                 square matrix. The node-level performance is analyzed
                 using a roofline model. Our matrix-free implementation
                 of finite element operators with sparse multivectors
                 achieves a performance of 157 GFlop/s on an Intel
                 Cascade Lake processor with 20 cores. Strong and weak
                 scaling results are reported for a representative
                 benchmark problem using quadratic and quartic finite
                 element bases.",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Sengupta:2020:HAP,
  author =       "Tapan K. Sengupta and Prasannabalaji Sundaram and
                 Vajjala K. Suman and Swagata Bhaumik",
  title =        "A High Accuracy Preserving Parallel Algorithm for
                 Compact Schemes for {DNS}",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "21:1--21:32",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418073",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418073",
  abstract =     "A new accuracy-preserving parallel algorithm employing
                 compact schemes is presented for direct numerical
                 simulation of the Navier--Stokes equations. Here the
                 connotation of accuracy preservation is having the same
                 level of accuracy obtained by the \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Aggarwal:2020:OLF,
  author =       "Karan Aggarwal and Uday Bondhugula",
  title =        "Optimizing the Linear Fascicle Evaluation Algorithm
                 for Multi-core and Many-core Systems",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "22:1--22:45",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418075",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418075",
  abstract =     "Sparse matrix-vector multiplication ( SpMV )
                 operations are commonly used in various scientific and
                 engineering applications. The performance of the SpMV
                 operation often depends on exploiting regularity
                 patterns in the matrix. Various representations and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Tumeo:2020:ITSb,
  author =       "Antonino Tumeo and Fabrizio Petrini and John Feo and
                 Mahantesh Halappanavar",
  title =        "Introduction to the {TOPC} Special Issue on
                 Innovations in Systems for Irregular Applications,
                 {Part 2}",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "23:1--23:2",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3419771",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3419771",
  acknowledgement = ack-nhfb,
  articleno =    "23",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Namashivayam:2020:MFI,
  author =       "Naveen Namashivayam and Bill Long and Deepak
                 Eachempati and Bob Cernohous and Mark Pagel",
  title =        "A Modern {Fortran} Interface in {OpenSHMEM} Need for
                 Interoperability with {Parallel Fortran} Using
                 Coarrays",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "24:1--24:25",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418084",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fortran3.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418084",
  abstract =     "Languages and libraries based on Partitioned Global
                 Address Space (PGAS) programming models are convenient
                 for exploiting scalable parallelism on large
                 applications across different domains with irregular
                 memory access patterns. OpenSHMEM is a PGAS-.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "24",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Hein:2020:PSI,
  author =       "Eric R. Hein and Srinivas Eswar and Abdurrahman Yasar
                 and Jiajia Li and Jeffrey S. Young and Thomas M. Conte
                 and {\"U}mit V. {\c{C}}ataly{\"u}rek and Richard Vuduc
                 and Jason Riedy and Bora U{\c{c}}ar",
  title =        "Programming Strategies for Irregular Algorithms on the
                 {Emu Chick}",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "25:1--25:25",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418077",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418077",
  abstract =     "The Emu Chick prototype implements migratory
                 memory-side processing in a novel hardware system.
                 Rather than transferring large amounts of data across
                 the system interconnect, the Emu Chick moves
                 lightweight thread contexts to near-memory cores before
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "25",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Leidel:2020:TME,
  author =       "John D. Leidel and Xi Wang and Brody Williams and Yong
                 Chen",
  title =        "Toward a Microarchitecture for Efficient Execution of
                 Irregular Applications",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "26:1--26:24",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418082",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418082",
  abstract =     "Given the increasing importance of efficient
                 data-intensive computing, we find that modern processor
                 designs are not well suited to the irregular memory
                 access patterns often found in these algorithms.
                 Applications and algorithms that do not exhibit
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "26",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Fezzardi:2020:ABD,
  author =       "Pietro Fezzardi and Fabrizio Ferrandi",
  title =        "Automated Bug Detection for High-level Synthesis of
                 Multi-threaded Irregular Applications",
  journal =      j-TOPC,
  volume =       "7",
  number =       "4",
  pages =        "27:1--27:26",
  month =        dec,
  year =         "2020",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3418086",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Sun Mar 28 08:05:40 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3418086",
  abstract =     "Field Programmable Gate Arrays (FPGAs) are becoming an
                 appealing technology in datacenters and High
                 Performance Computing. High-Level Synthesis (HLS) of
                 multi-threaded parallel programs is increasingly used
                 to extract parallelism. Despite great leaps \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "27",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Savoie:2021:MIJ,
  author =       "Lee Savoie and David K. Lowenthal and Bronis R. {De
                 Supinski} and Kathryn Mohror and Nikhil Jain",
  title =        "Mitigating Inter-Job Interference via Process-Level
                 Quality-of-Service",
  journal =      j-TOPC,
  volume =       "8",
  number =       "1",
  pages =        "1:1--1:26",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3434397",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Apr 23 17:58:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434397",
  abstract =     "Jobs on most high-performance computing (HPC) systems
                 share the network with other concurrently executing
                 jobs. Network sharing leads to contention that can
                 severely degrade performance. This article investigates
                 the use of Quality of Service (QoS) \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Reza:2021:SPM,
  author =       "Tahsin Reza and Hassan Halawa and Matei Ripeanu and
                 Geoffrey Sanders and Roger A. Pearce",
  title =        "Scalable Pattern Matching in Metadata Graphs via
                 Constraint Checking",
  journal =      j-TOPC,
  volume =       "8",
  number =       "1",
  pages =        "2:1--2:45",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3434391",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Apr 23 17:58:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434391",
  abstract =     "Pattern matching is a fundamental tool for answering
                 complex graph queries. Unfortunately, existing
                 solutions have limited capabilities: They do not scale
                 to process large graphs and/or support only a
                 restricted set of search templates or usage \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Fineman:2021:ISIa,
  author =       "Jeremy Fineman and Aydin Buluc and Seth Gilbert",
  title =        "Introduction to the Special Issue for {SPAA 2018}:
                 {Part 1}",
  journal =      j-TOPC,
  volume =       "8",
  number =       "1",
  pages =        "3e:1--3e:1",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3456774",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Apr 23 17:58:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3456774",
  acknowledgement = ack-nhfb,
  articleno =    "3e",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Alon:2021:PBP,
  author =       "Noga Alon and Yossi Azar and Mark Berlin",
  title =        "The Price of Bounded Preemption",
  journal =      j-TOPC,
  volume =       "8",
  number =       "1",
  pages =        "3:1--3:21",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3434377",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Apr 23 17:58:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434377",
  abstract =     "In this article we provide a tight bound for the price
                 of preemption for scheduling jobs on a single machine
                 (or multiple machines). The input consists of a set of
                 jobs to be scheduled and of an integer parameter k
                 {$>$}= 1. Each job has a release time, \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Dhulipala:2021:TEP,
  author =       "Laxman Dhulipala and Guy E. Blelloch and Julian Shun",
  title =        "Theoretically Efficient Parallel Graph Algorithms Can
                 Be Fast and Scalable",
  journal =      j-TOPC,
  volume =       "8",
  number =       "1",
  pages =        "4:1--4:70",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3434393",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Apr 23 17:58:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434393",
  abstract =     "There has been significant recent interest in parallel
                 graph processing due to the need to quickly analyze the
                 large graphs available today. Many graph codes have
                 been designed for distributed memory or external
                 memory. However, today even the largest \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Kaplan:2021:DRS,
  author =       "Haim Kaplan and Shay Solomon",
  title =        "Dynamic Representations of Sparse Distributed
                 Networks: a Locality-sensitive Approach",
  journal =      j-TOPC,
  volume =       "8",
  number =       "1",
  pages =        "5:1--5:26",
  month =        apr,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3434395",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Apr 23 17:58:56 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3434395",
  abstract =     "In 1999, Brodal and Fagerberg (BF) gave an algorithm
                 for maintaining a low outdegree orientation of a
                 dynamic uniformly sparse graph. Specifically, for a
                 dynamic graph on $n$-vertices, with arboricity bounded
                 by $\alpha$ at all times, the BF algorithm supports
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Fineman:2021:ISIb,
  author =       "Jeremy Fineman and Aydin Buluc and Seth Gilbert",
  title =        "Introduction to the Special Issue for {SPAA 2018} ---
                 {Part 2}",
  journal =      j-TOPC,
  volume =       "8",
  number =       "2",
  pages =        "6:1--6:1",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3463366",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Aug 24 07:42:49 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3463366",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Pandurangan:2021:DCL,
  author =       "Gopal Pandurangan and Peter Robinson and Michele
                 Scquizzato",
  title =        "On the Distributed Complexity of Large-Scale Graph
                 Computations",
  journal =      j-TOPC,
  volume =       "8",
  number =       "2",
  pages =        "7:1--7:28",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460900",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Aug 24 07:42:49 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460900",
  abstract =     "Motivated by the increasing need to understand the
                 distributed algorithmic foundations of large-scale
                 graph computations, we study some fundamental graph
                 problems in a message-passing model for distributed
                 computing where $ k \geq 2 $ machines jointly perform
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Geissmann:2021:PMC,
  author =       "Barbara Geissmann and Lukas Gianinazzi",
  title =        "Parallel Minimum Cuts in Near-linear Work and Low
                 Depth",
  journal =      j-TOPC,
  volume =       "8",
  number =       "2",
  pages =        "8:1--8:20",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460890",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Aug 24 07:42:49 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460890",
  abstract =     "We present the first near-linear work and
                 poly-logarithmic depth algorithm for computing a
                 minimum cut in an undirected graph. Previous parallel
                 algorithms with poly-logarithmic depth required at
                 least quadratic work in the number of vertices. In a
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Lucarelli:2021:ONP,
  author =       "Giorgio Lucarelli and Benjamin Moseley and Nguyen Kim
                 Thang and Abhinav Srivastav and Denis Trystram",
  title =        "Online Non-preemptive Scheduling on Unrelated Machines
                 with Rejections",
  journal =      j-TOPC,
  volume =       "8",
  number =       "2",
  pages =        "9:1--9:22",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460880",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Aug 24 07:42:49 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460880",
  abstract =     "When a computer system schedules jobs there is
                 typically a significant cost associated with preempting
                 a job during execution. This cost can be incurred from
                 the expensive task of saving the memory's state or from
                 loading data into and out of memory. \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Winblad:2021:LFC,
  author =       "Kjell Winblad and Konstantinos Sagonas and Bengt
                 Jonsson",
  title =        "Lock-free Contention Adapting Search Trees",
  journal =      j-TOPC,
  volume =       "8",
  number =       "2",
  pages =        "10:1--10:38",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460874",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Aug 24 07:42:49 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460874",
  abstract =     "Concurrent key-value stores with range query support
                 are crucial for the scalability and performance of many
                 applications. Existing lock-free data structures of
                 this kind use a fixed synchronization granularity.
                 Using a fixed synchronization granularity \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Green:2021:HSH,
  author =       "Oded Green",
  title =        "{HashGraph} --- Scalable Hash Tables Using a Sparse
                 Graph Data Structure",
  journal =      j-TOPC,
  volume =       "8",
  number =       "2",
  pages =        "11:1--11:17",
  month =        jun,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3460872",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Aug 24 07:42:49 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3460872",
  abstract =     "In this article, we introduce HashGraph, a new
                 scalable approach for building hash tables that uses
                 concepts taken from sparse graph representations ---
                 hence, the name HashGraph. HashGraph introduces a new
                 way to deal with hash-collisions that does not use
                 ``open-addressing'' or ``separate-chaining,'' yet it
                 has the benefits of both these approaches. HashGraph
                 currently works for static inputs. Recent progress with
                 dynamic graph data structures suggests that HashGraph
                 might be extendable to dynamic inputs as well. We show
                 that HashGraph can deal with a large number of hash
                 values per entry without loss of performance. Last, we
                 show a new querying algorithm for value lookups. We
                 experimentally compare HashGraph to several
                 state-of-the-art implementations and find that it
                 outperforms them on average $ 2 \times $ when the
                 inputs are unique and by as much as $ 40 \times $ when
                 the input contains duplicates. The implementation of
                 HashGraph in this article is for NVIDIA GPUs. HashGraph
                 can build a hash table at a rate of 2.5 billion keys
                 per second on a NVIDIA GV100 GPU and can query at
                 nearly the same rate.",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Berenbrink:2021:ISI,
  author =       "Petra Berenbrink",
  title =        "Introduction to the Special Issue for {SPAA 2019}",
  journal =      j-TOPC,
  volume =       "8",
  number =       "3",
  pages =        "12:1--12:1",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3477610",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 21 07:18:25 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3477610",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Behnezhad:2021:MPC,
  author =       "Soheil Behnezhad and Laxman Dhulipala and Hossein
                 Esfandiari and Jakub Lacki and Vahab Mirrokni and
                 Warren Schudy",
  title =        "Massively Parallel Computation via Remote Memory
                 Access",
  journal =      j-TOPC,
  volume =       "8",
  number =       "3",
  pages =        "13:1--13:25",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470631",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 21 07:18:25 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470631",
  abstract =     "We introduce the Adaptive Massively Parallel
                 Computation (AMPC) model, which is an extension of the
                 Massively Parallel Computation (MPC) model. At a high
                 level, the AMPC model strengthens the MPC model by
                 storing all messages sent within a round in a
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Ellen:2021:CLL,
  author =       "Faith Ellen and Barun Gorain and Avery Miller and
                 Andrzej Pelc",
  title =        "Constant-Length Labeling Schemes for Deterministic
                 Radio Broadcast",
  journal =      j-TOPC,
  volume =       "8",
  number =       "3",
  pages =        "14:1--14:17",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470633",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 21 07:18:25 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470633",
  abstract =     "Broadcast is one of the fundamental network
                 communication primitives. One node of a network, called
                 the source, has a message that has to be learned by
                 all other nodes. We consider broadcast in radio
                 networks, modeled as simple undirected connected graphs
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Bender:2021:EMD,
  author =       "Michael A. Bender and Alex Conway and Mart{\'\i}n
                 Farach-Colton and William Jannen and Yizheng Jiao and
                 Rob Johnson and Eric Knorr and Sara Mcallister and
                 Nirjhar Mukherjee and Prashant Pandey and Donald E.
                 Porter and Jun Yuan and Yang Zhan",
  title =        "External-memory Dictionaries in the Affine and {PDAM}
                 Models",
  journal =      j-TOPC,
  volume =       "8",
  number =       "3",
  pages =        "15:1--15:20",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470635",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 21 07:18:25 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470635",
  abstract =     "Storage devices have complex performance profiles,
                 including costs to initiate IOs (e.g., seek times in
                 hard drives), parallelism and bank conflicts (in SSDs),
                 costs to transfer data, and firmware-internal
                 operations. The Disk-access Machine (DAM) model
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Maier:2021:EPC,
  author =       "Matthias Maier and Martin Kronbichler",
  title =        "Efficient Parallel {$3$D} Computation of the
                 Compressible {Euler} Equations with an Invariant-domain
                 Preserving Second-order Finite-element Scheme",
  journal =      j-TOPC,
  volume =       "8",
  number =       "3",
  pages =        "16:1--16:30",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470637",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 21 07:18:25 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470637",
  abstract =     "We discuss the efficient implementation of a
                 high-performance second-order collocation-type
                 finite-element scheme for solving the compressible
                 Euler equations of gas dynamics on unstructured meshes.
                 The solver is based on the convex-limiting technique
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Edwards:2021:SFG,
  author =       "James Edwards and Uzi Vishkin",
  title =        "Study of Fine-grained Nested Parallelism in {CDCL SAT}
                 Solvers",
  journal =      j-TOPC,
  volume =       "8",
  number =       "3",
  pages =        "17:1--17:18",
  month =        sep,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470639",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 21 07:18:25 MDT 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470639",
  abstract =     "Boolean satisfiability (SAT) is an important
                 performance-hungry problem with applications in many
                 problem domains. However, most work on parallelizing
                 SAT solvers has focused on coarse-grained, mostly
                 embarrassing, parallelism. Here, we study fine-grained
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Feuilloley:2021:RLN,
  author =       "Laurent Feuilloley and Pierre Fraigniaud",
  title =        "Randomized Local Network Computing: Derandomization
                 Beyond Locally Checkable Labelings",
  journal =      j-TOPC,
  volume =       "8",
  number =       "4",
  pages =        "18:1--18:25",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470640",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 10 10:52:35 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470640",
  abstract =     "We carry on investigating the line of research
                 questioning the power of randomization for the design
                 of distributed algorithms. In their seminal paper, Naor
                 and Stockmeyer [STOC 1993] established that, in the
                 context of network computing in which all \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Monfared:2021:HTP,
  author =       "Saleh Khalaj Monfared and Omid Hajihassani and Vahid
                 Mohsseni and Dara Rahmati and Saeid Gorgin",
  title =        "A High-throughput Parallel {Viterbi} Algorithm via
                 Bitslicing",
  journal =      j-TOPC,
  volume =       "8",
  number =       "4",
  pages =        "19:1--19:25",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470642",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 10 10:52:35 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470642",
  acknowledgement = ack-nhfb,
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Wang:2021:PBD,
  author =       "Shao-Chung Wang and Lin-Ya Yu and Li-An Her and
                 Yuan-Shin Hwang and Jenq-Kuen Lee",
  title =        "Pointer-Based Divergence Analysis for {OpenCL 2.0}
                 Programs",
  journal =      j-TOPC,
  volume =       "8",
  number =       "4",
  pages =        "20:1--20:23",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3470644",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 10 10:52:35 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3470644",
  abstract =     "A modern GPU is designed with many large thread groups
                 to achieve a high throughput and performance. Within
                 these groups, the threads are grouped into fixed-size
                 SIMD batches in which the same instruction is applied
                 to vectors of data in a lockstep. This \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Kang:2021:AEC,
  author =       "Xuejiao Kang and David F. Gleich and Ahmed Sameh and
                 Ananth Grama",
  title =        "Adaptive Erasure Coded Fault Tolerant Linear System
                 Solver",
  journal =      j-TOPC,
  volume =       "8",
  number =       "4",
  pages =        "21:1--21:19",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3490557",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 10 10:52:35 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3490557",
  abstract =     "As parallel and distributed systems scale, fault
                 tolerance is an increasingly important
                 problem-particularly on systems with limited I/O
                 capacity and bandwidth. Erasure coded computations
                 address this problem by augmenting a given problem
                 instance with \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Jayanti:2021:DCA,
  author =       "Prasad Jayanti and Siddhartha Jayanti",
  title =        "Deterministic Constant-Amortized-{RMR} Abortable Mutex
                 for {CC} and {DSM}",
  journal =      j-TOPC,
  volume =       "8",
  number =       "4",
  pages =        "22:1--22:26",
  month =        dec,
  year =         "2021",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3490559",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Dec 10 10:52:35 MST 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3490559",
  abstract =     "The abortable mutual exclusion problem, proposed by
                 Scott and Scherer in response to the needs in real-time
                 systems and databases, is a variant of mutual exclusion
                 that allows processes to abort from their attempt to
                 acquire the lock. Worst-case constant \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Leinhauser:2022:MDI,
  author =       "Matthew Leinhauser and Ren{\'e} Widera and Sergei
                 Bastrakov and Alexander Debus and Michael Bussmann and
                 Sunita Chandrasekaran",
  title =        "Metrics and Design of an Instruction Roofline Model
                 for {AMD GPUs}",
  journal =      j-TOPC,
  volume =       "9",
  number =       "1",
  pages =        "1:1--1:14",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3505285",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Mar 24 08:01:07 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3505285",
  abstract =     "Due to the recent announcement of the Frontier
                 supercomputer, many scientific application developers
                 are working to make their applications compatible with
                 AMD (CPU-GPU) architectures, which means moving away
                 from the traditional CPU and NVIDIA-GPU \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Axtmann:2022:EPS,
  author =       "Michael Axtmann and Sascha Witt and Daniel Ferizovic
                 and Peter Sanders",
  title =        "Engineering In-place (Shared-memory) Sorting
                 Algorithms",
  journal =      j-TOPC,
  volume =       "9",
  number =       "1",
  pages =        "2:1--2:62",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3505286",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Mar 24 08:01:07 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3505286",
  abstract =     "We present new sequential and parallel sorting
                 algorithms that now represent the fastest known
                 techniques for a wide range of input sizes, input
                 distributions, data types, and machines. Somewhat
                 surprisingly, part of the speed advantage is due to the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Mitchell:2022:BOR,
  author =       "Rory Mitchell and Daniel Stokes and Eibe Frank and
                 Geoffrey Holmes",
  title =        "Bandwidth-Optimal Random Shuffling for {GPUs}",
  journal =      j-TOPC,
  volume =       "9",
  number =       "1",
  pages =        "3:1--3:20",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3505287",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Mar 24 08:01:07 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3505287",
  abstract =     "Linear-time algorithms that are traditionally used to
                 shuffle data on CPUs, such as the method of
                 Fisher-Yates, are not well suited to implementation on
                 GPUs due to inherent sequential dependencies, and
                 existing parallel shuffling algorithms are \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Alomairy:2022:HPU,
  author =       "Rabab Alomairy and Wael Bader and Hatem Ltaief and
                 Youssef Mesri and David Keyes",
  title =        "High-performance {$3$D} Unstructured Mesh Deformation
                 Using Rank Structured Matrix Computations",
  journal =      j-TOPC,
  volume =       "9",
  number =       "1",
  pages =        "4:1--4:23",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3512756",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Mar 24 08:01:07 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3512756",
  abstract =     "The Radial Basis Function (RBF) technique is an
                 interpolation method that produces high-quality
                 unstructured adaptive meshes. However, the RBF-based
                 boundary problem necessitates solving a large dense
                 linear system with cubic arithmetic complexity that is
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Milman-Sela:2022:BLF,
  author =       "Gal Milman-Sela and Alex Kogan and Yossi Lev and
                 Victor Luchangco and Erez Petrank",
  title =        "{BQ}: a Lock-Free Queue with Batching",
  journal =      j-TOPC,
  volume =       "9",
  number =       "1",
  pages =        "5:1--5:49",
  month =        mar,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3512757",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Mar 24 08:01:07 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3512757",
  abstract =     "Concurrent data structures provide fundamental
                 building blocks for concurrent programming. Standard
                 concurrent data structures may be extended by allowing
                 a sequence of operations to be submitted as a batch for
                 later execution. A sequence of such \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Rinberg:2022:FCD,
  author =       "Arik Rinberg and Alexander Spiegelman and Edward
                 Bortnikov and Eshcar Hillel and Idit Keidar and Lee
                 Rhodes and Hadar Serviansky",
  title =        "Fast Concurrent Data Sketches",
  journal =      j-TOPC,
  volume =       "9",
  number =       "2",
  pages =        "6:1--6:35",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3512758",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 2 10:15:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3512758",
  abstract =     "Data sketches are approximate succinct summaries of
                 long data streams. They are widely used for processing
                 massive amounts of data and answering statistical
                 queries about it. Existing libraries producing sketches
                 are very fast, but do not allow \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Blelloch:2022:JPB,
  author =       "Guy Blelloch and Daniel Ferizovic and Yihan Sun",
  title =        "Joinable Parallel Balanced Binary Trees",
  journal =      j-TOPC,
  volume =       "9",
  number =       "2",
  pages =        "7:1--7:41",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3512769",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 2 10:15:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3512769",
  abstract =     "In this article, we show how a single function, join,
                 can be used to implement parallel balanced binary
                 search trees (BSTs) simply and efficiently. Based on
                 join, our approach applies to multiple balanced tree
                 data structures, and a variety of functions \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Chen:2022:FFG,
  author =       "Yuedan Chen and Guoqing Xiao and Kenli Li and
                 Francesco Piccialli and Albert Y. Zomaya",
  title =        "{fgSpMSpV}: a Fine-grained Parallel {SpMSpV} Framework
                 on {HPC} Platforms",
  journal =      j-TOPC,
  volume =       "9",
  number =       "2",
  pages =        "8:1--8:29",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3512770",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 2 10:15:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3512770",
  abstract =     "Sparse matrix--sparse vector (SpMSpV) multiplication
                 is one of the fundamental and important operations in
                 many high-performance scientific and engineering
                 applications. The inherent irregularity and poor data
                 locality lead to two main challenges to to scaling
                 SpMSpV over high-performance computing (HPC) systems:
                 (i) a large amount of redundant data limits the
                 utilization of bandwidth and parallel resources; (ii)
                 the irregular access pattern limits the exploitation of
                 computing resources. This paper proposes a fine-grained
                 parallel SpMSpV (fgSpMSpV) framework on Sunway
                 TaihuLight supercomputer to alleviate the challenges
                 for large-scale real-world applications. First,
                 fgSpMSpV adopts an MPI+OpenMP+X parallelization model
                 to exploit the multi-stage and hybrid parallelism of
                 heterogeneous HPC architectures and accelerate both
                 pre-/post-processing and main SpMSpV computation.
                 Second, fgSpMSpV utilizes an adaptive parallel
                 execution to reduce the pre-processing, adapt to the
                 parallelism and memory hierarchy of the Sunway system,
                 while still tame redundant and random memory accesses
                 in SpMSpV, including a set of techniques like the
                 fine-grained partitioner, re-collection method, and
                 Compressed Sparse Column Vector (CSCV) matrix format.
                 Third, fgSpMSpV uses several optimization techniques to
                 further utilize the computing resources. fgSpMSpV on
                 the Sunway TaihuLight gains a noticeable performance
                 improvement from the key optimization techniques with
                 various sparsity of the input. Additionally, fgSpMSpV
                 is implemented on an NVIDIA Tesal P100 GPU and applied
                 to the breath-first-search (BFS) application. fgSpMSpV
                 on a P100 GPU obtains the speedup of up to 134.38 $
                 \times $ over the state-of-the-art SpMSpV algorithms,
                 and the BFS application using fgSpMSpV achieves the
                 speedup of up to 21.68 $ \times $ over the
                 state-of-the-arts.",
  acknowledgement = ack-nhfb,
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Liu:2022:SCC,
  author =       "Sixue Cliff Liu and Robert Endre Tarjan",
  title =        "Simple Concurrent Connected Components Algorithms",
  journal =      j-TOPC,
  volume =       "9",
  number =       "2",
  pages =        "9:1--9:26",
  month =        jun,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543546",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 2 10:15:52 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543546",
  abstract =     "We study a class of simple algorithms for concurrently
                 computing the connected components of an n-vertex,
                 m-edge graph. Our algorithms are easy to implement in
                 either the COMBINING CRCW PRAM or the MPC computing
                 model. For two related algorithms in this \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Alabandi:2022:ISQ,
  author =       "Ghadeer Alabandi and Martin Burtscher",
  title =        "Improving the Speed and Quality of Parallel Graph
                 Coloring",
  journal =      j-TOPC,
  volume =       "9",
  number =       "3",
  pages =        "10:1--10:35",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543545",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 20 09:34:53 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543545",
  abstract =     "Graph coloring assigns a color to each vertex of a
                 graph such that no two adjacent vertices get the same
                 color. It is a key building block in many applications.
                 In practice, solutions that require fewer distinct
                 colors and that can be computed faster are \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Nguyen:2022:DIC,
  author =       "Hung K. Nguyen and Xuan-Tu Tran",
  title =        "Design and Implementation of a Coarse-grained
                 Dynamically Reconfigurable Multimedia Accelerator",
  journal =      j-TOPC,
  volume =       "9",
  number =       "3",
  pages =        "11:1--11:23",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543544",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 20 09:34:53 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543544",
  abstract =     "This article proposes and implements a Coarse-grained
                 dynamically Reconfigurable Architecture, named
                 Reconfigurable Multimedia Accelerator (REMAC). REMAC
                 architecture is driven by the pipelined
                 multi-instruction-multi-data execution model for
                 exploiting \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Anju:2022:MID,
  author =       "M. A. Anju and Rupesh Nasre",
  title =        "Multi-Interval {DomLock}: Toward Improving Concurrency
                 in Hierarchies",
  journal =      j-TOPC,
  volume =       "9",
  number =       "3",
  pages =        "12:1--12:27",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543543",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 20 09:34:53 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543543",
  abstract =     "Locking has been a predominant technique depended upon
                 for achieving thread synchronization and ensuring
                 correctness in multi-threaded applications. It has been
                 established that the concurrent applications working
                 with hierarchical data witness \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Ji:2022:IPI,
  author =       "Yuede Ji and Hang Liu and Yang Hu and H. Howie Huang",
  title =        "{iSpan}: Parallel Identification of Strongly Connected
                 Components with Spanning Trees",
  journal =      j-TOPC,
  volume =       "9",
  number =       "3",
  pages =        "13:1--13:27",
  month =        sep,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3543542",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Tue Sep 20 09:34:53 MDT 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3543542",
  abstract =     "Detecting strongly connected components (SCCs) in a
                 directed graph is crucial for understanding the
                 structure of graphs. Most real-world graphs have one
                 large SCC that contains the majority of the vertices as
                 well as many small SCCs whose sizes are \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Benoit:2022:CWY,
  author =       "Anne Benoit and Luca Perotin and Yves Robert and
                 Hongyang Sun",
  title =        "Checkpointing Workflows {\`a} la {Young\slash Daly} Is
                 Not Good Enough",
  journal =      j-TOPC,
  volume =       "9",
  number =       "4",
  pages =        "14:1--14:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3548607",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Apr 5 10:44:10 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3548607",
  abstract =     "This article revisits checkpointing strategies when
                 workflows composed of multiple tasks execute on a
                 parallel platform. The objective is to minimize the
                 expectation of the total execution time. For a single
                 task, the Young\slash Daly formula provides the
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Albers:2022:OAR,
  author =       "Susanne Albers and Jens Quedenfeld",
  title =        "Optimal Algorithms for Right-sizing Data Centers",
  journal =      j-TOPC,
  volume =       "9",
  number =       "4",
  pages =        "15:1--15:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565513",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Apr 5 10:44:10 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565513",
  abstract =     "Electricity cost is a dominant and rapidly growing
                 expense in data centers. Unfortunately, much of the
                 consumed energy is wasted, because servers are idle for
                 extended periods of time. We study a capacity
                 management problem that dynamically right-sizes a
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Kappes:2022:FRC,
  author =       "Giorgos Kappes and Stergios V. Anastasiadis",
  title =        "A Family of Relaxed Concurrent Queues for Low-Latency
                 Operations and Item Transfers",
  journal =      j-TOPC,
  volume =       "9",
  number =       "4",
  pages =        "16:1--16:??",
  month =        dec,
  year =         "2022",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565514",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Apr 5 10:44:10 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565514",
  abstract =     "The producer-consumer communication over shared memory
                 is a critical function of current scalable systems.
                 Queues that provide low latency and high throughput on
                 highly utilized systems can improve the overall
                 performance perceived by the end users. In \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Sundaram:2023:NOH,
  author =       "Prasannabalaji Sundaram and Aditi Sengupta and Vajjala
                 K. Suman and Tapan K. Sengupta",
  title =        "Non-overlapping High-accuracy Parallel Closure for
                 Compact Schemes: Application in Multiphysics and
                 Complex Geometry",
  journal =      j-TOPC,
  volume =       "10",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580005",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Apr 5 10:44:11 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580005",
  abstract =     "Compact schemes are often preferred in performing
                 scientific computing for their superior spectral
                 resolution. Error-free parallelization of a compact
                 scheme is a challenging task due to the requirement of
                 additional closures at the inter-processor \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Lockhart:2023:PAO,
  author =       "Shelby Lockhart and Amanda Bienz and William Gropp and
                 Luke Olson",
  title =        "Performance Analysis and Optimal Node-aware
                 Communication for Enlarged Conjugate Gradient Methods",
  journal =      j-TOPC,
  volume =       "10",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580003",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Apr 5 10:44:11 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580003",
  abstract =     "Krylov methods are a key way of solving large sparse
                 linear systems of equations but suffer from poor strong
                 scalability on distributed memory machines. This is due
                 to high synchronization costs from large numbers of
                 collective communication calls \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Munch:2023:EDM,
  author =       "Peter Munch and Timo Heister and Laura Prieto Saavedra
                 and Martin Kronbichler",
  title =        "Efficient Distributed Matrix-free Multigrid Methods on
                 Locally Refined Meshes for {FEM} Computations",
  journal =      j-TOPC,
  volume =       "10",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580314",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Apr 5 10:44:11 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580314",
  abstract =     "This work studies three multigrid variants for
                 matrix-free finite-element computations on locally
                 refined meshes: geometric local smoothing, geometric
                 global coarsening (both h -multigrid), and polynomial
                 global coarsening (a variant of p -multigrid). We
                 \ldots{}",
  acknowledgement = ack-nhfb,
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Klein:2023:TGL,
  author =       "Christoph Klein and Robert Strzodka",
  title =        "{Tridigpu}: a {GPU} Library for Block Tridiagonal and
                 Banded Linear Equation Systems",
  journal =      j-TOPC,
  volume =       "10",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580373",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Apr 5 10:44:11 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
                 https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580373",
  abstract =     "In this article, we present a CUDA library with a C
                 API for solving block cyclic tridiagonal and banded
                 systems on one GPU. The library can process block
                 tridiagonal systems with block sizes from $ 1 \times 1
                 $ (scalar) to $ 4 \times 4 $ and banded systems with up
                 to four sub- and superdiagonals. For the
                 compute-intensive block size cases and cases with many
                 right-hand sides, we write out an explicit
                 factorization to memory; however, for the scalar case,
                 the fastest approach is to only output the coarse
                 system and recompute the factorization. Prominent
                 features of the library are (scaled) partial pivoting
                 for improved numeric stability; highest-performance
                 kernels, which completely utilize GPU memory bandwidth;
                 and support for multiple sparse or dense right-hand
                 side and solution vectors. The additional memory
                 consumption is only 5\% of the original tridiagonal
                 system, which enables the solution of systems up to GPU
                 memory size. The performance of the state-of-the-art
                 scalar tridiagonal solver of cuSPARSE is outperformed
                 by factor 5 for large problem sizes of $ 2^{25} $
                 unknowns, on a GeForce RTX 2080 Ti.",
  acknowledgement = ack-nhfb,
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Lakhotia:2023:PPB,
  author =       "Kartik Lakhotia and Rajgopal Kannan and Viktor
                 Prasanna",
  title =        "Parallel Peeling of Bipartite Networks for
                 Hierarchical Dense Subgraph Discovery",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "5:1--5:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3583084",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3583084",
  abstract =     "Wing and Tip decomposition are motif-based analytics
                 for bipartite graphs that construct a hierarchy of
                 butterfly (2,2-biclique) dense edge and vertex induced
                 subgraphs, respectively. They have applications in
                 several domains, including e-commerce, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Zheng:2023:DGD,
  author =       "Weijian Zheng and Dali Wang and Fengguang Song",
  title =        "A Distributed-{GPU} Deep Reinforcement Learning System
                 for Solving Large Graph Optimization Problems",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "6:1--6:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3589188",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3589188",
  abstract =     "Graph optimization problems (such as minimum vertex
                 cover, maximum cut, traveling salesman problems) appear
                 in many fields including social sciences, power
                 systems, chemistry, and bioinformatics. Recently, deep
                 reinforcement learning (DRL) has shown \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Brown:2023:PED,
  author =       "Andrew D. Brown and Jonathan R. Beaumont and David B.
                 Thomas and Julian C. Shillcock and Matthew F. Naylor
                 and Graeme M. Bragg and Mark L. Vousden and Simon W.
                 Moore and Shane T. Fleming",
  title =        "{POETS}: an Event-driven Approach to Dissipative
                 Particle Dynamics: Implementing a Massively
                 Compute-intensive Problem on a Novel Hard\slash
                 Software Architecture.",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "7:1--7:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580372",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580372",
  abstract =     "HPC clusters have become ever more expensive, both in
                 terms of capital cost and energy consumption; some
                 estimates suggest that competitive installations at the
                 end of the next decade will require their own power
                 station. One way around this looming \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "7",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Aleeva:2023:IIP,
  author =       "Valentina Aleeva and Rifkhat Aleev",
  title =        "Investigation and Implementation of Parallelism
                 Resources of Numerical Algorithms",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "8:1--8:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3583755",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3583755",
  abstract =     "This article is devoted to an approach to solving a
                 problem of the efficiency of parallel computing. The
                 theoretical basis of this approach is the concept of a
                 Q -determinant. Any numerical algorithm has a Q
                 -determinant. The Q -determinant of the algorithm
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "8",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Wang:2023:HPC,
  author =       "Haotian Wang and Wangdong Yang and Renqiu Ouyang and
                 Rong Hu and Kenli Li and Keqin Li",
  title =        "A Heterogeneous Parallel Computing Approach Optimizing
                 {SpTTM} on {CPU-GPU} via {GCN}",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "9:1--9:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3584373",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3584373",
  abstract =     "Sparse Tensor-Times-Matrix (SpTTM) is the core
                 calculation in tensor analysis. The sparse
                 distributions of different tensors vary greatly, which
                 poses a big challenge to designing efficient and
                 general SpTTM. In this paper, we describe SpTTM on
                 CPU-GPU \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "9",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Miao:2023:PIT,
  author =       "Zheng Miao and Jon C. Calhoun and Rong Ge and Jiajia
                 Li",
  title =        "Performance Implication of Tensor Irregularity and
                 Optimization for Distributed Tensor Decomposition",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "10:1--10:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3580315",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3580315",
  abstract =     "Tensors are used by a wide variety of applications to
                 represent multi-dimensional data; tensor decompositions
                 are a class of methods for latent data analytics, data
                 compression, and so on. Many of these applications
                 generate large tensors with irregular \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "10",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Hesselink:2023:MLS,
  author =       "Wim H. Hesselink and Peter A. Buhr",
  title =        "{MCSH}, a Lock with the Standard Interface",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "11:1--11:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3584696",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3584696",
  abstract =     "The MCS lock of Mellor-Crummey and Scott (1991), 23
                 pages. is a very efficient first-come first-served
                 mutual-exclusion algorithm that uses the atomic
                 hardware primitives fetch-and-store and
                 compare-and-swap. However, it has the disadvantage that
                 the \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "11",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Zamani:2023:GEE,
  author =       "Hadi Zamani and Laxmi Bhuyan and Jieyang Chen and
                 Zizhong Chen",
  title =        "{GreenMD}: Energy-efficient Matrix Decomposition on
                 Heterogeneous Multi-{GPU} Systems",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "12:1--12:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3583590",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3583590",
  abstract =     "The current trend of performance growth in HPC systems
                 is accompanied by a massive increase in energy
                 consumption. In this article, we introduce GreenMD, an
                 energy-efficient framework for heterogeneous systems
                 for LU factorization utilizing multi-GPUs. LU
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "12",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Kamenev:2023:FSA,
  author =       "Aleksandar Kamenev and Dariusz R. Kowalski and Miguel
                 A. Mosteiro",
  title =        "Faster Supervised Average Consensus in Adversarial and
                 Stochastic Anonymous Dynamic Networks",
  journal =      j-TOPC,
  volume =       "10",
  number =       "2",
  pages =        "13:1--13:??",
  month =        jun,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3593426",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:36 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3593426",
  abstract =     "How do we reach consensus on an average value in a
                 dynamic crowd without revealing identity? In this work,
                 we study the problem of average network consensus in
                 Anonymous Dynamic Networks (ADN). Network dynamicity is
                 specified by the sequence of topology-. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "13",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Baruah:2023:CCF,
  author =       "Sanjoy Baruah and Alberto Marchetti-Spaccamela",
  title =        "The Computational Complexity of Feasibility Analysis
                 for Conditional {DAG} Tasks",
  journal =      j-TOPC,
  volume =       "10",
  number =       "3",
  pages =        "14:1--14:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3606342",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3606342",
  abstract =     "The Conditional DAG (CDAG) task model is used for
                 modeling multiprocessor real-time systems containing
                 conditional expressions for which outcomes are not
                 known prior to their evaluation. Feasibility analysis
                 for CDAG tasks upon multiprocessor platforms is
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "14",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Blanusa:2023:FPA,
  author =       "Jovan Blanusa and Kubilay Atasu and Paolo Ienne",
  title =        "Fast Parallel Algorithms for Enumeration of Simple,
                 Temporal, and Hop-constrained Cycles",
  journal =      j-TOPC,
  volume =       "10",
  number =       "3",
  pages =        "15:1--15:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3611642",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3611642",
  abstract =     "Cycles are one of the fundamental subgraph patterns
                 and being able to enumerate them in graphs enables
                 important applications in a wide variety of fields,
                 including finance, biology, chemistry, and network
                 science. However, to enable cycle enumeration in
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "15",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Alvermann:2023:OLP,
  author =       "Andreas Alvermann and Georg Hager and Holger Fehske",
  title =        "Orthogonal Layers of Parallelism in Large-Scale
                 Eigenvalue Computations",
  journal =      j-TOPC,
  volume =       "10",
  number =       "3",
  pages =        "16:1--16:??",
  month =        sep,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3614444",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Fri Sep 29 08:18:37 MDT 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3614444",
  abstract =     "We address the communication overhead of distributed
                 sparse matrix-(multiple)-vector multiplication in the
                 context of large-scale eigensolvers, using filter
                 diagonalization as an example. The basis of our study
                 is a performance model, which includes a communication
                 metric that is computed directly from the matrix
                 sparsity pattern without running any code. The
                 performance model quantifies to which extent
                 scalability and parallel efficiency are lost due to
                 communication overhead.\par

                 To restore scalability, we identify two orthogonal
                 layers of parallelism in the filter diagonalization
                 technique. In the horizontal layer the rows of the
                 sparse matrix are distributed across individual
                 processes. In the vertical layer bundles of multiple
                 vectors are distributed across separate process groups.
                 An analysis in terms of the communication metric
                 predicts that scalability can be restored if, and only
                 if, one implements the two orthogonal layers of
                 parallelism via different distributed vector
                 layouts.\par

                 Our theoretical analysis is corroborated by benchmarks
                 for application matrices from quantum and solid state
                 physics, road networks, and nonlinear programming. We
                 finally demonstrate the benefits of using orthogonal
                 layers of parallelism with two exemplary application
                 cases --- an exciton and a strongly correlated electron
                 system --- which incur either small or large
                 communication overhead.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "16",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Azar:2023:ISI,
  author =       "Yossi Azar and Julian Shun",
  title =        "Introduction to the Special Issue for {SPAA'21}",
  journal =      j-TOPC,
  volume =       "10",
  number =       "4",
  pages =        "17:1--17:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3630608",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Dec 21 10:57:17 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630608",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "17",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Anderson:2023:PMC,
  author =       "Daniel Anderson and Guy E. Blelloch",
  title =        "Parallel Minimum Cuts in {$ O(m \log_2 n) $} Work and
                 Low Depth",
  journal =      j-TOPC,
  volume =       "10",
  number =       "4",
  pages =        "18:1--18:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3565557",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Dec 21 10:57:17 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3565557",
  abstract =     "We present a randomized $O(m \log^2 n)$ work,
                 $O(\polylog n)$ depth parallel algorithm for minimum
                 cut. This algorithm matches the work bounds of a recent
                 sequential algorithm by Gawrychowski, Mozes, and
                 Weimann [ICALP'20], and improves on the previously best
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "18",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Im:2023:NCS,
  author =       "Sungjin Im and Ravi Kumar and Mahshid Montazer Qaem
                 and Manish Purohit",
  title =        "Non-clairvoyant Scheduling with Predictions",
  journal =      j-TOPC,
  volume =       "10",
  number =       "4",
  pages =        "19:1--19:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3593969",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Dec 21 10:57:17 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3593969",
  abstract =     "In the single-machine non-clairvoyant scheduling
                 problem, the goal is to minimize the total completion
                 time of jobs whose processing times are unknown a
                 priori. We revisit this well-studied problem and
                 consider the question of how to effectively use (.
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "19",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Albers:2023:ARS,
  author =       "Susanne Albers and Jens Quedenfeld",
  title =        "Algorithms for Right-sizing Heterogeneous Data
                 Centers",
  journal =      j-TOPC,
  volume =       "10",
  number =       "4",
  pages =        "20:1--20:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3595286",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Dec 21 10:57:17 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3595286",
  abstract =     "Power consumption is a dominant and still growing cost
                 factor in data centers. In time periods with low load,
                 the energy consumption can be reduced by powering down
                 unused servers. We resort to a model introduced by Lin,
                 Wierman, Andrew, and Thereska [23, \ldots{}]",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "20",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Maus:2023:DGC,
  author =       "Yannic Maus",
  title =        "Distributed Graph Coloring Made Easy",
  journal =      j-TOPC,
  volume =       "10",
  number =       "4",
  pages =        "21:1--21:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3605896",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Dec 21 10:57:17 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3605896",
  abstract =     "In this article, we present a deterministic $ \mathsf
                 {CONGEST} $ algorithm to compute an $ O(k
                 \Delta)$-vertex coloring in $ O(\Delta / k) + \log^* n$
                 rounds, where $ \Delta $ is the maximum degree of the
                 network graph and $ k \geq 1$ can be freely chosen. The
                 algorithm is extremely simple: \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "21",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Ahmad:2023:FAA,
  author =       "Zafar Ahmad and Rezaul Chowdhury and Rathish Das and
                 Pramod Ganapathi and Aaron Gregory and Yimin Zhu",
  title =        "A Fast Algorithm for Aperiodic Linear Stencil
                 Computation using {Fast Fourier Transforms}",
  journal =      j-TOPC,
  volume =       "10",
  number =       "4",
  pages =        "22:1--22:??",
  month =        dec,
  year =         "2023",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3606338",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Thu Dec 21 10:57:17 MST 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3606338",
  abstract =     "Stencil computations are widely used to simulate the
                 change of state of physical systems across a
                 multidimensional grid over multiple timesteps. The
                 state-of-the-art techniques in this area fall into
                 three groups: cache-aware tiled looping algorithms,
                 cache-oblivious divide-and-conquer trapezoidal
                 algorithms, and Krylov subspace methods.

                 In this article, we present two efficient parallel
                 algorithms for performing linear stencil computations.
                 Current direct solvers in this domain are
                 computationally inefficient, and Krylov methods require
                 manual labor and mathematical training. We solve these
                 problems for linear stencils by using discrete Fourier
                 transforms preconditioning on a Krylov method to
                 achieve a direct solver that is both fast and general.
                 Indeed, while all currently available algorithms for
                 solving general linear stencils perform (NT) work,
                 where N is the size of the spatial grid and T is the
                 number of timesteps, our algorithms perform o(NT)
                 work.

                 To the best of our knowledge, we give the first
                 algorithms that use fast Fourier transforms to compute
                 final grid data by evolving the initial data for many
                 timesteps at once. Our algorithms handle both periodic
                 and aperiodic boundary conditions and achieve
                 polynomially better performance bounds (i.e.,
                 computational complexity and parallel runtime) than all
                 other existing solutions. Initial experimental results
                 show that implementations of our algorithms that evolve
                 grids of roughly 107 cells for around 105 timesteps run
                 orders of magnitude faster than state-of-the-art
                 implementations for periodic stencil problems, and $
                 1.3 \times $ to $ 8.5 \times $ faster for aperiodic
                 stencil problems.",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "22",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Benoit:2024:CST,
  author =       "Anne Benoit and Lucas Perotin and Yves Robert and
                 Fr{\'e}d{\'e}ric Vivien",
  title =        "Checkpointing Strategies to Tolerate Non-Memoryless
                 Failures on {HPC} Platforms",
  journal =      j-TOPC,
  volume =       "11",
  number =       "1",
  pages =        "1:1--1:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3624560",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Mar 13 07:22:23 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3624560",
  abstract =     "This article studies checkpointing strategies for
                 parallel applications subject to failures. The optimal
                 strategy to minimize total execution time, or makespan,
                 is well known when failure IATs obey an Exponential
                 distribution, but it is unknown for non-. \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "1",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Perotin:2024:IOS,
  author =       "Lucas Perotin and Hongyang Sun",
  title =        "Improved Online Scheduling of Moldable Task Graphs
                 under Common Speedup Models",
  journal =      j-TOPC,
  volume =       "11",
  number =       "1",
  pages =        "2:1--2:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3630052",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Mar 13 07:22:23 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630052",
  abstract =     "We consider the online scheduling problem of moldable
                 task graphs on multiprocessor systems for minimizing
                 the overall completion time (or makespan). Moldable job
                 scheduling has been widely studied in the literature,
                 in particular when tasks have \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "2",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Lin:2024:HCH,
  author =       "Shengle Lin and Wangdong Yang and Yikun Hu and Qinyun
                 Cai and Minlu Dai and Haotian Wang and Kenli Li",
  title =        "{HPS Cholesky}: Hierarchical Parallelized Supernodal
                 {Cholesky} with Adaptive Parameters",
  journal =      j-TOPC,
  volume =       "11",
  number =       "1",
  pages =        "3:1--3:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3630051",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Mar 13 07:22:23 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3630051",
  abstract =     "Sparse supernodal Cholesky on multi-NUMAs is
                 challenging due to the supernode relaxation and load
                 balancing. In this work, we propose a novel approach to
                 improve the performance of sparse Cholesky by combining
                 deep learning with a relaxation parameter and
                 \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "3",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Marotta:2024:CRL,
  author =       "Romolo Marotta and Mauro Ianni and Alessandro
                 Pellegrini and Francesco Quaglia",
  title =        "A Conflict-Resilient Lock-Free Linearizable Calendar
                 Queue",
  journal =      j-TOPC,
  volume =       "11",
  number =       "1",
  pages =        "4:1--4:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3635163",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Mar 13 07:22:23 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3635163",
  abstract =     "In the last two decades, great attention has been
                 devoted to the design of non-blocking and linearizable
                 data structures, which enable exploiting the scaled-up
                 degree of parallelism in off-the-shelf shared-memory
                 multi-core machines. In this context, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "4",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Muller:2024:MAE,
  author =       "Stefan K. Muller and Jan Hoffmann",
  title =        "Modeling and Analyzing Evaluation Cost of {CUDA}
                 Kernels",
  journal =      j-TOPC,
  volume =       "11",
  number =       "1",
  pages =        "5:1--5:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3639403",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Mar 13 07:22:23 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3639403",
  abstract =     "Motivated by the increasing importance of
                 general-purpose Graphic Processing Units (GPGPU)
                 programming, exemplified by NVIDIA's CUDA framework, as
                 well as the difficulty, especially for novice
                 programmers, of reasoning about performance in GPGPU
                 kernels, \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "5",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}

@Article{Cai:2024:AAB,
  author =       "Qinyun Cai and Guoqing Xiao and Shengle Lin and
                 Wangdong Yang and Keqin Li and Kenli Li",
  title =        "{ABSS}: an Adaptive Batch-Stream Scheduling Module for
                 Dynamic Task Parallelism on Chiplet-based Multi-Chip
                 Systems",
  journal =      j-TOPC,
  volume =       "11",
  number =       "1",
  pages =        "6:1--6:??",
  month =        mar,
  year =         "2024",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1145/3643597",
  ISSN =         "2329-4949 (print), 2329-4957 (electronic)",
  ISSN-L =       "2329-4949",
  bibdate =      "Wed Mar 13 07:22:23 MDT 2024",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/topc.bib",
  URL =          "https://dl.acm.org/doi/10.1145/3643597",
  abstract =     "Thanks to the recognition and promotion of
                 chiplet-based High-Performance Computing (HPC) system
                 design technology by semiconductor industry/market
                 leaders, chiplet-based multi-chip systems have
                 gradually become the mainstream. Unfortunately,
                 programming \ldots{}",
  acknowledgement = ack-nhfb,
  ajournal =     "ACM Trans. Parallel Comput.",
  articleno =    "6",
  fjournal =     "ACM Transactions on Parallel Computing",
  journal-URL =  "https://dl.acm.org/loi/topc",
}