%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.09",
%%%     date            = "12 April 2024",
%%%     time            = "08:51:59 MST",
%%%     filename        = "ieeecomputarchitlett.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "https://www.math.utah.edu/~beebe",
%%%     checksum        = "48775 33292 170290 1740324",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "BibTeX; bibliography; IEEE Computer
%%%                        Architecture Letters",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE bibliography of
%%%                        publications in the journal IEEE Computer
%%%                        Architecture Letters (CODEN none, ISSN
%%%                        1556-6056 (print), 1556-6064 (electronic)).
%%%                        Publication began with volume 1, number 1,
%%%                        in January 2002, and there was only one
%%%                        issue per annual volume through 2005.  Since
%%%                        volume 5 (2006), there are only two issues
%%%                        per volume.
%%%
%%%                        The journal has Web sites at
%%%
%%%                            https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208
%%%                            https://ieeexplore.ieee.org/xpl/issues?punumber=10208&isnumber=8610345
%%%
%%%                        At version 1.09, the COMPLETE year coverage
%%%                        looked like this:
%%%
%%%                             2002 (  12)    2010 (  32)    2018 (  61)
%%%                             2003 (   7)    2011 (  25)    2019 (  45)
%%%                             2004 (   9)    2012 (  27)    2020 (  42)
%%%                             2005 (   2)    2013 (  29)    2021 (  45)
%%%                             2006 (  18)    2014 (  36)    2022 (  38)
%%%                             2007 (  14)    2015 (  52)    2023 (  41)
%%%                             2008 (  21)    2016 (  49)    2024 (  18)
%%%                             2009 (  34)    2017 (  42)
%%%
%%%                             Article:        699
%%%
%%%                             Total entries:  699
%%%
%%%                        Data for this bibliography have been derived
%%%                        primarily from the publisher Web site, and
%%%                        from the Web of Science Web site.
%%%
%%%                        Numerous errors in the Web sources noted
%%%                        above have been corrected.  Spelling has been
%%%                        verified with the UNIX spell and GNU ispell
%%%                        programs using the exception dictionary
%%%                        stored in the companion file with extension
%%%                        .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
%%% Provide a fallback definition of \booktitle (emphasized text) for
%%% BibTeX styles that do not define it themselves.
@Preamble{ "\ifx \undefined \booktitle \def \booktitle#1{{{\em #1}}} \fi" }

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% Standard acknowledgement text; referenced (unquoted) by the
%%% acknowledgement field of every entry in this bibliography.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|https://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
%%% Full journal name; referenced (unquoted) by the journal field of
%%% every entry in this bibliography.
@String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"}

%%% ====================================================================
%%% Bibliography entries, sorted in publication order with ``bibsort
%%% --byvolume'':
%%% NOTE(review): the abstract below appears to be reproduced verbatim
%%% from the IEEE Xplore source, including its grammatical
%%% irregularities and missing final period -- left unchanged.
@Article{Alvarez:2002:IRF,
  author =       "C. Alvarez and J. Corbal and E. Salami and M. Valero",
  title =        "Initial Results on Fuzzy Floating Point Computation
                 for Multimedia Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "During the recent years the market of mid low end
                 portable systems such as PDAs or mobile digital phones
                 have experimented a revolution in both selling volume
                 and features as handheld devices incorporate Multimedia
                 applications. This fact brings to an increase in the
                 computational demands of the devices while still having
                 the limitation of power and energy consumption.
                 Instruction memoization is a promising technique to
                 help alleviate the problem of power consumption of
                 expensive functional units such as the floating point
                 one. Unfortunately this technique could be energy
                 inefficient for low end systems due to the additional
                 power consumption of the relatively big tables
                 required. In this paper we present a novel way of
                 understanding multimedia floating point operations
                 based on the fuzzy computation paradigm losses in the
                 computation precision may exchange performance for
                 negligible errors in the output. Exploiting the
                 implicit characteristics of media FP computation we
                 propose a new technique called fuzzy memoization. Fuzzy
                 memoization expands the capabilities of classic
                 memoization by attaching entries with similar inputs to
                 the same output. We present a case of study for a SH
                 like processor and report good performance and power
                 delay improvements with feasible hardware
                 requirements",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Delay; Energy consumption; Fuzzy systems; Handheld
                 computers; Joining processes; Mobile computing;
                 Multimedia systems; Performance loss; Personal digital
                 assistants; Portable computers",
}

%%% Loop-cache tuning for fixed-program embedded systems (v1n1, p. 2).
@Article{Gordon-Ross:2002:EFP,
  author =       "A. Gordon-Ross and S. Cotterell and F. Vahid",
  title =        "Exploiting Fixed Programs in Embedded Systems: a Loop
                 Cache Example",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Embedded systems commonly execute one program for
                 their lifetime. Designing embedded system architectures
                 with configurable components, such that those
                 components can be tuned to that one program based on a
                 program pre-analysis, can yield significant power and
                 performance benefits. We illustrate such benefits by
                 designing a loop cache specifically with tuning in
                 mind. Our results show a 70\% reduction in instruction
                 memory access, for MIPS and 8051 processors
                 representing twice the reduction from a regular loop
                 cache, translating to good power savings.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecture tuning; Computer architecture; Computer
                 science; Costs; Digital cameras; Embedded computing;
                 Embedded system; embedded systems.; fixed program; Loop
                 cache; low power; Microcomputers; Microprocessor chips;
                 Portable computers; Power engineering computing",
}

%%% Two-level banked TLB design for low-power embedded processors
%%% (v1n1, p. 3).
@Article{Choi:2002:LPT,
  author =       "Jin-Hyuck Choi and Jung-Hoon Lee and Seh-Woong Jeong
                 and Shin-Dug Kim and C. Weems",
  title =        "A Low Power {TLB} Structure for Embedded Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "3--3",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present a new two-level TLB (translation look-aside
                 buffer) architecture that integrates a 2-way banked
                 filter TLB with a 2-way banked main TLB. The objective
                 is to reduce power consumption in embedded processors
                 by distributing the accesses to TLB entries across the
                 banks in a balanced manner. First, an advanced
                 filtering technique is devised to reduce access power
                 by adopting a sub-bank structure. Second, a
                 bank-associative structure is applied to each level of
                 the TLB hierarchy. Simulation results show that the
                 Energy*Delay product can be reduced by about 40.9\%
                 compared to a fully associative TLB, 24.9\% compared to
                 a micro-TLB with 4+32 entries, and 12.18\% compared to
                 a micro-TLB with 16+32 entries.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bank associative structure; CADCAM; Circuits; Computer
                 aided manufacturing; Degradation; Embedded system;
                 Energy consumption; Filter bank; filter mechanism;
                 Filtering; low power design; Power filters; translation
                 look-aside buffer; Virtual private networks",
}

%%% Worst-case traffic patterns for oblivious routing via bipartite
%%% maximum-weight matching (v1n1, p. 4).
@Article{Towles:2002:WCT,
  author =       "B. Towles and W. J. Dally",
  title =        "Worst-case Traffic for Oblivious Routing Functions",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "4--4",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper presents an algorithm to find a worst-case
                 traffic pattern for any oblivious routing algorithm on
                 an arbitrary interconnection network topology. The
                 linearity of channel loading offered by oblivious
                 routing algorithms enables the problem to be mapped to
                 a bipartite maximum-weight matching, which can be
                 solved in polynomial time for routing functions with a
                 polynomial number of paths. Finding exact worst case
                 performance was previously intractable, and we
                 demonstrate an example case where traditional
                 characterization techniques overestimate the throughput
                 of a particular routing algorithm by 47\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bipartite graph; Linearity; Multiprocessor
                 interconnection networks; Network topology; oblivious
                 routing; Pattern matching; Polynomials; Routing;
                 Telecommunication traffic; Throughput; worst-case
                 throughput",
}

%%% NOTE(review): third author surname corrected from ``Mositz'' to
%%% ``Moritz'' (Csaba Andras Moritz, co-author of the Cool-Fetch work);
%%% missing spaces restored in the abstract (``(ILP) of'',
%%% ``15\% total'').
@Article{Unsal:2002:CFC,
  author =       "O. S. Unsal and C. M. Krishna and C. A. Moritz",
  title =        "{Cool-Fetch}: Compiler-Enabled Power-Aware Fetch
                 Throttling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we present an architecture compiler
                 based approach to reduce energy consumption in the
                 processor. While we mainly target the fetch unit, an
                 important side-effect of our approach is that we obtain
                 energy savings in many other parts in the processor.
                 The explanation is that the fetch unit often runs
                 substantially ahead of execution, bringing in
                 instructions to different stages in the processor that
                 may never be executed. We have found, that although the
                 degree of Instruction Level Parallelism (ILP) of a
                 program tends to vary over time, it can be statically
                 predicted by the compiler with considerable accuracy.
                 Our Instructions Per Clock (IPC) prediction scheme is
                 using a dependence-testing-based analysis and simple
                 heuristics, to guide a front-end fetch-throttling
                 mechanism. We develop the necessary architecture
                 support and include its power overhead. We perform
                 experiments over a wide number of architectural
                 configurations, using SPEC2000 applications. Our
                 results are very encouraging: we obtain up to 15\%
                 total energy savings in the processor with generally
                 little performance degradation. In fact, in some cases
                 our intelligent throttling scheme even increases
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; compiler architecture interaction;
                 Degradation; Energy consumption; fetch-throttling;
                 instruction level parallelism; Low power design;
                 Program processors",
}

%%% NOTE(review): repaired garbled abstract text ``Despite every
%%% conservative assumptions'' to ``Despite very conservative
%%% assumptions''.
@Article{Shang:2002:PEI,
  author =       "Li Shang and L. Peh and N. K. Jha",
  title =        "Power-efficient Interconnection Networks: Dynamic
                 Voltage Scaling with Links",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "6--6",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power consumption is a key issue in high performance
                 interconnection network design. Communication links,
                 already a significant consumer of power now, will take
                 up an ever larger portion of the power budget as demand
                 for network bandwidth increases. In this paper, we
                 motivate the use of dynamic voltage scaling (DVS) for
                 links, where the frequency and voltage of links are
                 dynamically adjusted to minimize power consumption. We
                 propose a history-based DVS algorithm that judiciously
                 adjusts DVS policies based on past link utilization.
                 Despite very conservative assumptions about DVS link
                 characteristics, our approach realizes up to 4.5X power
                 savings (3.2X average), with just an average 27.4\%
                 latency increase and 2.5\% throughput reduction. To the
                 best of our knowledge, this is the first study that
                 targets dynamic power optimization of interconnection
                 networks.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Dynamic voltage scaling; Frequency
                 synthesizers; interconnection network; Multiprocessor
                 interconnection networks; power optimization.;
                 Regulators",
}

%%% NOTE(review): repaired garbled abstract text ``evaluation
%%% parameters pace'' to ``evaluation parameter space'' and
%%% ``work load'' to ``workload'' (matching the spelling used
%%% elsewhere in the same abstract).
@Article{KleinOsowski:2002:MNS,
  author =       "A. J. KleinOsowski and D. J. Lilja",
  title =        "{MinneSPEC}: a New {SPEC} Benchmark Workload for
                 Simulation-Based Computer Architecture Research",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "7--7",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Computer architects must determine how to most
                 effectively use finite computational resources when
                 running simulations to evaluate new architectural
                 ideas. To facilitate efficient simulations with a range
                 of benchmark programs, we have developed the MinneSPEC
                 input set for the SPEC CPU 2000 benchmark suite. This
                 new workload allows computer architects to obtain
                 simulation results in a reasonable time using existing
                 simulators. While the MinneSPEC workload is derived
                 from the standard SPEC CPU 2000 workload, it is a
                 valid benchmark suite in and of itself for
                 simulation-based research. MinneSPEC also may be used
                 to run large numbers of simulations to find ``sweet
                 spots'' in the evaluation parameter space. This small
                 number of promising design points subsequently may be
                 investigated in more detail with the full SPEC
                 reference workload. In the process of developing the
                 MinneSPEC datasets, we quantify its differences in
                 terms of function-level execution patterns, instruction
                 mixes, and memory behaviors compared to the SPEC
                 programs when executed with the reference inputs. We
                 find that for some programs, the MinneSPEC profiles
                 match the SPEC reference dataset program behavior very
                 closely. For other programs, however, the MinneSPEC
                 inputs produce significantly different program
                 behavior. The MinneSPEC workload has been recognized by
                 SPEC and is distributed with Version 1.2 and higher of
                 the SPEC CPU 2000 benchmark suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Computer architecture;
                 Computer simulation",
}

%%% NOTE(review): repaired OCR-garbled abstract text: ``R multi-bank
%%% cache'' to ``A multi-bank cache'' and ``AC through word-interleaving
%%% leads to higher PC'' to ``Although word-interleaving leads to
%%% higher IPC'' -- TODO confirm against the published abstract.
@Article{Vandierendonck:2002:ATC,
  author =       "H. Vandierendonck and K. {De Bosschere}",
  title =        "An Address Transformation Combining Block- and
                 Word-Interleaving",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "8--8",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As future superscalar processors employ higher issue
                 widths, an increasing number of load/store instructions
                 needs to be executed each cycle to sustain high
                 performance. Multi-bank data caches attempt to address
                 this issue in a cost-effective way. A multi-bank cache
                 consists of multiple cache banks that each support one
                 load/store instruction per clock cycle. The
                 interleaving of cache blocks over the banks is of
                 primary importance. Two common choices are
                 block-interleaving and word-interleaving. Although
                 word-interleaving leads to higher IPC, it is more
                 expensive to implement than block-interleaving since it
                 requires the tag array of the cache to be multi-ported.
                 By swapping the bits in the effective address that are
                 used by word-interleaving with those used by
                 block-interleaving, it is possible to implement a
                 word-interleaved cache with the same cost, cycle time
                 and power consumption of a block interleaved cache.
                 Because this makes the L1 data cache blocks sparse,
                 additional costs are incurred at different levels of
                 the memory hierarchy.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Block-Interleaving; Clocks; Costs; Data cache; Energy
                 consumption; Interleaved codes; Multi-Banking;
                 Word-Interleaving.",
}

%%% NOTE(review): restored missing space in the abstract
%%% (``SPEC2000 integer'').
@Article{Tambat:2002:PLB,
  author =       "S. Tambat and S. Vajapeyam",
  title =        "Page-Level Behavior of Cache Contention",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "9--9",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cache misses in small, limited-associativity primary
                 caches very often replace live cache blocks, given the
                 dominance of capacity and conflict misses. Towards
                 motivating novel cache organizations, we study the
                 comparative characteristics of the virtual memory
                 address pairs involved in typical primary-cache
                 contention (block replacements) for the SPEC2000
                 integer benchmarks. We focus on the cache tag bits, and
                 results show that (i) often just a few tag bits differ
                 between contending addresses, and (ii) accesses to
                 certain segments or page groups of the virtual address
                 space (i.e., certain tag-bit groups) contend
                 frequently. Cache conscious virtual address space
                 allocation can further reduce the number of conflicting
                 tag bits. We mention two directions for exploiting such
                 page-level contention patterns to improve cache cost
                 and performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Automation; Benchmark testing; Cache Contention; Cache
                 Tags; Computer science; Data Cache; Libraries; Memory
                 Access Characterization; Microprocessors; Optimizing
                 compilers; Traffic control; Workstations",
}

%%% NOTE(review): numeric range in abstract set with TeX en-dash
%%% (``60--80\%'') and truncated final sentence closed with a period.
@Article{Juang:2002:IDT,
  author =       "Philo Juang and P. Diodato and S. Kaxiras and K.
                 Skadron and Zhigang Hu and M. Martonosi and D. W.
                 Clark",
  title =        "Implementing Decay Techniques using {4T} Quasi-Static
                 Memory Cells",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "10--10",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes the use of four-transistor (4T)
                 cache and branch predictor array cell designs to
                 address increasing worries regarding leakage power
                 dissipation. While 4T designs lose state when
                 infrequently accessed, they have very low leakage,
                 smaller area, and no capacitive loads to switch. This
                 short paper gives an overview of 4T implementation
                 issues and a preliminary evaluation of leakage-energy
                 savings that shows improvements of 60--80\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Circuit simulation; Delay; Leakage current; Libraries;
                 Microarchitecture; Power dissipation; Power generation;
                 Random access memory; Switches; Transistors",
}

%%% Non-speculative out-of-order commit of memory operations under
%%% strict consistency models (v1n1, p. 11).
@Article{Sohn:2002:RRE,
  author =       "YoungChul Sohn and NaiHoon Jung and Seungryoul Maeng",
  title =        "Request Reordering to Enhance the Performance of
                 Strict Consistency Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "11--11",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advances in ILP techniques enable strict consistency
                 models to relax memory order through speculative
                 execution of memory operations. However, ordering
                 constraints still hinder the performance because
                 speculatively executed operations cannot be committed
                 out of program order for the possibility of
                 mis-speculation. In this paper, we propose a new
                 technique which allows memory operations to be
                 non-speculatively committed out of order without
                 violating consistency constraints.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ILP; memory consistency model; multiprocessor",
}

%%% Combined data/thread migration in single-chip multiprocessors
%%% (v1n1, p. 12 -- last article of volume 1).
@Article{Shaw:2002:MSC,
  author =       "K. A. Shaw and W. J. Dally",
  title =        "Migration in Single Chip Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "12--12",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Global communication costs in future single-chip
                 multiprocessors will increase linearly with distance.
                 In this paper, we revisit the issues of locality and
                 load balance in order to take advantage of these new
                 costs. We present a technique which simultaneously
                 migrates data and threads based on vectors specifying
                 locality and resource usage. This technique improves
                 performance on applications with distinguishable
                 locality and imbalanced resource usage. 64\% of the
                 ideal reduction in execution time was achieved on an
                 application with these traits while no improvement was
                 obtained on a balanced application with little
                 locality.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cost function; Delay; Global communication;
                 Laboratories; Logic; Monitoring; Multiprocessing
                 systems; Wire",
}

%% Single-page letter, IEEE Computer Architecture Letters 2(1), January 2003.
%% Fix: inserted missing space in "pattern(migratory" in the abstract.
@Article{Sihn:2003:SCS,
  author =       "K.-H. Sihn and Joonwon Lee and Jung-Wan Cho",
  title =        "A Speculative Coherence Scheme using Decoupling
                 Synchronization for Multiprocessor Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes a new speculative coherence
                 scheme, SCDS, for hardware distributed shared memory
                 systems to reduce the overhead of coherence action in
                 directory-based cache-coherence protocol. SCDS has two
                 main features, predicting accurate timing of
                 speculative coherence with synchronization information
                 and detecting write pattern (migratory and
                 non-migratory) for exclusive blocks' speculative
                 coherence action. In our simulation, SCDS outperforms
                 existing schemes (DSI and LTP) for well-synchronized
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Access protocols; Coherence; Costs; Delay; Hardware;
                 Multiprocessing systems; Personal communication
                 networks; Runtime; Timing; Watches",
}

%% Single-page letter, IEEE Computer Architecture Letters 2(1), January 2003.
@Article{Kumar:2003:PPR,
  author =       "R. Kumar and K. Farkas and N. P. Jouppi and P.
                 Ranganathan and D. M. Tullsen",
  title =        "Processor Power Reduction Via Single-{ISA}
                 Heterogeneous Multi-Core Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes a single-ISA heterogeneous
                 multi-core architecture as a mechanism to reduce
                 processor power dissipation. It assumes a single chip
                 containing a diverse set of cores that target different
                 performance levels and consume different levels of
                 power. During an application's execution, system
                 software dynamically chooses the most appropriate core
                 to meet specific performance and power requirements. It
                 describes an example architecture with five cores of
                 varying performance and complexity. Initial results
                 demonstrate a five-fold reduction in energy at a cost
                 of only 25\% performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; chip multiprocessor; Computer
                 architecture; Computer science; Costs; Energy
                 consumption; Fans; low-power architecture; Packaging;
                 Power dissipation; Power engineering and energy; System
                 software",
}

%% Single-page letter, IEEE Computer Architecture Letters 2(1), January 2003.
@Article{Sendag:2003:ACE,
  author =       "R. Sendag and Peng-fei Chuang and D. J. Lilja",
  title =        "Address Correlation: Exceeding the Limits of
                 Locality",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "3--3",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We investigate a program phenomenon, Address
                 Correlation, which links addresses that reference the
                 same data. This work shows that different addresses
                 containing the same data can often be correlated at
                 run-time to eliminate a load miss or a partial hit. For
                 ten of the SPEC CPU2000 benchmarks, 57 to 99\% of all
                 L1 data cache load misses, and 4 to 85\% of all partial
                 hits, can be supplied from a correlated address already
                 found in the cache. Our source code-level analysis
                 shows that semantically equivalent information,
                 duplicated references, and frequent values are the
                 major causes of address correlations. We also show
                 that, on average, 68\% of the potential correlated
                 addresses that could supply data on a miss of an
                 address containing the same value can be correlated at
                 run time. These correlated addresses correspond to an
                 average of 62\% of all misses in the benchmark programs
                 tested.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Delay; Electronic mail; Hardware;
                 History; Microarchitecture; Object oriented modeling;
                 Out of order; Runtime; Tellurium",
}

%% Single-page letter, IEEE Computer Architecture Letters 2(1), January 2003.
%% Fix: numeric range in the abstract now uses a TeX en dash (94--71968)
%% instead of a single hyphen, matching the range convention used elsewhere
%% in this file (e.g. page ranges).
@Article{Milenkovic:2003:SBT,
  author =       "A. Milenkovic and M. Milenkovic",
  title =        "Stream-Based Trace Compression",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "4--4",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Trace-driven simulation has long been used in both
                 processor and memory studies. The large size of traces
                 motivated different techniques for trace reduction.
                 These techniques often combine standard compression
                 algorithms with trace-specific solutions, taking into
                 account the tradeoff between reduction in the trace
                 size and simulation slowdown due to decompression. This
                 paper introduces SBC, a new algorithm for instruction
                 and data address trace compression based on instruction
                 streams. The proposed technique significantly reduces
                 trace size and simulation time, and it is orthogonal to
                 general compression algorithms. When combined with
                 gzip, SBC reduces the size of SPEC CPU2000 traces
                 94--71968 times.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Compression algorithms; Computational
                 modeling; Computer architecture; Computer simulation;
                 Data mining; Information analysis; instruction and
                 address trace; Instruments; Predictive models;
                 Redundancy; simulation; trace compression",
}

%% Single-page letter, IEEE Computer Architecture Letters 2(1), January 2003.
%% Fixes to garbled abstract text: "all way stags" -> "all ways' tags",
%% "making our cache consumes" -> "consume", "shows55\%" -> "shows 55\%",
%% and the split word "over head" -> "overhead".
%% NOTE(review): author "W. Walid" looks like a transposed name (possibly
%% Walid Najjar) -- left as-is pending a check against the IEEE record.
@Article{Zhang:2003:WHC,
  author =       "Chuanjun Zhang and F. Vahid and Jun Yang and W.
                 Walid",
  title =        "A Way-Halting Cache for Low-Energy High-Performance
                 Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We have designed a low power four-way set associative
                 cache that stores the four lowest-order bits of all
                 ways' tags into a fully associative memory, which we
                 call the halt tag array. The comparison of the halt tag
                 array with the desired tag occurs concurrently with the
                 address decoding that determines which tag and data
                 ways to read from. The halt tag array predetermines
                 most tags that cannot match due to their low-order four
                 bits mismatching. Further accesses to ways with known
                 mismatching tags are then halted, thus saving power.
                 Our halt tag array has the additional feature of using
                 static logic only, rather than dynamic logic used in
                 highly-associative caches, making our cache consume
                 even less power. Our result shows 55\% savings of
                 memory access related energy over a conventional
                 four-way set-associative cache. We show nearly 2x
                 energy savings compared with highly associative caches,
                 while imposing no performance overhead and only 2\%
                 cache area overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cams; Circuits; Computer science; Decoding; Design
                 engineering; Embedded computing; Logic arrays; Power
                 engineering and energy; Power engineering computing;
                 Switches",
}

%% Single-page letter, IEEE Computer Architecture Letters 2(1), January 2003.
%% Fixes: inserted missing spaces after closing parentheses in the abstract
%% ("(DVS)for" and "maximal)frequency").
@Article{Cohen:2003:EOP,
  author =       "A. Cohen and F. Finkelstein and A. Mendelson and R.
                 Ronen and D. Rudoy",
  title =        "On Estimating Optimal Performance of {CPU} Dynamic
                 Thermal Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "6--6",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper we focus on dynamic thermal management
                 (DTM) strategies that use dynamic voltage scaling
                 (DVS) for power control. We perform a theoretical
                 analysis targeted at estimating the optimal strategy,
                 and show two facts: (1) when there is a gap between the
                 initial and the limit temperatures, it is best to start
                 with a high (though not necessarily maximal) frequency
                 and decrease it exponentially until the limit
                 temperature is reached; (2) when being close to the
                 limit temperature, the best strategy is to stay there.
                 We use the patterns exhibited by the optimal strategy
                 in order to analyze some existing DTM techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Costs; DTM; DVS; Energy management; Frequency
                 estimation; Microprocessors; optimal control; Pattern
                 analysis; Performance analysis; Temperature control;
                 Temperature sensors; Thermal management; Voltage
                 control",
}

%% Single-page letter, IEEE Computer Architecture Letters 2(1), January 2003.
%% Fixes: title stored in Title Case (the database should hold the most
%% informative form -- styles can downcase but cannot restore capitals --
%% and every other entry in this file stores Title Case); removed stray
%% period from the "checkpointing" keyword.
@Article{Cristal:2003:CRC,
  author =       "A. Cristal and J. F. Martinez and J. Llosa and M.
                 Valero",
  title =        "A Case for Resource-Conscious Out-of-Order
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "7--7",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Modern out-of-order processors tolerate long-latency
                 memory operations by supporting a large number of
                 in-flight instructions. This is achieved in part
                 through proper sizing of critical resources, such as
                 register files or instruction queues. In light of the
                 increasing gap between processor speed and memory
                 latency, tolerating upcoming latencies in this way
                 would require impractical sizes of such critical
                 resources. To tackle this scalability problem, we make
                 a case for resource-conscious out-of-order processors.
                 We present quantitative evidence that critical
                 resources are increasingly underutilized in these
                 processors. We advocate that better use of such
                 resources should be a priority in future research in
                 processor architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bars; checkpointing; Computer aided instruction;
                 Delay; instruction-level parallelism; Laboratories;
                 memory latency; Optimal control; Out of order;
                 Out-of-order processor; Queueing analysis; Registers;
                 Resource management; resource utilization; Voltage
                 control",
}

%% Single-page letter, IEEE Computer Architecture Letters 3(1), January 2004.
%% Fixes: "over a n-bit" -> "over an n-bit"; removed spurious space in
%% "(m > n )". NOTE(review): "(64 > 48)" may be a mangled arrow
%% ("64 -> 48", i.e. shrinking the bus) -- left as-is pending a check
%% against the published abstract.
@Article{Citron:2004:ELE,
  author =       "D. Citron",
  title =        "Exploiting Low Entropy to Reduce Wire Delay",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Wires shrink less efficiently than transistors.
                 Smaller dimensions increase relative delay and the
                 probability of crosstalk. Solutions to this problem
                 include adding additional latency with pipelining,
                 using ``fat wires'' at higher metal levels, and
                 advances in process and material technology. We propose
                 a stopgap solution to this problem by applying a decade
                 old technique called bus-expanding to the problem. By
                 exploiting low spatial and temporal entropy of data it
                 is possible to transfer m bits of data over an n-bit
                 wide bus in a single cycle (m > n). High entropy data
                 will be routed directly over the bus while low entropy
                 data will be compacted using small lookup tables. A
                 table index will be transferred in the case of a
                 successful lookup, otherwise the full value will be
                 transferred in several cycles. Reducing the number of
                 wires per bus, enables the use of wider wires, which in
                 turn reduces the wire delay. Examination of projected
                 process technologies shows that by shrinking the number
                 of bits in a bus (64 > 48) instead of shrinking the
                 individual wires maintains a constant wire delay. Tests
                 on SPEC CPU2000 have shown that for the 64-bit buses
                 leading from the L1 caches to the processor core it is
                 possible to transfer all data types (addresses,
                 integers, instructions and floating-points) using
                 40-bits per bus on the average.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Area measurement; Compaction; Crosstalk; Delay;
                 Entropy; Materials science and technology; Pipeline
                 processing; Power measurement; Transistors; Wire",
}

%% Single-page letter, IEEE Computer Architecture Letters 3(1), January 2004.
@Article{Singh:2004:GAL,
  author =       "A. Singh and W. J. Dally and B. Towles and A. K.
                 Gupta",
  title =        "Globally Adaptive Load-Balanced Routing on Tori",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We introduce a new method of adaptive routing on k-ary
                 n-cubes, Globally Adaptive Load-Balance (GAL). GAL
                 makes global routing decisions using global
                 information. In contrast, most previous adaptive
                 routing algorithms make local routing decisions using
                 local information (typically channel queue depth). GAL
                 senses global congestion using segmented injection
                 queues to decide the directions to route in each
                 dimension. It further load balances the network by
                 routing in the selected directions adaptively. Using
                 global information, GAL achieves the performance
                 (latency and throughput) of minimal adaptive routing on
                 benign traffic patterns and performs as well as the
                 best obliviously load-balanced routing algorithm (GOAL)
                 on adversarial traffic.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Chaos; Delay; Nearest neighbor searches; Routing;
                 Stability; Switches; Telecommunication traffic;
                 Throughput; Tornadoes; Traffic control",
}

%% Single-page letter, IEEE Computer Architecture Letters 3(1), January 2004.
@Article{Gomez:2004:EFT,
  author =       "M. E. Gomez and J. Duato and J. Flich and P. Lopez and
                 A. Robles and N. A. Nordbotten and O. Lysne and T.
                 Skeie",
  title =        "An Efficient Fault-Tolerant Routing Methodology for
                 Meshes and Tori",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "3--3",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper we present a methodology to design
                 fault-tolerant routing algorithms for regular direct
                 interconnection networks. It supports fully adaptive
                 routing, does not degrade performance in the absence of
                 faults, and supports a reasonably large number of
                 faults without significantly degrading performance. The
                 methodology is mainly based on the selection of an
                 intermediate node (if needed) for each
                 source-destination pair. Packets are adaptively routed
                 to the intermediate node and, at this node, without
                 being ejected, they are adaptively forwarded to their
                 destinations. In order to allow deadlock-free minimal
                 adaptive routing, the methodology requires only one
                 additional virtual channel (for a total of three), even
                 for tori. Evaluation results for a 4 x 4 x 4 torus
                 network show that the methodology is 5-fault tolerant.
                 Indeed, for up to 14 link failures, the percentage of
                 fault combinations supported is higher than 99.96\%.
                 Additionally, network throughput degrades by less than
                 10\% when injecting three random link faults without
                 disabling any node. In contrast, a mechanism similar to
                 the one proposed in the BlueGene/L, that disables some
                 network planes, would strongly degrade network
                 throughput by 79\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; Circuit faults;
                 Degradation; Design methodology; Electronic mail; Fault
                 tolerance; Multiprocessor interconnection networks;
                 Routing; Switches; Throughput",
}

%% Single-page letter, IEEE Computer Architecture Letters 3(1), January 2004.
@Article{Stine:2004:CAR,
  author =       "J. M. Stine and N. P. Carter and J. Flich",
  title =        "Comparing Adaptive Routing and Dynamic Voltage Scaling
                 for Link Power Reduction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "4--4",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We compare techniques that dynamically scale the
                 voltage of individual network links to reduce power
                 consumption with an approach in which all links in the
                 network are set to the same voltage and adaptive
                 routing is used to distribute load across the network.
                 Our results show that adaptive routing with static
                 network link voltages outperforms dimension-order
                 routing with dynamic link voltages in all cases,
                 because the adaptive routing scheme can respond more
                 quickly to changes in network demand. Adaptive routing
                 with static link voltages also outperforms adaptive
                 routing with dynamic link voltages in many cases,
                 although dynamic link voltage scaling gives better
                 behavior as the demand on the network grows.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Clocks; Dynamic voltage scaling; Energy
                 consumption; Frequency; Network-on-a-chip; Routing;
                 Telecommunication traffic; Traffic control; Voltage
                 control",
}

%% Single-page letter, IEEE Computer Architecture Letters 3(1), January 2004.
@Article{Robatmili:2004:TSI,
  author =       "B. Robatmili and N. Yazdani and S. Sardashti and M.
                 Nourani",
  title =        "Thread-Sensitive Instruction Issue for {SMT}
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Simultaneous Multi Threading (SMT) is a processor
                 design method in which concurrent hardware threads
                 share processor resources like functional units and
                 memory. The scheduling complexity and performance of an
                 SMT processor depend on the topology used in the fetch
                 and issue stages. In this paper, we propose a thread
                 sensitive issue policy for a partitioned SMT processor
                 which is based on a thread metric. We propose the
                 number of ready-to-issue instructions of each thread as
                 priority metric. To evaluate our method, we have
                 developed a reconfigurable SMT-simulator on top of the
                 SimpleScalar Toolset. We simulated our modeled
                 processor under several workloads composed of SPEC
                 benchmarks. Experimental results show around 30\%
                 improvement compared to the conventional OLDEST\_FIRST
                 mixed topology issue policy. Additionally, the hardware
                 implementation of our architecture with this metric in
                 issue stage is quite simple.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Delay; Frequency; Intrusion detection;
                 Laboratories; Logic; Processor scheduling;
                 Surface-mount technology; Topology",
}

%% Single-page letter, IEEE Computer Architecture Letters 3(1), January 2004.
@Article{Luo:2004:EES,
  author =       "Yue Luo and L. K. John",
  title =        "Efficiently Evaluating Speedup Using Sampled Processor
                 Simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "6--6",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cycle accurate simulation of processors is extremely
                 time consuming. Sampling can greatly reduce simulation
                 time while retaining good accuracy. Previous research
                 on sampled simulation has been focusing on the accuracy
                 of CPI. However, most simulations are used to evaluate
                 the benefit of some microarchitectural enhancement, in
                 which the speedup is a more important metric than CPI.
                 We employ the ratio estimator from statistical sampling
                 theory to design efficient sampling to measure speedup
                 and to quantify its error. We show that to achieve a
                 given relative error limit for speedup, it is not
                 necessary to estimate CPI to the same accuracy. In our
                 experiment, estimating speedup requires about 9X fewer
                 instructions to be simulated in detail in comparison to
                 estimating CPI for the same relative error limit.
                 Therefore using the ratio estimator to evaluate speedup
                 is much more cost-effective and offers great potential
                 for reducing simulation time. We also discuss the
                 reason for this interesting and important result.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; Clocks; Computational modeling;
                 Computer errors; Computer simulation; Frequency;
                 Microarchitecture; Sampling methods; Size measurement;
                 Velocity measurement",
}

%% Single-page letter, IEEE Computer Architecture Letters 3(1), January 2004.
%% The capitalization "VAlue" in the abstract is deliberate: it spells out
%% the CAVA acronym (Checkpoint-Assisted VAlue prediction).
@Article{Ceze:2004:CHL,
  author =       "L. Ceze and K. Strauss and J. Tuck and J. Renau and J.
                 Torrellas",
  title =        "{CAVA}: Hiding {L2} Misses with Checkpoint-Assisted
                 Value Prediction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "7--7",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Load misses in on-chip L2 caches often end up stalling
                 modern superscalars. To address this problem, we
                 propose hiding L2 misses with Checkpoint-Assisted VAlue
                 prediction (CAVA). When a load misses in L2, a
                 predicted value is returned to the processor. If the
                 missing load reaches the head of the reorder buffer
                 before the requested data is received from memory, the
                 processor checkpoints, consumes the predicted value,
                 and speculatively continues execution. When the
                 requested data finally arrives, it is compared to the
                 predicted value. If the prediction was correct,
                 execution continues normally; otherwise, execution
                 rolls back to the checkpoint. Compared to a baseline
                 aggressive superscalar, CAVA speeds up execution by a
                 geometric mean of 1.14 for SPECint and 1.34 for SPECfp
                 applications. Additionally, CAVA is faster than an
                 implementation of Runahead execution, and Runahead with
                 value prediction.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; Checkpointing; Costs; Delay;
                 Hardware; Microarchitecture; Out of order; Pipelines;
                 Prefetching; Recycling",
}

@Article{Singh:2004:BDB,
  author =       "A. Singh and W. J. Dally",
  title =        "Buffer and Delay Bounds in High Radix Interconnection
                 Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "8--8",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We apply recent results in queueing theory to propose
                 a methodology for bounding the buffer depth and packet
                 delay in high radix interconnection networks. While
                 most work in interconnection networks has been focused
                 on the throughput and average latency in such systems,
                 few studies have been done providing statistical
                 guarantees for buffer depth and packet delays. These
                 parameters are key in the design and performance of a
                 network. We present a methodology for calculating such
                 bounds for a practical high radix network and through
                 extensive simulations show its effectiveness for both
                 bursty and non-bursty injection traffic. Our results
                 suggest that modest speedups and buffer depths enable
                 reliable networks without flow control to be
                 constructed.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Convergence; Delay; Intelligent networks;
                 Multiprocessor interconnection networks; Queueing
                 analysis; Supercomputers; Switches; Telecommunication
                 traffic; Throughput; Traffic control",
}

@Article{Holloway:2004:CPS,
  author =       "A. L. Holloway and G. S. Sohi",
  title =        "Characterization of Problem Stores",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "9--9",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper introduces the concept of problem stores:
                 static stores whose dependent loads often miss in the
                 cache. Accurately identifying problem stores allows the
                 early determination of addresses likely to cause later
                 misses, potentially allowing for the development of
                 novel, proactive prefetching and memory hierarchy
                 management schemes. We present a detailed empirical
                 characterization of problem stores using the SPEC2000
                 CPU benchmarks. The data suggests several key
                 observations about problem stores. First, we find that
                 the number of important problem stores is typically
                 quite small; the worst 100 problem stores write the
                 values that will lead to about 90\% of non-cold misses
                 for a variety of cache configurations. We also find
                 that problem stores only account for 1 in 8 dynamic
                 stores, though they result in 9 of 10 misses.
                  Additionally, the problem stores' dependent loads miss
                 in the L2 cache a larger fraction of the time than
                 loads not dependent on problem stores. We also observe
                 the set of problem stores is stable across a variety of
                 cache configurations. Finally, we found that the
                 instruction distance from problem store to miss and
                 problem store to evict is often greater than one
                 million instructions, but the value is often needed
                 within 100,000 instructions of the eviction.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Delay; Hardware; Memory management; Prefetching;
                 Proposals; Timing",
}

@Article{Sazeides:2005:DIB,
  author =       "Y. Sazeides and R. Kumar and D. M. Tullsen and T.
                 Constantinou",
  title =        "The Danger of Interval-Based Power Efficiency Metrics:
                 When Worst Is Best",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "4",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2005",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2005.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper shows that if the execution of a program is
                 divided into distinct intervals, it is possible for one
                 processor or configuration to provide the best power
                 efficiency over every interval, and yet have worse
                 overall power efficiency over the entire execution than
                 other configurations. This unintuitive behavior is a
                 result of a seemingly intuitive use of power efficiency
                 metrics, and can result in suboptimal design and
                 execution decisions. This behavior may occur when using
                  the energy-delay product and energy-delay-squared product
                  metrics but not with the energy metric.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Battery charge measurement; Clocks; Computer science;
                 Delay; Design optimization; Frequency; Out of order;
                 Power engineering and energy; Power measurement",
}

@Article{Mutlu:2005:RRP,
  author =       "O. Mutlu and Hyesoon Kim and J. Stark and Y. N. Patt",
  title =        "On Reusing the Results of Pre-Executed Instructions in
                 a Runahead Execution Processor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "4",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2005",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2005.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Previous research on runahead execution took it for
                 granted as a prefetch-only technique. Even though the
                 results of instructions independent of an L2 miss are
                 correctly computed during runahead mode, previous
                 approaches discarded those results instead of trying to
                 utilize them in normal mode execution. This paper
                 evaluates the effect of reusing the results of
                 preexecuted instructions on performance. We find that,
                 even with an ideal scheme, it is not worthwhile to
                 reuse the results of preexecuted instructions. Our
                 analysis provides insights into why result reuse does
                 not provide significant performance improvement in
                 runahead processors and concludes that runahead
                 execution should be employed as a prefetching mechanism
                 rather than a full-blown prefetching/result-reuse
                 mechanism.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computational modeling; Computer aided
                 instruction; Delay; Energy consumption;
                 Microprocessors; Performance analysis; Prefetching;
                 Registers",
}

@Article{Zhang:2006:BIC,
  author =       "Chuanjun Zhang",
  title =        "Balanced instruction cache: reducing conflict misses
                 of direct-mapped caches through balanced subarray
                 accesses",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "2--5",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "It is observed that the limited memory space of
                 direct-mapped caches is not used in balance therefore
                 incurs extra conflict misses. We propose a novel cache
                 organization of a balanced cache, which balances
                 accesses to cache sets at the granularity of cache
                 subarrays. The key technique of the balanced cache is a
                 programmable subarray decoder through which the mapping
                 of memory reference addresses to cache subarrays can be
                 optimized hence conflict misses of direct-mapped caches
                 can be resolved. The experimental results show that the
                 miss rate of balanced cache is lower than that of the
                 same sized two-way set-associative caches on average
                 and can be as low as that of the same sized four-way
                 set-associative caches for particular applications.
                 Compared with previous techniques, the balanced cache
                 requires only one cycle to access all cache hits and
                 has the same access time as direct-mapped caches",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "balanced instruction cache; balanced subarray
                 accesses; Bridges; Cache memory; cache organization;
                 cache storage; Clocks; conflict miss reduction;
                 Decoding; Delay; Frequency; High performance computing;
                 programmable subarray decoder; storage allocation",
}

@Article{Ottoni:2006:SPC,
  author =       "G. Ottoni and R. Rangan and A. Stoler and M. J.
                 Bridges and D. I. August",
  title =        "From sequential programs to concurrent threads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "6--9",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Chip multiprocessors are of increasing importance due
                 to difficulties in achieving higher clock frequencies
                 in uniprocessors, but their success depends on finding
                 useful work for the processor cores. This paper
                 addresses this challenge by presenting a simple
                 compiler approach that extracts non-speculative
                 thread-level parallelism from sequential codes. We
                 present initial results from this technique targeting a
                 validated dual-core processor model, achieving speedups
                 ranging from 9-48\% with an average of 25\% for
                 important benchmark loops over their single-threaded
                 versions. We also identify important next steps found
                 during our pursuit of higher degrees of automatic
                 threading",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "automatic threading; Bridges; Clocks; Computer
                 science; concurrency control; concurrent threads;
                 Frequency; Hardware; Microprocessors; multi-threading;
                 nonspeculative thread-level parallelism; Parallel
                 processing; Pipeline processing; program compiler;
                 program compilers; Program processors; sequential
                 programs",
}

@Article{Gupta:2006:TOI,
  author =       "A. K. Gupta and W. J. Dally",
  title =        "Topology optimization of interconnection networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "10--13",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes an automatic optimization tool
                 that searches a family of network topologies to select
                 the topology that best achieves a specified set of
                 design goals while satisfying specified packaging
                 constraints. Our tool uses a model of signaling
                 technology that relates bandwidth, cost and distance of
                 links. This model captures the distance-dependent
                 bandwidth of modern high-speed electrical links and the
                 cost differential between electrical and optical links.
                 Using our optimization tool, we explore the design
                 space of hybrid Clos-torus (C-T) networks. For a
                 representative set of packaging constraints we
                 determine the optimal hybrid C-T topology to minimize
                 cost and the optimal C-T topology to minimize latency
                 for various packet lengths. We then use the tool to
                 measure the sensitivity of the optimal topology to
                 several important packaging constraints such as pin
                 count and critical distance",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Constraint optimization; Costs; Design
                 optimization; hybrid Clos-torus networks;
                 interconnection networks; Multiprocessor
                 interconnection networks; multistage interconnection
                 networks; Network topology; Optical fiber
                 communication; Packaging; signaling technology;
                 signalling; Space exploration; Space technology;
                 telecommunication network topology; topology
                 optimization tool",
}

@Article{Gaudiot:2006:F,
  author =       "J.-L. Gaudiot and Y. Patt and K. Skadron",
  title =        "Foreword",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "11--11",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Foreword for issue 1 of 2006",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Computer Society; Concrete;
                 Delay; Footwear; Software libraries; Vehicles",
}

@Article{Morad:2006:PPE,
  author =       "T. Y. Morad and U. C. Weiser and A. Kolodny and M.
                 Valero and E. Ayguade",
  title =        "Performance, power efficiency and scalability of
                 asymmetric cluster chip multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "14--17",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper evaluates asymmetric cluster chip
                 multiprocessor (ACCMP) architectures as a mechanism to
                 achieve the highest performance for a given power
                 budget. ACCMPs execute serial phases of multithreaded
                 programs on large high-performance cores whereas
                 parallel phases are executed on a mix of large and many
                 small simple cores. Theoretical analysis reveals a
                 performance upper bound for symmetric multiprocessors,
                 which is surpassed by asymmetric configurations at
                 certain power ranges. Our emulations show that
                 asymmetric multiprocessors can reduce power consumption
                 by more than two thirds with similar performance
                 compared to symmetric multiprocessors",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ACCMP; Application software; asymmetric cluster chip
                 multiprocessors; Chip Multiprocessors; Emulation;
                 Frequency; microprocessor chips; multi-threading;
                 multiprocessing systems; multithreaded program;
                 Optimized production technology; Parallel processing;
                 parallel processing; power consumption reduction; power
                 efficiency; Power Efficiency; Power system modeling;
                 Queueing analysis; Scalability; Upper bound; Voltage",
}

@Article{Riley:2006:PCU,
  author =       "N. Riley and C. Zilles",
  title =        "Probabilistic counter updates for predictor hysteresis
                 and bias",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "18--21",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware predictor designers have incorporated
                 hysteresis and/or bias to achieve desired behavior by
                 increasing the number of bits per counter. Some
                 resulting proposed predictor designs are currently
                 impractical because their counter tables are too large.
                 We describe a method for dramatically reducing the
                 amount of storage required for a predictor's counter
                 table with minimal impact on prediction accuracy.
                 Probabilistic updates to counter state are implemented
                 using a hardware pseudo-random number generator to
                 increment or decrement counters a fraction of the time,
                 meaning fewer counter bits are required. We demonstrate
                 the effectiveness of probabilistic updates in the
                 context of Fields et al.'s critical path predictor,
                 which employs a biased 6-bit counter. Averaged across
                 the SPEC CINT2000 benchmarks, our 2-bit and 3-bit
                 probabilistic counters closely approximate a 6-bit
                 deterministic one (achieving speedups of 7.75\% and
                 7.91\% compared to 7.94\%) when used for
                 criticality-based scheduling in a clustered machine.
                 Performance degrades gracefully, enabling even a 1-bit
                 probabilistic counter to outperform the best 3-bit
                 deterministic counter we found",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; clustered machine; computer architecture;
                 Computer science; Costs; Counting circuits; critical
                 path predictor; criticality-based scheduling;
                 Degradation; Hardware; hardware predictor design;
                 hardware pseudorandom number generator; Hysteresis;
                 Microarchitecture; Pipelines; predictor bias; predictor
                 hysteresis; predictors counter table; probabilistic
                 counter update; probability; Processor scheduling;
                 processor scheduling; random number generation",
}

@Article{Zhou:2006:CFT,
  author =       "Huiyang Zhou",
  title =        "A case for fault tolerance and performance enhancement
                 using chip multi-processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "22--25",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper makes a case for using multi-core
                 processors to simultaneously achieve transient-fault
                 tolerance and performance enhancement. Our approach is
                 extended from a recent latency-tolerance proposal,
                 dual-core execution (DCE). In DCE, a program is
                 executed twice in two processors, named the front and
                 back processors. The front processor pre-processes
                 instructions in a very fast yet highly accurate way and
                 the back processor re-executes the instruction stream
                 retired from the front processor. The front processor
                 runs faster as it has no correctness constraints
                 whereas its results, including timely prefetching and
                 prompt branch misprediction resolution, help the back
                 processor make faster progress. In this paper, we
                 propose to entrust the speculative results of the front
                 processor and use them to check the un-speculative
                 results of the back processor. A discrepancy, either
                 due to a transient fault or a mispeculation, is then
                 handled with the existing mispeculation recovery
                 mechanism. In this way, both transient-fault tolerance
                 and performance improvement can be delivered
                 simultaneously with little hardware overhead",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "back processor; chip multiprocessors; Computer aided
                 software engineering; dual-core execution; Error
                 analysis; Fault tolerance; fault tolerant computing;
                 front processor; Hardware; latency-tolerance proposal;
                 microprocessor chips; mispeculation recovery mechanism;
                 Multicore processing; multiprocessing systems;
                 prefetching; Prefetching; prompt branch misprediction
                 resolution; Proposals; Redundancy; storage management;
                 Throughput; transient-fault tolerance; Transistors",
}

@Article{Lee:2006:ASC,
  author =       "Moon-Sang Lee and Sang-Kwon Lee and Joonwon Lee and
                 Seung-Ryoul Maeng",
  title =        "Adopting system call based address translation into
                 user-level communication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "26--29",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "User-level communication alleviates the software
                 overhead of the communication subsystem by allowing
                 applications to access the network interface directly.
                 For that purpose, efficient address translation of
                 virtual address to physical address is critical. In
                 this study, we propose a system call based address
                 translation scheme where every translation is done by
                 the kernel instead of a translation cache on a network
                 interface controller as in the previous cache based
                 address translation. According to our experiments, our
                 scheme achieves up to 4.5\% reduction in application
                 execution time compared to the previous cache based
                 approach",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; cache based approach; cache
                 storage; Communication system software; Control
                 systems; Costs; Delay; Electronic mail; Hardware;
                 Kernel; network interface controller; network
                 interfaces; Network interfaces; operating system
                 kernels; Protocols; software overhead; system call
                 based address translation; user-level communication",
}

@Article{Ahn:2006:DPA,
  author =       "Jung Ho Ahn and W. J. Dally",
  title =        "Data parallel address architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "30--33",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Data parallel memory systems must maintain a large
                 number of outstanding memory references to fully use
                 increasing DRAM bandwidth in the presence of increasing
                 latency. At the same time, the throughput of modern
                  DRAMs is very sensitive to access patterns due to the
                 time required to precharge and activate banks and to
                 switch between read and write access. To achieve memory
                 reference parallelism a system may simultaneously issue
                 references from multiple reference threads.
                 Alternatively multiple references from a single thread
                 can be issued in parallel. In this paper, we examine
                 this tradeoff and show that allowing only a single
                 thread to access DRAM at any given time significantly
                 improves performance by increasing the locality of the
                 reference stream and hence reducing precharge/activate
                 operations and read/write turnaround. Simulations of
                 scientific and multimedia applications show that
                 generating multiple references from a single thread
                 gives, on average, 17\% better performance than
                 generating references from two parallel threads",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer architecture; data parallel
                 address architecture; data parallel memory systems;
                 Delay; DRAM bandwidth; DRAM chips; Memory management;
                 parallel architectures; parallel memories; Parallel
                 processing; Random access memory; read access;
                 Scheduling; Streaming media; Switches; write access",
}

@Article{Eisley:2006:NCC,
  author =       "N. Eisley and Li-Shiuan Peh and Li Shang",
  title =        "In-network cache coherence",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "34--37",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We propose implementing cache coherence protocols
                 within the network, demonstrating how an in-network
                 implementation of the MSI directory-based protocol
                 allows for in-transit optimizations of read and write
                 delay. Our results show 15\% and 24\% savings on
                 average in memory access latency for SPLASH-2 parallel
                  benchmarks running on a $4 \times 4$ and a $16 \times 16$
                 multiprocessor respectively",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Access protocols; benchmark testing; cache coherence;
                 cache storage; Coherence; Delay; delays; Fabrics;
                 interconnection network; memory access latency; Memory
                 architecture; memory architecture; memory protocols;
                 Moore's Law; MSI directory-based protocol;
                 Multiprocessor interconnection networks; network cache
                 coherence protocols; parallel processing; read delay;
                 SPLASH-2 parallel benchmarks; write delay",
}

@Article{Srinivasan:2006:PMU,
  author =       "R. Srinivasan and J. Cook and O. Lubeck",
  title =        "Performance modeling using {Monte Carlo} simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "38--41",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/intel-ia-64.bib",
  abstract =     "Cycle accurate simulation has long been the primary
                 tool for micro-architecture design and evaluation.
                 Though accurate, the slow speed often imposes
                 constraints on the extent of design exploration. In
                 this work, we propose a fast, accurate Monte-Carlo
                 based model for predicting processor performance. We
                 apply this technique to predict the CPI of in-order
                 architectures and validate it against the Itanium-2.
                 The Monte Carlo model uses micro-architecture
                 independent application characteristics, and cache,
                 branch predictor statistics to predict CPI with an
                 average error of less than 7\%. Since prediction is
                 achieved in a few seconds, the model can be used for
                 fast design space exploration that can efficiently cull
                 the space for cycle-accurate simulations. Besides
                 accurately predicting CPI, the model also breaks down
                 CPI into various components, where each component
                 quantifies the effect of a particular stall condition
                 (branch misprediction, cache miss, etc.) on overall
                 CPI. Such a CPI decomposition can help processor
                 designers quickly identify and resolve critical
                 performance bottlenecks",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "branch predictor statistics; Computational modeling;
                 Computer architecture; CPI decomposition; design space
                 exploration; Error analysis; Itanium-2; Laboratories;
                 Mathematical analysis; memory architecture;
                 microarchitecture design; microarchitecture evaluation;
                 Monte Carlo methods; Monte Carlo simulation;
                 performance evaluation; Predictive models; Process
                 design; processor performance modeling; program
                 processors; Sampling methods; Space exploration",
}

@Article{Ergin:2006:ENV,
  author =       "O. Ergin and O. Unsal and X. Vera and A. Gonzalez",
  title =        "Exploiting Narrow Values for Soft Error Tolerance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "12--12",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Soft errors are an important challenge in contemporary
                 microprocessors. Particle hits on the components of a
                 processor are expected to create an increasing number
                 of transient errors with each new microprocessor
                 generation. In this paper we propose simple mechanisms
                 that effectively reduce the vulnerability to soft
                  errors in a processor. Our designs are generally
                 motivated by the fact that many of the produced and
                 consumed values in the processors are narrow and their
                  upper order bits are meaningless. Soft errors caused by
                 any particle strike to these higher order bits can be
                 avoided by simply identifying these narrow values.
                 Alternatively soft errors can be detected or corrected
                 on the narrow values by replicating the vulnerable
                 portion of the value inside the storage space provided
                 for the upper order bits of these operands. We offer a
                 variety of schemes that make use of narrow values and
                 analyze their efficiency in reducing soft error
                 vulnerability of level-1 data cache of the processor",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; Cache storage; contemporary
                 microprocessors; data cache; Data Cache; Error
                 correction; error correction; Error Correction; error
                 correction; error detection; Hardware; Impurities;
                 Manufacturing; microprocessor chips; Microprocessors;
                 Multithreading; Narrow Values; narrow values; Neutrons;
                 particle strike; Process design; radiation effects;
                 Random access memory; soft error tolerance; Soft
                 Errors; system recovery; transient errors; transients",
}

@Article{Li:2006:PBH,
  author =       "W. Li and S. Mohanty and K. Kavi",
  title =        "A Page-based Hybrid (Software--Hardware) Dynamic
                 Memory Allocator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "13--13",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib",
  abstract =     "Modern programming languages often include complex
                 mechanisms for dynamic memory allocation and garbage
                 collection. These features drive the need for more
                 efficient implementation of memory management
                 functions, both in terms of memory usage and execution
                 performance. In this paper, we introduce a software and
                 hardware co-design to improve the speed of the software
                  allocator used in FreeBSD systems. The hardware
                 complexity of our design is independent of the dynamic
                 memory size, thus making the allocator suitable for any
                 memory size. Our design improves the performance of
                 memory management intensive benchmarks by as much as
                  43\%. To our knowledge, this is the first-ever work of
                 this kind, introducing ``hybrid memory allocator''",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; Computer languages; Computer
                 science; Costs; Delay; Dynamic programming; garbage
                 collection; Hardware; hardware complexity;
                 hardware-software codesign; hybrid dynamic memory
                 allocator; Java; memory allocator; memory architecture;
                 memory management; Memory management; modern
                 programming languages; software allocator; Software
                 performance; software-hardware co-design;
                 software/hardware co-design; storage allocation;
                 storage management",
}

@Article{Donald:2006:EPP,
  author =       "J. Donald and M. Martonosi",
  title =        "An Efficient, Practical Parallelization Methodology
                 for Multicore Architecture Simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "14--14",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Multiple core designs have become commonplace in the
                 processor market, and are hence a major focus in modern
                 computer architecture research. Thus, for both product
                 development and research, multiple core processor
                 simulation environments are necessary. A well-known
                 positive feedback property of computer design is that
                 we use today's computers to design tomorrow's. Thus,
                 with the emergence of chip multiprocessors, it is
                 natural to re-examine simulation environments written
                 to exploit parallelism. In this paper we present a
                 programming methodology for directly converting
                 existing uniprocessor simulators into parallelized
                 multiple-core simulators. Our method not only takes
                 significantly less development effort compared to some
                 prior used programming techniques, but also possesses
                 advantages by retaining a modular and comprehensible
                 programming structure. We demonstrate our case with
                 actual developed products after applying this method to
                  two different simulators, one developed from IBM
                  Turandot and the other from the SimpleScalar tool set.
                 Our SimpleScalar-based framework achieves a parallel
                  speedup of $2.2 \times$ on a dual-CPU dual-core (4-way)
                 Opteron server",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "chip multiprocessors; comprehensible programming
                 structure; Computational modeling; Computer
                  architecture; Computer simulation; Feedback; IBM
                  Turandot; logic simulation; microcomputers; modern
                 computer architecture; modular programming structure;
                 multicore; multicore architecture simulation; Multicore
                 processing; multiple core processor simulation;
                 multiprocessing systems; Object oriented modeling;
                 parallel architectures; Parallel processing; Parallel
                 programming; parallelism; parallelization method;
                 parallelized multiple-core simulators; positive
                 feedback property; Process planning; Product
                 development; programming methodology; SimpleScalar tool
                 set; simulation",
}

@Article{Bracy:2006:DAC,
  author =       "A. Bracy and K. Doshi and Q. Jacobson",
  title =        "Disintermediated Active Communication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "15--15",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Disintermediated active communication (DAC) is a new
                 paradigm of communication in which a sending thread
                 actively engages a receiving thread when sending it a
                 message via shared memory. DAC is different than
                 existing approaches that use passive communication
                 through shared-memory --- based on intermittently
                 checking for messages --- or that use preemptive
                 communication but must rely on intermediaries such as
                 the operating system or dedicated interrupt channels.
                 An implementation of DAC builds on existing cache
                 coherency support and exploits light-weight user-level
                 interrupts. Inter-thread communication occurs via
                 monitored memory locations where the receiver thread
                 responds to invalidations of monitored addresses with a
                 light-weight user-level software-defined handler.
                 Address monitoring is supported by cache line
                 user-bits, or CLUbits. CLUbits reside in the cache next
                 to the coherence state, are private per thread, and
                 maintain user-defined per-cache-line state. A light
                 weight software library can demultiplex asynchronous
                 notifications and handle exceptional cases. In
                 DAC-based programs threads coordinate with one another
                 by explicit signaling and implicit resource monitoring.
                 With the simple and direct communication primitives of
                 DAC, multi-threaded workloads synchronize at a finer
                 granularity and more efficiently utilize the hardware
                 of upcoming multi-core designs. This paper introduces
                 DAC, presents several signaling models for DAC-based
                 programs, and describes a simple memory-based framework
                 that supports DAC by leveraging existing
                 cache-coherency models. Our framework is general enough
                 to support uses beyond DAC",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address monitoring; cache coherency; cache line
                 user-bits; cache storage; CLUbits; Computer aided
                 instruction; Concurrent computing; disintermediated
                 active communication; Hardware; High performance
                 computing; interrupts; interthread communication;
                 memory locations; Monitoring; multi-threading;
                 multicore designs; Operating systems; Processor
                 scheduling; Programming profession; resource
                 monitoring; shared memory; shared memory systems;
                 signaling models; software libraries; Software
                 libraries; software library; storage allocation;
                 user-level interrupts",
}

@Article{Mallik:2006:UDF,
  author =       "A. Mallik and B. Lin and G. Memik and P. Dinda and R.
                 P. Dick",
  title =        "User-Driven Frequency Scaling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "16--16",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We propose and evaluate user-driven frequency scaling
                 (UDFS) for improved power management on processors that
                 support dynamic voltage and frequency scaling (DVFS),
                  e.g., those used in current laptop and desktop
                 computers. UDFS dynamically adapts CPU frequency to the
                 individual user and the workload through a simple user
                 feedback mechanism, unlike currently-used DVFS methods
                 which rely only on CPU utilization. Our UDFS algorithms
                 dramatically reduce typical operating frequencies while
                 maintaining performance at satisfactory levels for each
                 user. We evaluated our techniques through user studies
                 conducted on a Pentium M laptop running Windows
                 applications. The UDFS scheme reduces measured system
                 power by 22.1\%, averaged across all our users and
                 applications, compared to the Windows XP DVFS scheme",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Central Processing Unit; computer power supplies; CPU
                 frequency; DVFS; dynamic frequency scaling; Dynamic
                 voltage scaling; dynamic voltage scaling; Energy
                 consumption; Energy management; Engineering management;
                 Feedback; Frequency control; improved power management;
                 microprocessor chips; Pentium M laptop; Portable
                 computers; power aware computing; Power engineering
                 computing; Power Management; Power measurement; user
                 feedback mechanism; User-aware computing; user-driven
                 frequency scaling; Windows XP DVFS scheme",
}

@Article{Blundell:2006:STM,
  author =       "C. Blundell and E. C. Lewis and M. M. K. Martin",
  title =        "Subtleties of transactional memory atomicity
                 semantics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "17--17",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Transactional memory has great potential for
                 simplifying multithreaded programming by allowing
                 programmers to specify regions of the program that must
                 appear to execute atomically. Transactional memory
                 implementations then optimistically execute these
                 transactions concurrently to obtain high performance.
                 This work shows that the same atomic guarantees that
                 give transactions their power also have unexpected and
                 potentially serious negative effects on programs that
                 were written assuming narrower scopes of atomicity. We
                 make four contributions: (1) we show that a direct
                 translation of lock-based critical sections into
                 transactions can introduce deadlock into otherwise
                 correct programs, (2) we introduce the terms strong
                 atomicity and weak atomicity to describe the
                 interaction of transactional and non-transactional
                 code, (3) we show that code that is correct under weak
                 atomicity can deadlock under strong atomicity, and (4)
                 we demonstrate that sequentially composing
                 transactional code can also introduce deadlocks. These
                 observations invalidate the intuition that transactions
                 are strictly safer than lock-based critical sections,
                 that strong atomicity is strictly safer than weak
                 atomicity, and that transactions are always
                 composable",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer languages; Computer Systems Organization;
                 Concurrent distributed and parallel languages;
                 deadlock; direct translation; Hardware; Information
                 science; Interference; Interleaved codes; Language
                 Classifications; Law; lock-based critical sections;
                 Multi-core/single-chip multiprocessors;
                 multi-threading; Multiple Data Stream Architectures
                 (Multiprocessors); multithreaded programming;
                 nontransactional code; operating systems (computers);
                 Parallel Architectures; Processor Architectures;
                 program verification; Programming Languages;
                 Programming profession; sequentially composing
                 transactional code; Software performance;
                 Software/Software Engineering; strong atomicity; System
                 recovery; Transaction databases; transaction
                 processing; transactional memory atomicity semantics;
                 weak atomicity",
}

@Article{Price:2006:CCT,
  author =       "G. Price and M. Vachharajani",
  title =        "A Case for Compressing Traces with {BDDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "18--18",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Instruction-level traces are widely used for program
                 and hardware analysis. However, program traces for just
                 a few seconds of execution are enormous, up to several
                 terabytes in size, uncompressed. Specialized
                 compression can shrink traces to a few gigabytes, but
                 trace analyzers typically stream the decompressed trace
                 through the analysis engine. Thus, the complexity of
                 analysis depends on the decompressed trace size (even
                 though the decompressed trace is never stored to disk).
                 This makes many global or interactive analyses
                 infeasible. This paper presents a method to compress
                 program traces using binary decision diagrams (BDDs).
                 BDDs intrinsically support operations common to many
                 desirable program analyses and these analyses operate
                 directly on the BDD. Thus, they are often polynomial in
                 the size of the compressed representation. The paper
                 presents mechanisms to represent a variety of trace
                 data using BDDs and shows that BDDs can store, in 1 GB
                 of RAM, the entire data-dependence graph of traces with
                 over 1 billion instructions. This allows rapid
                 computation of global analyses such as heap-object
                 liveness and dynamic slicing",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "binary decision diagrams; Binary decision diagrams;
                 Boolean functions; Data analysis; Data structures;
                 data-dependence graph; dynamic slicing; Engines; global
                 analyses; Hardware; hardware analysis; heap-object
                 liveness; instruction-level traces; Performance
                 analysis; Polynomials; program analysis; program
                 slicing; program traces; rapid computation; Read-write
                 memory; Software Engineering; Software Processor
                 validation Engineering; Software/Program Verification;
                 Software/Software; Software/Software Engineering;
                 specialized compression; Testing and Debugging; trace
                 analyzers; traces compression; Tracing; Validation;
                 Visualization",
}

@Article{MoretoPlanas:2007:EDC,
  author =       "M. {Moreto Planas} and F. Cazorla and A. Ramirez and
                 M. Valero",
  title =        "Explaining Dynamic Cache Partitioning Speed Ups",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "1--4",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cache partitioning has been proposed as an interesting
                 alternative to traditional eviction policies of shared
                 cache levels in modern CMP architectures: throughput is
                 improved at the expense of a reasonable cost. However,
                 these new policies present different behaviors
                 depending on the applications that are running in the
                 architecture. In this paper, we introduce some metrics
                 that characterize applications and allow us to give a
                 clear and simple model to explain final throughput
                 speed ups.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.b Cache memories; B.3.3 Performance
                 Analysis and Design Aids; C Computer Systems
                 Organization; C.1 Processor Architectures; C.1.4
                 Parallel Architectures; C.1.4.e Multi-core/single-chip
                 multiprocessors; C.1.5 Micro-architecture
                 implementation considerations; C.1.5.e Memory
                 hierarchy; C.4 Performance of Systems; C.4.d Modeling
                 techniques; cache storage; chip multiprocessing;
                 Computer architecture; Counting circuits; dynamic cache
                 partitioning; microprocessor chips; Parallel
                 processing; Process design; Resource management; shared
                 cache levels; Streaming media; Surface-mount
                 technology; Throughput; Uninterruptible power systems",
}

@Article{Jerger:2007:CSC,
  author =       "N. Enright Jerger and M. Lipasti and L. Peh",
  title =        "Circuit-Switched Coherence",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "5--8",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Circuit-switched networks can significantly lower the
                 communication latency between processor cores, when
                 compared to packet-switched networks, since once
                 circuits are set up, communication latency approaches
                 pure interconnect delay. However, if circuits are not
                 frequently reused, the long set up time and poorer
                 interconnect utilization can hurt overall performance.
                 To combat this problem, we propose a hybrid router
                 design which intermingles packet-switched flits with
                 circuit-switched flits. Additionally, we co-design a
                 prediction-based coherence protocol that leverages the
                 existence of circuits to optimize pair-wise sharing
                 between cores. The protocol allows pair-wise sharers to
                 communicate directly with each other via circuits and
                 drives up circuit reuse. Circuit-switched coherence
                 provides overall system performance improvements of up
                 to 17\% with an average improvement of 10\% and reduces
                 network latency by up to 30\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; C Computer Systems Organization; C.1
                 Processor Architectures; C.1.4 Parallel Architectures;
                 C.1.4.e Multi-core/single-chip multiprocessors; C.1.4.g
                 On-chip interconnection networks; C.1.5
                 Micro-architecture implementation considerations;
                 C.1.5.e Memory hierarchy; circuit switching;
                 circuit-switched network; Coupling circuits; Delay;
                 Fabrics; hybrid router design; Integrated circuit
                 interconnections; multiprocessor interconnection
                 networks; network latency; Network-on-a-chip; packet
                 switching; Packet switching; packet switching;
                 pair-wise sharing; Pipelines; prediction-based
                 coherence protocol; processor core; Protocols; routing
                 protocols; System performance",
}

@Article{Kodakara:2007:CRM,
  author =       "S. Kodakara and J. Kim and D. Lilja and D. Hawkins and
                 W. Hsu and P. Yew",
  title =        "{CIM}: a Reliable Metric for Evaluating Program Phase
                 Classifications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "9--12",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We propose the use of the confidence interval of
                 estimated mean (CIM), a metric based on statistical
                 sampling theory, to evaluate the quality of a given
                 phase classification and for comparing different phase
                 classification schemes. Previous research on phase
                 classification used the weighted average of coefficient
                 of variation (CoVwa) to estimate phase classification
                 quality. We found that the phase quality indicated by
                 CoVwa could be inconsistent across different phase
                 classifications. We explain the reasons behind this
                 inconsistency and demonstrate the inconsistency using
                 data from several SPEC CPU2000 benchmark programs. We
                 show that the confidence interval of estimated mean
                 (CIM) correctly estimates the quality of phase
                 classification with a meaningful statistical
                 interpretation.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; Benchmark Analysis; Clustering
                 algorithms; Computer architecture; computer
                 architecture; Computer integrated manufacturing;
                 confidence interval; estimated mean; estimation theory;
                 pattern classification; Phase Classification; Phase
                 detection; Phase estimation; Phase measurement; phase
                 quality estimation; program compilers; program
                 diagnostics; program phase classification; Quality
                 Metric; reliable metric; Sampling methods; sampling
                 methods; SPEC CPU2000 benchmark program; statistical
                 interpretation; Statistical Sampling; statistical
                 sampling theory; Statistics; Surges",
}

@Article{Dieter:2007:LCM,
  author =       "W. R. Dieter and A. Kaveti and H. G. Dietz",
  title =        "Low-Cost Microarchitectural Support for Improved
                 Floating-Point Accuracy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "13--16",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Some processors designed for consumer applications,
                 such as graphics processing units (GPUs) and the CELL
                 processor, promise outstanding floating-point
                 performance for scientific applications at commodity
                 prices. However, IEEE single precision is the most
                 precise floating-point data type these processors
                 directly support in hardware. Pairs of native
                 floating-point numbers can be used to represent a base
                 result and a residual term to increase accuracy, but
                 the resulting order of magnitude slowdown dramatically
                 reduces the price/performance advantage of these
                 systems. By adding a few simple microarchitectural
                 features, acceptable accuracy can be obtained with
                 relatively little performance penalty. To reduce the
                 cost of native-pair arithmetic, a residual register is
                 used to hold information that would normally have been
                 discarded after each floating-point computation. The
                 residual register dramatically simplifies the code,
                 providing both lower latency and better
                 instruction-level parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; B Hardware; B.2 Arithmetic and
                 Logic Structures; B.2.4 High-Speed Arithmetic; B.2.4.b
                 Cost/performance; C Computer Systems Organization; C.0
                 General; C.0.b Hardware/software interfaces; C.1
                 Processor Architectures; C.1.5 Micro-architecture
                 implementation considerations; CELL processor; computer
                 architecture; Costs; floating point arithmetic;
                 floating-point accuracy; Floating-point arithmetic; G
                 Mathematics of Computing; G.1 Numerical Analysis; G.1.0
                 General; G.1.0.e Multiple precision arithmetic;
                 Graphics; graphics processing units; Hardware; I
                 Computing Methodologies; I.3 Computer Graphics; I.3.1
                 Hardware Architecture; I.3.1.a Graphics processors;
                 IEEE single precision; instruction-level parallelism;
                 microarchitectural support; Microarchitecture; parallel
                 processing; Pipelines; Registers; Software algorithms;
                 Software performance",
}

@Article{Etsion:2007:PPT,
  author =       "Y. Etsion and D. G. Feitelson",
  title =        "Probabilistic Prediction of Temporal Locality",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "17--20",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The increasing gap between processor and memory
                 speeds, as well as the introduction of multi-core CPUs,
                 have exacerbated the dependency of CPU performance on
                 the memory subsystem. This trend motivates the search
                 for more efficient caching mechanisms, enabling both
                 faster service of frequently used blocks and decreased
                 power consumption. In this paper we describe a novel,
                 random sampling based predictor that can distinguish
                 transient cache insertions from non-transient ones. We
                 show that this predictor can identify a small set of
                 data cache resident blocks that service most of the
                 memory references, thus serving as a building block for
                 new cache designs and block replacement policies.
                 Although we only discuss the L1 data cache, we have
                 found this predictor to be efficient also when handling
                 L1 instruction caches and shared L2 caches.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.b Cache memories; B.3.3 Performance
                 Analysis and Design Aids; cache storage; Computer
                 science; Data analysis; data cache; Distributed
                 computing; Energy consumption; Extraterrestrial
                 phenomena; memory subsystem; multi-core CPU; power
                 aware computing; probabilistic prediction; random
                 sampling; Sampling methods; temporal locality;
                 transient cache insertions; Visualization",
}

@Article{Guz:2007:NCO,
  author =       "Z. Guz and I. Keidar and A. Kolodny and U. Weiser",
  title =        "{Nahalal}: Cache Organization for Chip
                 Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "21--24",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper addresses cache organization in chip
                 multiprocessors (CMPs). We show that in CMP systems it
                 is valuable to distinguish between shared data, which
                 is accessed by multiple cores, and private data
                 accessed by a single core. We introduce Nahalal, an
                 architecture whose novel floorplan topology partitions
                 cached data according to its usage (shared versus
                 private data), and thus enables fast access to shared
                 data for all processors while preserving the vicinity
                 of private data to each processor. Nahalal exhibits
                 significant improvements in cache access latency
                 compared to a traditional cache design.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Cache memories; cache organization; cache
                 storage; chip multiprocessors; circuit layout; CMP
                 systems; Computer integrated manufacturing; Computer
                 Systems Organization; Design Styles; floorplan topology
                 partitions; Hardware; Memory Structures; microprocessor
                 chips; Multi-core/single-chip multiprocessors; Nahalal;
                 Parallel Architectures; Processor Architectures;
                 Writing",
}

@Article{Joao:2007:DPI,
  author =       "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
  title =        "Dynamic Predication of Indirect Jumps",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "25--28",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Indirect jumps are used to implement
                 increasingly-common programming language constructs
                 such as virtual function calls, switch-case statements,
                 jump tables, and interface calls. Unfortunately, the
                 prediction accuracy of indirect jumps has remained low
                 because many indirect jumps have multiple targets that
                 are difficult to predict even with specialized
                 hardware. This paper proposes a new way of handling
                 hard-to-predict indirect jumps: dynamically predicating
                 them. The compiler identifies indirect jumps that are
                 suitable for predication along with their control-flow
                 merge (CFM) points. The microarchitecture predicates
                 the instructions between different targets of the jump
                 and its CFM point if the jump turns out to be
                 hard-to-predict at run time. We describe the new
                 indirect jump predication architecture, provide code
                 examples showing why it could reduce the performance
                 impact of jumps, derive an analytical cost-benefit
                 model for deciding which jumps and targets to
                 predicate, and present preliminary evaluation
                 results.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Analytical models; and statically-scheduled
                 implementation; Computer languages; Computer Systems
                 Organization; control-flow merge point;
                 dynamically-scheduled; dynamically-scheduled and
                 statically-scheduled implementation; hard-to-predict
                 indirect jump handling; Hardware; Instruction fetch;
                 Instruction sets; interface call; jump table;
                 Micro-architecture implementation considerations;
                 Microarchitecture; microarchitecture dynamic
                 predication; Object oriented modeling; parallel
                 architectures; Performance analysis; Pipeline
                 processors; Pipelines; Processor Architectures; program
                 compiler; program compilers; program control
                 structures; programming language construct; Single Data
                 Stream Architectures; Superscalar; switch-case
                 statement; Switches; system monitoring; virtual
                 function call",
}

@Article{Das:2007:MMC,
  author =       "A. Das and S. Ozdemir and G. Memik and J. Zambreno and
                 A. Choudhary",
  title =        "Microarchitectures for Managing Chip Revenues under
                 Process Variations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "29--32",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As transistor feature sizes continue to shrink into
                 the sub-90 nm range and beyond, the effects of process
                 variations on critical path delay and chip yields have
                 amplified. A common concept to remedy the effects of
                 variation is speed-binning, by which chips from a
                 single batch are rated by a discrete range of
                 frequencies and sold at different prices. In this
                 paper, we discuss strategies to modify the number of
                 chips in different bins and hence enhance the profits
                 obtained from them. Particularly, we propose a scheme
                 that introduces a small Substitute Cache associated
                 with each cache way to replicate the data elements that
                 will be stored in the high latency lines. Assuming a
                 fixed pricing model, this method increases the revenue
                 by as much as 13.8\% without any impact on the
                 performance of the chips.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cache Memories; cache memory; cache storage; Circuits;
                 Computer Architecture; computer architecture; Computer
                 Architecture; Computer architecture; critical path
                 delay; Fabrication; Fault-tolerant Computing; fixed
                 pricing model; Frequency; Logic arrays;
                 Microarchitecture; microarchitecture chip;
                 microprocessor chips; Microprocessors; optimisation;
                 process variation; Process Variations; Registers; Size
                 control; Voltage control",
}

@Article{Zebchuk:2007:BBC,
  author =       "J. Zebchuk and A. Moshovos",
  title =        "A Building Block for Coarse-Grain Optimizations in the
                 On-Chip Memory Hierarchy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "33--36",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Current on-chip block-centric memory hierarchies
                 exploit access patterns at the fine-grain scale of
                 small blocks. Several recently proposed memory
                 hierarchy enhancements for coherence traffic reduction
                 and prefetching suggest that additional useful patterns
                 emerge with a macroscopic, coarse-grain view. This
                 paper presents RegionTracker, a dual-grain, on-chip
                 cache design that exposes coarse-grain behavior while
                 maintaining block-level communication. RegionTracker
                 eliminates the extraneous, often imprecise coarse-grain
                 tracking structures of previous proposals. It can be
                 used as the building block for coarse-grain
                 optimizations, reducing their overall cost and easing
                 their adoption. Using full-system simulation of a
                 quad-core chip multiprocessor and commercial workloads,
                 we demonstrate that RegionTracker overcomes the
                 inefficiencies of previous coarse-grain cache designs.
                 We also demonstrate how RegionTracker boosts the
                 benefits and reduces the cost of a previously proposed
                 snoop reduction technique.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access patterns; Bandwidth; cache storage; Cache
                 storage; coarse-grain optimizations; coherence traffic
                 reduction; Cost function; Design optimization;
                 Explosions; Information management; Memory management;
                 Multithreading; on-chip memory hierarchy; optimising
                 compilers; Prefetching; prefetching; Proposals;
                 quad-core chip multiprocessor; RegionTracker dual-grain
                 on-chip cache design; system-on-chip",
}

@Article{Kim:2007:FBT,
  author =       "J. Kim and J. Balfour and W. J. Dally",
  title =        "Flattened Butterfly Topology for On-Chip Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "37--40",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "With the trend towards an increasing number of cores
                 in multicore processors, the on-chip network that connects
                 the cores needs to scale efficiently. In this work, we
                 propose the use of high-radix networks in on-chip
                 networks and describe how the flattened butterfly
                 topology can be mapped to on-chip networks. By using
                 high-radix routers to reduce the diameter of the
                 network, the flattened butterfly offers lower latency
                 and energy consumption than conventional on-chip
                 topologies. In addition, by properly using bypass
                 channels in the flattened butterfly network,
                 non-minimal routing can be employed without increasing
                 latency or the energy consumption.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer networks; Delay; Energy
                 consumption; flattened butterfly; flattened butterfly
                 topology; high-radix networks; high-radix routers;
                 Laboratories; Multicore processing; multicore
                 processors; Multiprocessor interconnection networks;
                 Network topology; network topology; Network-on-a-chip;
                 network-on-chip; on-chip networks; Routing; topology",
}

@Article{Xiao:2007:NPD,
  author =       "X. Xiao and J. Lee",
  title =        "A Novel Parallel Deadlock Detection Algorithm and
                 Hardware for Multiprocessor System-on-a-Chip",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "41--44",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Given the projected dramatic increase in the number of
                 processors and resources in a system-on-a-chip, a
                 quadratic increase in the likelihood of deadlock is
                 predicted due to complex system behavior. To deal with
                 this issue, we here present a novel parallel
                 hardware-oriented deadlock detection algorithm with $
                 O(1) $ deadlock detection and $ O(\min (m, n)) $
                 preparation, where $m$ and $n$ are the numbers of
                 processes and resources, respectively. Our
                 contributions are (i) the first $ O(1)$ deadlock
                 detection hardware implementation and (ii) a new
                 algorithmic method of achieving $ O(\min (m, n))$
                 overall run-time complexity. We implement our algorithm
                 in Verilog HDL and demonstrate that deadlock detection
                 always takes only two clock cycles regardless of the
                 size of a system (i.e., $m$ and $n$).",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithms implemented in hardware; computational
                 complexity; deadlock detection hardware; Deadlocks;
                 Detection algorithms; Hardware design languages;
                 microprocessor chips; Multiprocessing systems;
                 multiprocessing systems; multiprocessor
                 system-on-a-chip; operating systems (computers);
                 Parallel algorithms; parallel algorithms; parallel
                 deadlock detection algorithm; Processor scheduling;
                 Real time systems; Real-time and embedded systems;
                 Resource management; run-time complexity; Runtime;
                 Software performance; System recovery; system-on-chip",
}

@Article{August:2007:UOS,
  author =       "D. August and J. Chang and S. Girbal and D.
                 Gracia-Perez and G. Mouchard and D. A. Penry and O.
                 Temam and N. Vachharajani",
  title =        "{UNISIM}: an Open Simulation Environment and Library
                 for Complex Architecture Design and Collaborative
                 Development",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "45--48",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Simulator development is already a huge burden for
                 many academic and industry research groups; future
                 complex or heterogeneous multi-cores, as well as the
                 multiplicity of performance metrics and required
                 functionality, will make matters worse. We present a
                 new simulation environment, called UNISIM, which is
                 designed to rationalize simulator development by making
                 it possible and efficient to distribute the overall
                 effort over multiple research groups, even without
                 direct cooperation. UNISIM achieves this goal with a
                 combination of modular software development,
                 distributed communication protocols, multilevel
                 abstract modeling, interoperability capabilities, a set
                 of simulator services APIs, and an open
                 library/repository for providing a consistent set of
                 simulator modules.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "API; application program interfaces; Collaboration;
                 collaborative development; complex architecture design;
                 Computational modeling; Computer architecture; Computer
                 industry; Computer science; Design engineering;
                 distributed communication protocols; groupware;
                 interoperability capability; Libraries; Measurement;
                 modular software development; multilevel abstract
                 modeling; open library; open repository; open
                 simulation environment; open systems; Operating
                 systems; Performance and Reliability; Processor
                 Architectures; Programming; simulator development;
                 simulator modules; simulator services; software
                 architecture; UNISIM",
}

@Article{Sendag:2007:BMP,
  author =       "R. Sendag and J. Yi and P. Chuang",
  title =        "Branch Misprediction Prediction: Complementary Branch
                 Predictors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "49--52",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we propose a new class of branch
                 predictors, complementary branch predictors, which can
                 be easily added to any branch predictor to improve the
                 overall prediction accuracy. This mechanism differs
                 from conventional branch predictors in that it focuses
                 only on mispredicted branches. As a result, this
                 mechanism has the advantages of scalability and
                 flexibility (can be implemented with any branch
                 predictor), but is not on the critical path. More
                 specifically, this mechanism improves the branch
                 prediction accuracy by predicting which future branch
                 will be mispredicted next and when that will occur, and
                 then it changes the predicted direction at the
                 predicted time. Our results show that a branch
                 predictor with the branch misprediction predictor
                 achieves the same prediction accuracy as a conventional
                 branch predictor that is 4 to 16 times larger, but
                 without significantly increasing the overall complexity
                 or lengthening the critical path.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; branch misprediction prediction; branch
                 predictor; computational complexity; Computer networks;
                 Costs; Delay; Emerging technologies; History; parallel
                 architectures; Performance loss; Pipeline processors;
                 Pipelines; Prediction algorithms; Scalability;
                 Testing",
}

@Article{Yalcin:2007:UTM,
  author =       "G. Yalcin and O. Ergin",
  title =        "Using tag-match comparators for detecting soft
                 errors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "53--56",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Soft errors caused by high energy particle strikes are
                 becoming an increasingly important problem in
                 microprocessor design. With increasing transistor
                 density and die sizes, soft errors are expected to be a
                 larger problem in the near future. Recovering from
                 these unexpected faults may be possible by reexecuting
                 some part of the program only if the error can be
                 detected. Therefore it is important to come up with new
                 techniques to detect soft errors and increase the
                 number of errors that are detected. Modern
                 microprocessors employ out-of-order execution and
                 dynamic scheduling logic. Comparator circuits, which
                 are used to keep track of data dependencies, are
                 usually idle. In this paper, we propose various schemes
                 to exploit on-chip comparators to detect transient
                 faults. Our results show that around 50\% of the errors
                 on the wakeup logic can be detected with minimal
                 hardware overhead by using the proposed techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "and Fault-Tolerance; Broadcasting; Circuit faults;
                 comparators (circuits); Computer errors; Control
                 Structure Reliability; dynamic scheduling logic;
                 Electrical fault detection; Fault detection;
                 identification technology; Logic; logic design; logic
                 testing; microprocessor chips; microprocessor design;
                 Microprocessors; Out of order; out-of-order execution;
                 Pipelines; Processor Architectures; Registers;
                 scheduling; soft error detection; tag-match comparator;
                 Testing; Testing and Fault-Tolerance",
}

@Article{Joao:2008:DPI,
  author =       "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
  title =        "Dynamic Predication of Indirect Jumps",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "1--4",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Indirect jumps are used to implement increasingly
                 common programming language constructs such as virtual
                 function calls, switch-case statements, jump tables,
                 and interface calls. Unfortunately, the prediction
                 accuracy of indirect jumps has remained low because
                 many indirect jumps have multiple targets that are
                 difficult to predict even with specialized hardware.
                 This paper proposes a new way of handling
                 hard-to-predict indirect jumps: dynamically predicating
                 them. The compiler identifies indirect jumps that are
                 suitable for predication along with their control-flow
                 merge (CFM) points. The microarchitecture predicates
                 the instructions between different targets of the jump
                 and its CFM point if the jump turns out to be
                 hard-to-predict at run time. We describe the new
                 indirect jump predication architecture, provide code
                 examples showing why it could reduce the performance
                 impact of jumps, derive an analytical cost-benefit
                 model for deciding which jumps and targets to
                 predicate, and present preliminary evaluation
                 results.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Analytical models; B Hardware; B.3 Memory
                 Structures; Cache memories; Computer languages;
                 Computer Systems Organization; Design Styles; Hardware;
                 Instruction sets; Microarchitecture;
                 Multi-core/single-chip multiprocessors; Object oriented
                 modeling; Parallel Architectures; Performance analysis;
                 Pipelines; Processor Architectures; Switches",
}

@Article{Das:2008:MMC,
  author =       "A. Das and S. Ozdemir and G. Memik and J. Zambreno and
                 A. Choudhary",
  title =        "Microarchitectures for Managing Chip Revenues under
                 Process Variations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "5--8",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As transistor feature sizes continue to shrink into
                 the sub-90nm range and beyond, the effects of process
                 variations on critical path delay and chip yields have
                 amplified. A common concept to remedy the effects of
                 variation is speed-binning, by which chips from a
                 single batch are rated by a discrete range of
                 frequencies and sold at different prices. In this
                 paper, we discuss strategies to modify the number of
                 chips in different bins and hence enhance the profits
                 obtained from them. Particularly, we propose a scheme
                 that introduces a small substitute cache associated
                 with each cache way to replicate the data elements that
                 will be stored in the high latency lines. Assuming a
                 fixed pricing model, this method increases the revenue
                 by as much as 13.8\% without any impact on the
                 performance of the chips.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cache Memories; Computer Architecture; Computer
                 architecture; Cost function; Delay effects; Design
                 optimization; Fabrication; Fault-tolerant Computing.;
                 Frequency; Manufacturing; Microarchitecture; Pricing;
                 Process Variations; Transistors",
}

@Article{Roth:2008:PRR,
  author =       "A. Roth",
  title =        "Physical register reference counting",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "9--12",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Several proposed techniques including CPR (checkpoint
                 processing and recovery) and NoSQ (no store queue) rely
                 on reference counting to manage physical registers.
                 However, the register reference counting mechanism
                 itself has received surprisingly little attention. This
                 paper fills this gap by describing potential register
                 reference counting schemes for NoSQ, CPR, and a
                 hypothetical NoSQ/CPR hybrid. Although previously
                 described in terms of binary counters, we find that
                 reference counts are actually more naturally
                 represented as matrices. Binary representations can be
                 used as an optimization in specific situations.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "and statically-scheduled implementation; binary
                 representations; checkpoint processing; checkpointing;
                 Counting circuits; dynamically-scheduled;
                 dynamically-scheduled and statically-scheduled
                 implementation; Engines; Information science; matrices;
                 Micro-architecture implementation considerations;
                 Microarchitecture; no store queue; physical register
                 reference counting; Physics computing; Proposals;
                 recovery technique; Registers; shift registers;
                 Superscalar",
}

@Article{Flich:2008:LBD,
  author =       "J. Flich and J. Duato",
  title =        "Logic-Based Distributed Routing for {NoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "13--16",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "The design of scalable and reliable interconnection
                 networks for multicore chips (NoCs) introduces new
                 design constraints like power consumption, area, and
                 ultra low latencies. Although 2D meshes are usually
                 proposed for NoCs, heterogeneous cores, manufacturing
                 defects, hard failures, and chip virtualization may
                 lead to irregular topologies. In this context,
                 efficient routing becomes a challenge. Although
                 switches can be easily configured to support most
                 routing algorithms and topologies by using routing
                 tables, this solution does not scale in terms of
                 latency and area. We propose a new circuit that removes
                 the need for using routing tables. The new mechanism,
                 referred to as logic-based distributed routing (LBDR),
                 enables the implementation in NoCs of many routing
                 algorithms for most of the practical topologies we
                 might find in the near future in a multicore chip. From
                 an initial topology and routing algorithm, a set of
                 three bits per switch output port is computed. By using
                  a small logic block, LBDR mimics (demonstrated by
                 evaluation) the behavior of routing algorithms
                 implemented with routing tables. This result is
                 achieved both in regular and irregular topologies.
                 Therefore, LBDR removes the need for using routing
                 tables for distributed routing, thus enabling flexible,
                 fast and power-efficient routing in NoCs.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "chip virtualization; circuit reliability; Circuit
                 topology; Delay; Energy consumption; heterogeneous
                 cores; interconnection network reliability;
                 interconnections; logic circuits; logic-based
                 distributed routing; Manufacturing; manufacturing
                 defects; Multi-core/single-chip multiprocessors;
                 Multicore processing; Multiprocessor interconnection
                 networks; network routing; network topology; Network
                 topology; Network-on-a-chip; network-on-chip; networks
                 for multicore chips; NoC; On-chip interconnection
                 networks; Routing; Switches",
}

@Article{Yoon:2008:CHP,
  author =       "J. H. Yoon and E. H. Nam and Y. J. Seong and H. Kim
                 and B. Kim and S. L. Min and Y. Cho",
  title =        "{Chameleon}: a High Performance Flash\slash {FRAM}
                 Hybrid Solid State Disk Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "17--20",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Flash memory solid state disk (SSD) is gaining
                 popularity and replacing hard disk drive (HDD) in
                 mobile computing systems such as ultra mobile PCs
                 (UMPCs) and notebook PCs because of lower power
                 consumption, faster random access, and higher shock
                 resistance. One of the key challenges in designing a
                 high-performance flash memory SSD is an efficient
                 handling of small random writes to non-volatile data
                 whose performance suffers from the inherent limitation
                  of flash memory that prohibits in-place update. In this
                 paper, we propose a high performance Flash/FRAM hybrid
                 SSD architecture called Chameleon. In Chameleon,
                 metadata used by the flash translation layer (FTL), a
                 software layer in the flash memory SSD, is maintained
                 in a small FRAM since this metadata is a target of
                 intensive small random writes, whereas the bulk data is
                 kept in the flash memory. Performance evaluation based
                 on an FPGA implementation of the Chameleon architecture
                 shows that the use of FRAM in Chameleon improves the
                 performance by 21.3\%. The results also show that even
                 for bulk data that cannot be maintained in FRAM because
                 of the size limitation, the use of fine-grained write
                 buffering is critically important because of the
                  inability of flash memory to perform in-place update of
                 data.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Chameleon; Computer architecture; Design studies; disc
                 drives; Energy consumption; Ferroelectric films; field
                 programmable gate arrays; flash memories; Flash memory;
                 flash memory solid state disk; flash translation layer;
                 flash-FRAM hybrid SSD architecture; FPGA
                 implementation; FTL; hard discs; hard disk drive; Hard
                 disks; HDD; Mass storage; memory architecture; Mobile
                 computing; mobile computing systems; Nonvolatile
                 memory; notebook PCs; Personal communication networks;
                 Random access memory; random-access storage; Solid
                 state circuits; SSD; ultra mobile PCs; UMPC",
}

@Article{Biswas:2008:CAA,
  author =       "A. Biswas and P. Racunas and J. Emer and S.
                 Mukherjee",
  title =        "Computing Accurate {AVFs} using {ACE} Analysis on
                 Performance Models: a Rebuttal",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "21--24",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "ACE (architecturally correct execution) analysis
                 computes AVFs (architectural vulnerability factors) of
                 hardware structures. AVF expresses the fraction of
                 radiation-induced transient faults that result in
                 user-visible errors. Architects usually perform this
                 analysis on a high-level performance model to quickly
                 compute per-structure AVFs. If, however, low-level
                 details of a microarchitecture are not modeled
                 appropriately, then their effects may not be reflected
                 in the per-structure AVFs. In this paper we refute
                 Wang, et al.'s (2007) claim that this detail is
                 difficult to model and imposes a practical threshold on
                 ACE analysis that forces its estimates to have a high
                 error margin. We show that carefully choosing a small
                 amount of additional detail can result in a much
                 tighter AVF bound than Wang, et al. were able to
                 achieve in their refined ACE analysis. Even the
                 inclusion of small details, such as read/write pointers
                 and appropriate inter-structure dependencies, can
                 increase the accuracy of the AVF computation by 40\% or
                 more. We argue that this is no different than modeling
                 the IPC (instructions per cycle) of a microprocessor
                 pipeline. A less detailed performance model will
                 provide less accurate IPCs. AVFs are no different.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "and Fault-Tolerance; architectural vulnerability
                 factors; architecturally correct execution analysis;
                 Computational modeling; Hardware; hardware structures;
                 High performance computing; instructions per cycle;
                 inter-structure dependencies; Microarchitecture;
                 microprocessor pipeline; Microprocessors; Performance
                 analysis; Performance and Reliability; performance
                 evaluation; performance models; Pipelines; Protection;
                 radiation-induced transient faults; read pointers;
                 Reliability; Target tracking; Testing; Testing and
                 Fault-Tolerance; user-visible errors; write pointers",
}

@Article{Cho:2008:CAL,
  author =       "S. Cho and R. Melhem",
  title =        "Corollaries to {Amdahl's Law} for Energy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "25--28",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper studies the important interaction between
                 parallelization and energy consumption in a
                 parallelizable application. Given the ratio of serial
                 and parallel portion in an application and the number
                 of processors, we first derive the optimal frequencies
                 allocated to the serial and parallel regions in the
                 application to minimize the total energy consumption,
                 while the execution time is preserved (i.e., speedup =
                 1). We show that dynamic energy improvement due to
                 parallelization has a function rising faster with the
                 increasing number of processors than the speed
                 improvement function given by the well-known Amdahl's
                 Law. Furthermore, we determine the conditions under
                 which one can obtain both energy and speed improvement,
                 as well as the amount of improvement. The formulas we
                 obtain capture the fundamental relationship between
                 parallelization, speedup, and energy consumption and
                 can be directly utilized in energy aware processor
                 resource management. Our results form a basis for
                 several interesting research directions in the area of
                 power and energy aware parallel processing.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Amdahl's Law; Application software; Computer science;
                 Concurrent computing; dynamic energy improvement;
                 energy aware processor resource management; Energy
                 capture; energy consumption; Energy consumption; energy
                 consumption; Energy management; Equations; Hardware;
                 Parallel Architectures; parallel processing; Parallel
                 processing; parallelization; Power Management; Radio
                 spectrum management; Resource management",
}

@Article{Balfour:2008:EEP,
  author =       "J. Balfour and W. Dally and D. Black-Schaffer and V.
                 Parikh and J. Park",
  title =        "An Energy-Efficient Processor Architecture for
                 Embedded Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "29--32",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present an efficient programmable architecture for
                 compute-intensive embedded applications. The processor
                 architecture uses instruction registers to reduce the
                 cost of delivering instructions, and a hierarchical and
                 distributed data register organization to deliver data.
                 Instruction registers capture instruction reuse and
                  locality in inexpensive storage structures that are
                 located near to the functional units. The data register
                 organization captures reuse and locality in different
                 levels of the hierarchy to reduce the cost of
                 delivering data. Exposed communication resources
                 eliminate pipeline registers and control logic, and
                 allow the compiler to schedule efficient instruction
                 and data movement. The architecture keeps a significant
                 fraction of instruction and data bandwidth local to the
                 functional units, which reduces the cost of supplying
                 instructions and data to large numbers of functional
                 units. This architecture achieves an energy efficiency
                 that is 23x greater than an embedded RISC processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Communication system control; compute-intensive
                 embedded applications; Computer applications; computer
                 architecture; Computer architecture; Costs; data
                 movement; distributed data register organization;
                 Embedded computing; embedded RISC processor; Embedded
                 system; embedded systems; Energy efficiency;
                 energy-efficient processor architecture; hierarchical
                 organization; inexpensive storage structures;
                 instruction registers; instruction sets; Logic; Mobile
                 processors; pipeline processing; pipeline registers;
                 Pipelines; Registers",
}

@Article{Anonymous:2008:FC,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "c1--c1",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the front cover for this issue of the
                 publication.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2008:EBC,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "c2--c2",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Pao:2008:PAM,
  author =       "D. Pao and W. Lin and B. Liu",
  title =        "Pipelined Architecture for Multi-String Matching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "33--36",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This letter presents a new oblivious routing algorithm
                 for 3D mesh networks called randomized
                 partially-minimal (RPM) routing that provably achieves
                  optimal worst-case throughput for 3D meshes when the
                  network radix k is even and within a factor of $1/k^2$ of
                 optimal when k is odd. Although this optimality result
                 has been achieved with the minimal routing algorithm
                  O1TURN for the 2D case, the worst-case throughput of
                  O1TURN degrades tremendously in higher dimensions.
                 Other existing routing algorithms suffer from either
                 poor worst-case throughput (DOR, ROMM) or poor latency
                 (VAL). RPM on the other hand achieves near optimal
                 worst-case and good average-case throughput as well as
                  good latency performance.",
  remark =       "The abstract above appears to duplicate that of
                  \cite{Ramanujam:2008:RPM} and does not match the
                  article title; the correct abstract should be
                  verified against the publisher record.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D mesh networks; Automata; computer architecture;
                 Computer architecture; Computer science; Costs;
                 deterministic finite automaton; Hardware; Intrusion
                 detection; network intrusion detection; network radix;
                  O1TURN; Partial response channels; pipelined
                 processing; Pipelines; randomized partially-minimal
                 routing; string matching; Table lookup;
                 three-dimensional mesh networks; Throughput",
}

@Article{Ramanujam:2008:RPM,
  author =       "R. Sunkam Ramanujam and B. Lin",
  title =        "Randomized Partially-Minimal Routing on
                 Three-Dimensional Mesh Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "37--40",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This letter presents a new oblivious routing algorithm
                  for 3D mesh networks called Randomized Partially-Minimal
                  (RPM) routing that provably achieves optimal
                 worst-case throughput for 3D meshes when the network
                  radix k is even and within a factor of $1/k^2$ of optimal
                 when k is odd. Although this optimality result has been
                 achieved with the minimal routing algorithm O1TURN [9]
                 for the 2D case, the worst-case throughput of O1TURN
                 degrades tremendously in higher dimensions. Other
                 existing routing algorithms suffer from either poor
                 worst-case throughput (DOR [10], ROMM [8]) or poor
                 latency (VAL [14]). RPM on the other hand achieves near
                 optimal worst-case and good average-case throughput as
                 well as good latency performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Degradation; Delay; Emerging technologies; Fabrics;
                 Interconnection architectures; Mesh networks; Network
                 communications; Network topology; On-chip
                 interconnection networks; Packet-switching networks;
                 Routing; Silicon; Technological innovation;
                 Telecommunication traffic; Throughput",
}

@Article{Black-Schaffer:2008:HIR,
  author =       "D. Black-Schaffer and J. Balfour and W. Dally and V.
                 Parikh and J. Park",
  title =        "Hierarchical Instruction Register Organization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "41--44",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper analyzes a range of architectures for
                 efficient delivery of VLIW instructions for embedded
                 media kernels. The analysis takes an efficient filter
                 cache as a baseline and examines the benefits from (1)
                 removing the tag overhead, (2) distributing the
                 storage, (3) adding indirection, (4) adding efficient
                 NOP generation, and (5) sharing instruction memory. The
                 result is a hierarchical instruction register
                 organization that provides a 56\% energy and 40\% area
                 savings over an already efficient filter cache.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; Cache storage; Computer aided
                 instruction; Computer architecture; Computer integrated
                 manufacturing; distributed shared memory systems;
                 Embedded computing; embedded media kernel; embedded
                 processor architecture; embedded systems; filter cache;
                 Filters; hierarchical instruction register
                 organization; Instruction fetch; instruction memory
                 sharing; instruction sets; Kernel; Laboratories;
                 Low-power design; NOP generation; parallel
                 architectures; Registers; RISC/CISC; VLIW; VLIW
                 architectures; VLIW instruction delivery",
}

@Article{Lee:2008:PDD,
  author =       "J. Lee and X. Xiao",
  title =        "A Parallel Deadlock Detection Algorithm with {$ O(1)
                 $} Overall Run-time Complexity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "45--48",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This article proposes a novel parallel,
                 hardware-oriented deadlock detection algorithm for
                 multiprocessor system-on-chips. The proposed algorithm
                 takes full advantage of hardware parallelism in
                 computation and maintains information needed by
                 deadlock detection through classifying all resource
                 allocation events and performing class specific
                 operations, which together make the overall run-time
                 complexity of the new method O(1). We implement the
                 proposed algorithm in Verilog HDL and demonstrate in
                 the simulation that each algorithm invocation takes at
                 most four clock cycles in hardware.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithms implemented in hardware; clock cycle;
                 Computational modeling; Concurrent computing;
                 Deadlocks; Detection algorithms; Event detection;
                 hardware description languages; Hardware design
                 languages; hardware-oriented deadlock detection;
                 Multiprocessing systems; multiprocessing systems;
                 multiprocessor system-on-chips; operating systems
                 (computers); parallel deadlock detection; Parallel
                 processing; Real-time and embedded systems; resource
                 allocation; Resource management; run-time complexity;
                 Runtime; System recovery; system-on-chip; Verilog HDL",
}

@Article{GomezRequena:2008:BFT,
  author =       "C. {Gomez Requena} and F. Gilabert Villamon and M.
                 Gomez and P. Lopez and J. Duato",
  title =        "Beyond Fat-tree: Unidirectional Load-Balanced
                 Multistage Interconnection Network",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "49--52",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  note =         "See comment \cite{Antelo:2009:CBF}.",
  abstract =     "The fat-tree is one of the most widely-used topologies
                 by interconnection network manufacturers. Recently, it
                 has been demonstrated that a deterministic routing
                 algorithm that optimally balances the network traffic
                 can not only achieve almost the same performance than
                 an adaptive routing algorithm but also outperforms it.
                 On the other hand, fat-trees require a high number of
                 switches with a non-negligible wiring complexity. In
                 this paper, we propose replacing the fat-tree by a
                 unidirectional multistage interconnection network
                 (UMIN) that uses a traffic balancing deterministic
                 routing algorithm. As a consequence, switch hardware is
                 almost reduced to the half, decreasing, in this way,
                 the power consumption, the arbitration complexity, the
                 switch size itself, and the network cost. Preliminary
                 evaluation results show that the UMIN with the load
                 balancing scheme obtains lower latency than fat-tree
                 for low and medium traffic loads. Furthermore, in
                 networks with a high number of stages or with high
                 radix switches, it obtains the same, or even higher,
                 throughput than fat-tree.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adaptive routing algorithm; Butterfly Network;
                 computational complexity; Cost-efficiency; Costs;
                 Deterministic Routing; Energy consumption; Fat-trees;
                 Hardware; interconnection network manufacturers;
                 Manufacturing; Multiprocessor interconnection networks;
                 Multistage Interconnection Networks; Network
                 Architecture and Design; Network topology; network
                 traffic; nonnegligible wiring complexity; power
                 consumption; radix switches; Routing; Switches;
                 telecommunication network routing; telecommunication
                 switching; Telecommunication traffic; telecommunication
                 traffic; Traffic Balancing; traffic balancing
                 deterministic routing algorithm; trees (mathematics);
                 unidirectional load-balanced multistage interconnection
                 network; Wiring",
}

@Article{Li:2008:TAN,
  author =       "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun",
  title =        "Transaction-Aware Network-on-Chip Resource
                 Reservation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "53--56",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Performance and scalability are critically-important
                 for on-chip interconnect in many-core
                 chip-multiprocessor systems. Packet-switched
                 interconnect fabric, widely viewed as the de facto
                 on-chip data communication backplane in the many-core
                 era, offers high throughput and excellent scalability.
                 However, these benefits come at the price of router
                 latency due to run-time multi-hop data buffering and
                 resource arbitration. The network accounts for a
                 majority of on-chip data transaction latency. In this
                 work, we propose dynamic in-network resource
                 reservation techniques to optimize run-time on-chip
                 data transactions. This idea is motivated by the need
                 to preserve existing abstraction and general-purpose
                 network performance while optimizing for
                 frequently-occurring network events such as data
                 transactions. Experimental studies using multithreaded
                 benchmarks demonstrate that the proposed techniques can
                 reduce on-chip data access latency by 28.4\% on average
                 in a 16-node system and 29.2\% on average in a 36-node
                 system.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Backplanes; buffer storage; Computer buffers; data
                 communication; Data communication; de facto on-chip
                 data communication backplane; Delay; dynamic in-network
                 resource reservation techniques; Fabrics;
                 frequently-occurring network events; Interconnection
                 architectures; Interconnections (Subsystems); many-core
                 chip-multiprocessor systems; multiprocessor
                 interconnection networks; Network-on-a-chip; on-chip
                 data transaction latency; On-chip interconnection
                 networks; packet switching; packet-switched
                 interconnect fabric; Parallel Architectures; resource
                 allocation; router latency; run-time multihop data
                 buffering; Runtime; Scalability; System-on-a-chip;
                 telecommunication network routing; Throughput;
                 transaction-aware network-on-chip resource
                 reservation",
}

@Article{Fide:2008:PUS,
  author =       "S. Fide and S. Jenks",
  title =        "Proactive Use of Shared {L3} Caches to Enhance Cache
                 Communications in Multi-Core Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "57--60",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The software and hardware techniques to exploit the
                 potential of multi-core processors are falling behind,
                 even though the number of cores and cache levels per
                 chip is increasing rapidly. There is no explicit
                 communications support available, and hence inter-core
                 communications depend on cache coherence protocols,
                 resulting in demand-based cache line transfers with
                 their inherent latency and overhead. In this paper, we
                 present software controlled eviction (SCE) to improve
                 the performance of multithreaded applications running
                 on multi-core processors by moving shared data to
                 shared cache levels before it is demanded from remote
                 private caches. Simulation results show that SCE offers
                 significant performance improvement (8-28\%) and
                 reduces L3 cache misses by 88-98\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache coherence protocol; cache communication; cache
                 storage; Concurrent computing; Control systems;
                 Degradation; Delay; demand-based cache line transfer;
                 Hardware; intercore communications; microprocessor
                 chips; Multi-core/single-chip multiprocessors;
                 multi-threading; Multicore processing; multicore
                 processors; multithreaded application; Parallel
                 processing; Protocols; shared L3 cache; shared memory
                 systems; software controlled eviction; Software
                 performance; Support for multi-threaded execution",
}

@Article{Walter:2008:BBE,
  author =       "I. Walter and I. Cidon and A. Kolodny",
  title =        "{BENoC}: a Bus-Enhanced Network on-Chip for a Power
                 Efficient {CMP}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "61--64",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Network-on-chips (NoCs) outperform buses in terms of
                 scalability, parallelism and system modularity and
                 therefore are considered as the main interconnect
                 infrastructure in future chip multi-processor (CMP).
                 However, while NoCs are very efficient for delivering
                 high throughput point-to-point data from sources to
                 destinations, their multi-hop operation is too slow for
                 latency sensitive signals. In addition, current NoCs
                 are inefficient for broadcast operations and
                 centralized control of CMP resources. Consequently,
                 state-of-the-art NoCs may not facilitate the needs of
                 future CMP systems. In this paper, the benefit of
                 adding a low latency, customized shared bus as an
                 internal part of the NoC architecture is explored.
                 BENoC (bus-enhanced network on-chip) possesses two main
                 advantages: First, the bus is inherently capable of
                 performing broadcast transmission in an efficient
                 manner. Second, the bus has lower and more predictable
                 propagation latency. In order to demonstrate the
                 potential benefit of the proposed architecture, an
                 analytical comparison of the power saving in BENoC
                 versus a standard NoC providing similar services is
                 presented. Then, simulation is used to evaluate BENoC
                 in a dynamic non-uniform cache access (DNUCA)
                 multiprocessor system.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "broadcast transmission; Broadcasting; bus-enhanced
                 network-on-chip; Centralized control; chip
                 multiprocessor; Delay; dynamic nonuniform cache access;
                 integrated circuit interconnections; interconnect
                 infrastructure; Interconnection architectures;
                 low-power electronics; microprocessor chips;
                 multiprocessing systems; Multiprocessing systems;
                 Multiprocessor interconnection networks;
                 Network-on-a-chip; network-on-chip; NoC; On-chip
                 interconnection networks; power efficient CMP; Power
                 system interconnection; propagation latency;
                 Scalability; system buses; System-on-a-chip;
                 Throughput",
}

@Article{Golander:2008:DDS,
  author =       "A. Golander and S. Weiss and R. Ronen",
  title =        "{DDMR}: Dynamic and Scalable Dual Modular Redundancy
                 with Short Validation Intervals",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "65--68",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "DMR (dual modular redundancy) was suggested for
                 increasing reliability. Classical DMR consists of pairs
                 of cores that check each other and are pre-connected
                 during manufacturing by dedicated links. In this paper
                 we introduce the dynamic dual modular redundancy (DDMR)
                 architecture. DDMR supports run-time scheduling of
                 redundant threads, which has significant benefits
                 relative to static binding. To allow dynamic pairing,
                 DDMR replaces the special links with a novel ring
                 architecture. DDMR uses short instruction sequences for
                 validation, smaller than the processor reorder buffer.
                 Such short sequences reduce latencies in parallel
                 programs and save resources needed to buffer
                 uncommitted data. DDMR scales with the number of cores
                 and may be used in large multicore architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "buffer storage; DDMR; Delay; dynamic dual modular
                 redundancy; Job shop scheduling; Joining processes;
                 Manufacturing; Multi-core/single-chip multiprocessors;
                 multicore architectures; Multicore processing; parallel
                 architectures; parallel programs; processor reorder
                 buffer; processor scheduling; Processor scheduling;
                 Proposals; Redundancy; Redundant design; ring
                 architecture; run-time scheduling; scalable dual
                 modular redundancy; short validation intervals;
                 Transistors",
}

@Article{Anonymous:2008:IA,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "c3--c3",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides instructions and guidelines to prospective
                 authors who wish to submit manuscripts.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2008:ICS,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover 4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "c4--c4",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Ramanujam:2009:WRR,
  author =       "Rohit Sunkam Ramanujam and Bill Lin",
  title =        "Weighted Random Routing on Torus Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we introduce a new closed-form
                 oblivious routing algorithm called W2TURN that is
                 worst-case throughput optimal for 2D-torus networks.
                 W2TURN is based on a weighted random selection of paths
                 that contain at most two turns. In terms of average hop
                 count, W2TURN outperforms the best previously known
                 closed-form worst-case throughput optimal routing
                 algorithm called IVAL [7]. In addition, we present a
                 new optimal weighted random routing algorithm for rings
                 called WRD.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ramanujam, RS (Reprint Author), Univ Calif San Diego,
                 San Diego, CA 92103 USA. Ramanujam, Rohit Sunkam; Lin,
                 Bill, Univ Calif San Diego, San Diego, CA 92103 USA.",
  author-email = "rsunkamr@ucsd.edu billlin@ucsd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "2D-torus networks; Algorithm design and analysis;
                 closed-form oblivious routing algorithm; Data
                 communications; Delay; Interconnection network;
                 internetworking; IVAL; latency; Measurement;
                 Multiprocessor interconnection networks;
                 Network-on-a-chip; oblivious routing; Oblivious
                 Routing; On-chip interconnection networks; optimal
                 weighted random routing algorithm; Routing; Runtime;
                 System recovery; telecommunication network routing;
                 throughput; Throughput; torus network; Torus Network;
                 W2TURN; weighted random path selection",
  number-of-cited-references = "8",
  ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
  research-areas = "Computer Science",
  researcherid-numbers = "Lin, Binshan/A-9772-2009",
  times-cited =  "2",
  unique-id =    "Ramanujam:2009:WRR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Ahn:2009:MDE,
  author =       "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber
                 and Norman P. Jouppi",
  title =        "Multicore {DIMM}: an Energy Efficient Memory Module
                 with Independently Controlled {DRAMs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Demand for memory capacity and bandwidth keeps
                 increasing rapidly in modern computer systems, and
                 memory power consumption is becoming a considerable
                 portion of the system power budget. However, the
                 current DDR DIMM standard is not well suited to
                 effectively serve CMP memory requests from both a power
                 and performance perspective. We propose a new memory
                 module called a Multicore DIMM, where DRAM chips are
                 grouped into multiple virtual memory devices, each of
                 which has its own data path and receives separate
                 commands (address and control signals). The Multicore
                 DIMM is designed to improve the energy efficiency of
                 memory systems with small impact on system performance.
                 Dividing each memory module into 4 virtual memory
                 devices brings a simultaneous 22\%, 7.6\%, and 18\%
                 improvement in memory power, IPC, and system
                 energy-delay product respectively on a set of
                 multithreaded applications and consolidated
                 workloads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ahn, JH (Reprint Author), Hewlett Packard Labs,
                 Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber,
                 Robert S.; Jouppi, Norman P., Hewlett Packard Labs,
                 Mississauga, ON, Canada. Leverich, Jacob, Stanford
                 Univ, Stanford, CA 94305 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; CMP memory requests; Control systems; DDR
                 DIMM standard; DRAM; DRAM chips; Energy consumption;
                 Energy efficiency; energy efficiency; energy efficient
                 memory module; Energy-aware systems; Error correction
                 codes; independently controlled DRAM; Jacobian
                 matrices; memory capacity; memory module; memory power
                 consumption; Memory Structures; memory system;
                 microprocessor chips; Multicore; multicore DIMM;
                 Multicore processing; Proposals; Random access memory;
                 System performance; system power budget; virtual memory
                 devices",
  number-of-cited-references = "16",
  ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
  research-areas = "Computer Science",
  researcherid-numbers = "Ahn, Jung Ho/D-1298-2013",
  times-cited =  "26",
  unique-id =    "Ahn:2009:MDE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wang:2009:PST,
  author =       "Po-Han Wang and Yen-Ming Chen and Chia-Lin Yang and
                 Yu-Jung Cheng",
  title =        "A Predictive Shutdown Technique for {GPU} Shader
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As technology continues to shrink, reducing leakage is
                 critical to achieve energy efficiency. Previous works
                 on low-power GPU (Graphics Processing Unit) focus on
                 techniques for dynamic power reduction, such as DVFS
                 (Dynamic Voltage/Frequency Scaling) and clock gating.
                 In this paper, we explore the potential of adopting
                 architecture-level power gating techniques for leakage
                 reduction on GPU. In particular, we focus on the most
                 power-hungry components, shader processors. We observe
                 that, due to different scene complexity, the required
                 shader resources to satisfy the target frame rate
                 actually vary across frames. Therefore, we propose the
                 Predictive Shader Shutdown technique to exploit
                 workload variation across frames for leakage reduction
                 on shader processors. The experimental results show
                 that Predictive Shader Shutdown achieves up to 46\%
                 leakage reduction on shader processors with negligible
                 performance degradation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, PH (Reprint Author), Natl Taiwan Univ, Dept Comp
                 Sci \& Informat Engn, Taipei 10764, Taiwan. Wang,
                 Po-Han; Chen, Yen-Ming; Yang, Chia-Lin, Natl Taiwan
                 Univ, Dept Comp Sci \& Informat Engn, Taipei 10764,
                 Taiwan. Cheng, Yu-Jung, Natl Taiwan Univ, Grad Inst
                 Networking \& Multimedia, Taipei 10764, Taiwan.",
  author-email = "r96002@csie.ntu.edu.tw r95125@csie.ntu.edu.tw
                 yangc@csie.ntu.edu.tw d96944002@ntu.edu.tw",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Institute for Information Industry of
                 Taiwan [97-FS-C03]; National Taiwan University
                 [97R0062-05]",
  funding-text = "This work was partially supported by the Institute for
                 Information Industry of Taiwan under project No.
                 97-FS-C03, and by the Excellent Research Projects of
                 National Taiwan University, 97R0062-05.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecture-level power gating techniques; Central
                 Processing Unit; Circuits; clock gating; Clocks;
                 computer architecture; computer graphic equipment;
                 Computer science; coprocessors; Degradation; dynamic
                 power reduction; Dynamic voltage scaling; dynamic
                 voltage-frequency scaling; Energy efficiency;
                 Energy-aware systems; Frequency; GPU; GPU shader
                 processors; Graphics; graphics processing unit; Layout;
                 leakage; Low-power design; power aware computing; power
                 gating; predictive shader shutdown technique",
  number-of-cited-references = "15",
  ORCID-numbers = "YANG, CHIA-LIN/0000-0003-0091-5027",
  research-areas = "Computer Science",
  times-cited =  "10",
  unique-id =    "Wang:2009:PST",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Barnes:2009:XBA,
  author =       "Christopher Barnes and Pranav Vaidya and Jaehwan John
                 Lee",
  title =        "An {XML}-Based {ADL} Framework for Automatic
                 Generation of Multithreaded Computer Architecture
                 Simulators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Computer architecture simulation has always played a
                 pivotal role in continuous innovation of computers.
                 However, constructing or modifying a high quality
                 simulator is time consuming and error-prone. Thus,
                 often Architecture Description Languages (ADLs) are
                 used to provide an abstraction layer for describing the
                 computer architecture and automatically generating
                 corresponding simulators. Along the line of such
                 research, we present a novel XML-based ADL, its
                 compiler, and a generation methodology to automatically
                 generate multithreaded simulators for computer
                 architecture. We utilize the industry-standard
                 extensible markup language XML to describe the
                 functionality and architecture of a modeled processor.
                 Our ADL framework allows users to easily and quickly
                 modify the structure, register set, and execution of a
                 modeled processor. To prove its validity, we have
                 generated several multithreaded simulators with
                 different configurations based on the MIPS five-stage
                 processor, and successfully tested with two programs.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "IUPUI RSFG",
  funding-text = "This research was funded by the IUPUI RSFG grant.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstraction layer; Architecture description languages;
                 automatic generation; C.0.d Modeling of computer
                 architecture; C.1.1.b Pipeline processors;
                 Computational modeling; computer architecture; Computer
                 architecture; Computer simulation; Concurrent
                 computing; extensible markup language-architecture
                 description language; Kernel; MIPS five-stage
                 processor; Modeling of computer architecture;
                 multi-threading; multithreaded computer architecture
                 simulator; Object oriented modeling; Pipeline
                 processors; Pipelines; program compilers; program
                 verification; Testing; validity testing; XML; XML-based
                 ADL framework",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Barnes:2009:XBA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, IEEE CAL 8(1), 2009, pp. 17--20; carries Web of
%%% Science metadata (affiliation, funding, ORCID/ResearcherID numbers).
@Article{Luque:2009:CAC,
  author =       "Carlos Luque and Miquel Moreto and Francisco J.
                 Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu
                 and Mateo Valero",
  title =        "{CPU} Accounting in {CMP} Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Chip-MultiProcessors (CMP) introduce complexities when
                 accounting CPU utilization to processes because the
                 progress done by a process during an interval of time
                 highly depends on the activity of the other processes
                 it is co-scheduled with. We propose a new hardware
                 accounting mechanism to improve the accuracy when
                 measuring the CPU utilization in CMPs and compare it
                 with the previous accounting mechanisms. Our results
                 show that currently known mechanisms could lead to a
                 12\% average error when it comes to CPU utilization
                 accounting. Our proposal reduces this error to less
                 than 1\% in a modeled 4-core processor system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Luque, C (Reprint Author), Univ Politecn Cataluna,
                 E-08028 Barcelona, Spain. Luque, Carlos; Moreto,
                 Miquel; Valero, Mateo, Univ Politecn Cataluna, E-08028
                 Barcelona, Spain. Cazorla, Francisco J.; Valero, Mateo,
                 Barcelona Supercomp Ctr, Barcelona, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Ministry of Science and Technology of Spain
                 [TIN-2007-60625, BES-2008-003683, AP-2005-3318]; HiPEAC
                 Network of Excellence [IST-004408]; IBM Research; IBM
                 Deep Computing organizations",
  funding-text = "This work has been supported by the Ministry of
                 Science and Technology of Spain under contract
                 TIN-2007-60625 and grants BES-2008-003683 and
                 AP-2005-3318, by the HiPEAC Network of Excellence
                 (IST-004408) and a Collaboration Agreement between IBM
                 and BSC with funds from IBM Research and IBM Deep
                 Computing organizations. The authors would like to
                 thank Pradip Bose and Chen-Yong Cher from IBM for their
                 technical support.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "4-core processor system; Bandwidth; Cache memory;
                 chip-multiprocessor architecture; Clocks; CMP processor
                 system; CPU utilization accounting; data center;
                 General; Hardware; hardware accounting mechanism;
                 Hardware/software interfaces; Kernel; microprocessor
                 chips; Multi-core/single-chip multiprocessors;
                 multiprocessing systems; operating system task
                 scheduling; Operating systems; process scheduling;
                 processor scheduling; Proposals; resource allocation;
                 Semiconductor device measurement; Switches",
  number-of-cited-references = "11",
  oa =           "Green Published",
  ORCID-numbers = "Moreto Planas, Miquel/0000-0002-9848-8758 Cazorla,
                 Francisco/0000-0002-3344-376X Luque,
                 Carlos/0000-0003-0442-0785 Valero,
                 Mateo/0000-0003-2917-2482 Gioiosa,
                 Roberto/0000-0001-9430-2656",
  research-areas = "Computer Science",
  researcherid-numbers = "Moreto Planas, Miquel/C-1823-2016 Cazorla,
                 Francisco/D-7261-2016 Luque, Carlos/E-2110-2019 Valero,
                 Mateo/L-5709-2014",
  times-cited =  "5",
  unique-id =    "Luque:2009:CAC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, IEEE CAL 8(1), 2009, pp. 21--24; NoC router
%%% microarchitecture letter with Web of Science metadata.
@Article{Soteriou:2009:HTD,
  author =       "Vassos Soteriou and Rohit Sunkam Ramanujam and Bill
                 Lin and Li-Shiuan Peh",
  title =        "A High-Throughput Distributed Shared-Buffer {NoC}
                 Router",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Microarchitectural configurations of buffers in
                 routers have a significant impact on the overall
                 performance of an on-chip network (NoC). This buffering
                 can be at the inputs or the outputs of a router,
                 corresponding to an input-buffered router (IBR) or an
                 output-buffered router (OBR). OBRs are attractive
                 because they have higher throughput and lower queuing
                 delays under high loads than IBRs. However, a direct
                 implementation of OBRs requires a router speedup equal
                 to the number of ports, making such a design
                 prohibitive given the aggressive clocking and power
                 budgets of most NoC applications. In this letter, we
                 propose a new router design that aims to emulate an OBR
                 practically based on a distributed shared-buffer (DSB)
                 router architecture. We introduce innovations to
                 address the unique constraints of NoCs, including
                 efficient pipelining and novel flow control. Our DSB
                 design can achieve significantly higher bandwidth at
                 saturation, with an improvement of up to 20\% when
                 compared to a state-of-the-art pipelined IBR with the
                 same amount of buffering, and our proposed
                 microarchitecture can achieve up to 94\% of the ideal
                 saturation throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ramanujam, Rohit Sunkam; Lin, Bill, Univ Calif San
                 Diego, San Diego, CA 92103 USA. Peh, Li-Shiuan,
                 Princeton Univ, Princeton, NJ 08544 USA.",
  author-email = "vassos.soteriou@cut.ac.cy rsunkamr@ucsd.edu
                 billlin@ucsd.edu peh@princeton.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; buffer circuits; Clocks; Computer
                 architecture; configuration management; Delay;
                 distributed shared-buffer; Interconnection
                 architectures; Internet; microarchitectural
                 configurations; Microarchitecture; network routing;
                 Network-on-a-chip; network-on-chip; NoC router; On-chip
                 interconnection networks; output-buffered router;
                 Pipeline processing; router architecture; Router
                 micro-architecture; Technological innovation;
                 Throughput",
  keywords-plus = "ARCHITECTURE",
  number-of-cited-references = "16",
  ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X Soteriou,
                 Vassos/0000-0002-2818-0459",
  research-areas = "Computer Science",
  researcherid-numbers = "Lin, Binshan/A-9772-2009 Soteriou,
                 Vassos/H-4603-2014",
  times-cited =  "15",
  unique-id =    "Soteriou:2009:HTD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, IEEE CAL 8(1), 2009, pp. 25--28.
%%% NOTE(review): repaired IEEE Xplore entity mojibake in the keywords
%%% field: ``Intelpsilas'' (a garbled &psila; apostrophe) -> ``Intel's''.
@Article{Guz:2009:MCV,
  author =       "Zvika Guz and Evgeny Bolotin and Idit Keidar and
                 Avinoam Kolodny and Avi Mendelson and Uri C. Weiser",
  title =        "Many-Core vs. Many-Thread Machines: Stay Away From the
                 Valley",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We study the tradeoffs between Many-Core machines like
                 Intel's Larrabee and Many-Thread machines like Nvidia
                 and AMD GPGPUs. We define a unified model describing a
                 superposition of the two architectures, and use it to
                 identify operation zones for which each machine is more
                 suitable. Moreover, we identify an intermediate zone in
                 which both machines deliver inferior performance. We
                 study the shape of this ``performance valley'' and
                 provide insights on how it can be avoided.",
  acknowledgement = ack-nhfb,
  affiliation =  "Guz, Z (Reprint Author), Technion Israel Inst Technol,
                 EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar,
                 Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel
                 Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin,
                 Evgeny, Intel Corp, Santa Clara, CA 95051 USA.
                 Mendelson, Avi, Microsoft Corp, Redmond, WA 98052
                 USA.",
  author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com
                 idish@ee.technion.ac.il kolodny@ee.technion.ac.il
                 avim@microsoft.com uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Semiconductors Research Corporation (SRC);
                 Intel; Israeli Ministry of Science Knowledge Center on
                 Chip MultiProcessors",
  funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner.
                 This work was partially supported by Semiconductors
                 Research Corporation (SRC), Intel, and the Israeli
                 Ministry of Science Knowledge Center on Chip
                 MultiProcessors.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AMD GPGPU; architecture superposition; Bandwidth; Chip
                 Multiprocessors; Computer Systems; coprocessors; Delay;
                 Engines; Equations; GPGPU; Graphics; Intel's
                 Larrabee; many-core machines; many-thread machines;
                 Multi-core/single-chip multiprocessors;
                 multi-threading; multiprocessing systems; Nvidia GPGPU;
                 Parallel Architectures; parallel architectures;
                 Parallel processing; performance valley; Processor
                 Architectures; Shape",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "27",
  unique-id =    "Guz:2009:MCV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, IEEE CAL 8(1), 2009, pp. 29--32.
%%% NOTE(review): removed an exact duplicate ``Embedded Systems'' term
%%% from the keywords list (it appeared twice in the original record).
@Article{Desai:2009:AIC,
  author =       "Aniruddha Desai and Jugdutt Singh",
  title =        "Architecture Independent Characterization of Embedded
                 {Java} Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "This paper presents architecture independent
                 characterization of embedded Java workloads based on
                 the industry standard GrinderBench benchmark which
                 includes different classes of real world embedded Java
                 applications. This work is based on a custom built
                 embedded Java Virtual Machine (JVM) simulator
                 specifically designed for embedded JVM modeling and
                 embodies domain specific details such as thread
                 scheduling, algorithms used for native CLDC APIs and
                 runtime data structures optimized for use in embedded
                 systems. The results presented include dynamic
                 execution characteristics, dynamic bytecode instruction
                 mix, application and API workload distribution, Object
                 allocation statistics, instruction-set coverage, memory
                 usage statistics and method code and stack frame
                 characteristics.",
  acknowledgement = ack-nhfb,
  affiliation =  "Desai, A (Reprint Author), La Trobe Univ, Bundoora,
                 Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt,
                 La Trobe Univ, Bundoora, Vic 3086, Australia.",
  author-email = "desai@ieee.org",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; application program
                 interfaces; architecture independent characterization;
                 CLDC API; custom built embedded Java virtual machine
                 simulator; data structures; Data structures; Design
                 optimization; dynamic bytecode instruction mix; dynamic
                 execution characteristics; embedded Java workload;
                 Embedded Systems; embedded systems;
                 industry standard GrinderBench benchmark; instruction
                 sets; instruction-set coverage; Java; Java bytecode;
                 Job shop scheduling; JVM; memory usage statistics;
                 method code characteristics; multi-threading; object
                 allocation statistics; Runtime; runtime data structure;
                 scheduling; Scheduling algorithm; stack frame
                 characteristics; Statistical distributions; storage
                 allocation; thread scheduling; virtual machines;
                 Virtual machining; Workload Characterization",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Desai:2009:AIC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Comment letter on GomezRequena:2008:BFT, IEEE CAL 8(1), 2009,
%%% pp. 33--34 (cross-reference preserved in the note field).
%%% NOTE(review): added the missing final period at the end of the
%%% abstract (``... other existing UMINs'').
@Article{Antelo:2009:CBF,
  author =       "Elisardo Antelo",
  title =        "A Comment on {``Beyond Fat-tree: Unidirectional
                 Load-Balanced Multistage Interconnection Network''}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "33--34",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  note =         "See \cite{GomezRequena:2008:BFT}.",
  abstract =     "A recent work proposed to simplify fat-trees with
                 adaptive routing by means of a load-balancing
                 deterministic routing algorithm. The resultant network
                 has performance figures comparable to the more complex
                 adaptive routing fat-trees when packets need to be
                 delivered in order. In a second work by the same
                 authors published in IEEE CAL, they propose to simplify
                 the fat-tree to a unidirectional multistage
                 interconnection network (UMIN), using the same
                 load-balancing deterministic routing algorithm. They
                 show that comparable performance figures are achieved
                 with much lower network complexity. In this comment we
                 show that the proposed load-balancing deterministic
                 routing is in fact the routing scheme used by the
                 butterfly network. Moreover we show that the properties
                 of the simplified UMIN network proposed by them are
                 intrinsic to the standard butterfly and other existing
                 UMINs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Antelo, E (Reprint Author), Univ Santiago de
                 Compostela, Dept Elect \& Comp Sci, Santiago De
                 Compostela, Spain. Univ Santiago de Compostela, Dept
                 Elect \& Comp Sci, Santiago De Compostela, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adaptive routing fat-trees; Bismuth; butterfly
                 network; Computer science; deterministic algorithms;
                 fat-tree; hypercube networks; Interconnection networks;
                 Interconnections (Subsystems); load balancing
                 deterministic routing algorithm; Logic functions;
                 Multiprocessor interconnection networks; Multistage
                 Interconnection networks; network complexity; Network
                 topology; packets; resource allocation; Routing;
                 Switches; Technological innovation; Topology;
                 unidirectional load-balanced multistage interconnection
                 network; unidirectional multistage interconnection
                 network",
  number-of-cited-references = "7",
  ORCID-numbers = "Antelo, Elisardo/0000-0003-3743-3689",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Antelo:2009:CBF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Front/back matter for IEEE CAL volume 8, number 1 (2009):
%%% advertisements, covers, editorial board, and author information.
%%% These minimal records intentionally omit the Web of Science fields
%%% present in the research-article entries.
@Article{Anonymous:2009:Aa,
  author =       "Anonymous",
  title =        "{[Advertisement]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "35--35",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.38",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:AIC,
  author =       "Anonymous",
  title =        "Ad --- {IEEE Computer Society Digital Library}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "36--36",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.39",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:EBCa,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.41",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:FCa,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.40",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:IAa,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.42",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.43",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Editorial front matter introducing the new Editor-in-Chief,
%%% IEEE CAL 8(2), 2009, pp. 37--38.
@Article{Gaudiot:2009:INE,
  author =       "Jean-Luc Gaudiot",
  title =        "Introducing the New {Editor-in-Chief} of
                 {{\booktitle{IEEE Computer Architecture Letters}}}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "37--38",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.60",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Gaudiot:2009:INE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% NOTE(review): the two entries below (Skadron:2009:LE and
%%% Skadron:2009:U) share the same DOI (10.1109/L-CA.2009.61), volume,
%%% number, and pages (39--39), and differ only in title and bibdate --
%%% they appear to be duplicate records for the same one-page item.
%%% Confirm against IEEE Xplore before pruning either key; both keys may
%%% already be cited elsewhere, so removal must be coordinated.
@Article{Skadron:2009:LE,
  author =       "K. Skadron",
  title =        "Letter from the {Editor}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "39--39",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.61",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Skadron:2009:U,
  author =       "Kevin Skadron",
  title =        "Untitled",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "39--39",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.61",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Skadron:2009:U",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, IEEE CAL 8(2), 2009, pp. 40--43; funding-text also
%%% records the manuscript submission/acceptance dates.
@Article{Xin:2009:ELI,
  author =       "Jing Xin and Russ Joseph",
  title =        "Exploiting Locality to Improve Circuit-level Timing
                 Speculation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "40--43",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.50",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Circuit-level timing speculation has been proposed as
                 a technique to reduce dependence on design margins,
                 eliminating power and performance overheads. Recent
                 work has proposed microarchitectural methods to
                 dynamically detect and recover from timing errors in
                 processor logic. This work has not evaluated or
                 exploited the disparity of error rates at the level of
                 static instructions. In this paper, we demonstrate
                 pronounced locality in error rates at the level of
                 static instructions. We propose timing error prediction
                 to dynamically anticipate timing errors at the
                 instruction-level and reduce the costly recovery
                 penalty. This allows us to achieve 43.6\% power savings
                 when compared to a baseline policy and incurs only
                 6.9\% performance penalty.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xin, J (Reprint Author), Northwestern Univ, Evanston,
                 IL 60208 USA. Xin, Jing; Joseph, Russ, Northwestern
                 Univ, Evanston, IL 60208 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0644332, CNS-0720820]",
  funding-text = "Manuscript submitted: 17-Sep-2009. Manuscript
                 accepted: 08-Oct-2009. Final manuscript received:
                 15-Oct-2009. We thank the anonymous reviewers for their
                 constructive feedback. This work was supported by NSF
                 awards CAREER CCF-0644332 and CNS-0720820.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Circuit faults; circuit reliability; circuit-level
                 timing speculation; Costs; Delay; Dynamic voltage
                 scaling; Error analysis; Error locality; Frequency;
                 Hardware; instruction sets; Logic; logic design;
                 low-power design; Low-power design; microarchitectural
                 methods; microprocessor chips; Pipelines; power
                 elimination; processor logic; reliability; Reliability;
                 static instruction level; Testing and Fault-Tolerance;
                 Timing; timing error prediction; timing speculation",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Xin:2009:ELI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Sudarsanam:2009:PPD,
  author =       "Arvind Sudarsanam and Ramachandra Kallam and Aravind
                 Dasu",
  title =        "{PRR--PRR} Dynamic Relocation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "44--47",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.49",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Partial bitstream relocation (PBR) on FPGAs has been
                 gaining attention in recent years as a potentially
                 promising technique to scale parallelism of accelerator
                 architectures at run time, enhance fault tolerance,
                 etc. PBR techniques to date have focused on reading
                 inactive bitstreams stored in memory, on-chip or
                 off-chip, whose contents are generated for a specific
                 partial reconfiguration region (PRR) and modified on
                 demand for configuration into a PRR at a different
                 location. As an alternative, we propose a PRR-PRR
                 relocation technique to generate source and destination
                 addresses, read the bitstream from an active PRR
                 (source) in a non-intrusive manner, and write it to
                 destination PRR. We describe two options of realizing
                 this on Xilinx Virtex 4 FPGAs: (a) hardware-based
                 accelerated relocation circuit (ARC) and (b) a software
                 solution executed on Microblaze. A comparative
                 performance analysis to highlight the speed-up obtained
                 using ARC is presented. For real test cases,
                 performance of our implementations are compared to
                 estimated performances of two state of the art
                 methods.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sudarsanam, A (Reprint Author), Utah State Univ, Dept
                 Elect \& Comp Engn, Logan, UT 84321 USA. Sudarsanam,
                 Arvind; Kallam, Ramachandra; Dasu, Aravind, Utah State
                 Univ, Dept Elect \& Comp Engn, Logan, UT 84321 USA.",
  author-email = "arvind.sudarsanam@aggiemail.usu.edu
                 ramachandra.kallam@aggiemail.usu.edu
                 dasu@engineering.usu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NASA; Micron Research Center",
  funding-text = "Manuscript submitted: 03-Aug-2009. Manuscript
                 accepted: 16-Sep-2009. Final manuscript received:
                 24-Sep-2009. This work was supported by NASA and Micron
                 Research Center.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; Accelerator architectures; accelerator
                 architectures; Bioreactors; Circuits; destination
                 address; Emerging technologies; Fault tolerance; fault
                 tolerance; field programmable gate arrays; Field
                 programmable gate arrays; Filters; FPGAs; Hardware;
                 hardware-based accelerated relocation circuit; parallel
                 architecture; parallel architectures; Parallel
                 processing; partial bitstream relocation; Partial
                 dynamic reconfiguration; Partial dynamic relocation;
                 partial reconfiguration region; PBR techniques;
                 Performance analysis; Performance Analysis and Design
                 Aids; PRR-PRR dynamic relocation technique; PRR-PRR
                 relocation technique; Reconfigurable computing;
                 Reconfigurable hardware; source address; Xilinx Virtex
                 4 FPGA",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Sudarsanam:2009:PPD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Leverich:2009:PMD,
  author =       "Jacob Leverich and Matteo Monchiero and Vanish Talwar
                 and Partha Ranganathan and Christos Kozyrakis",
  title =        "Power Management of Datacenter Workloads Using
                 Per-Core Power Gating",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "48--51",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.46",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "While modern processors offer a wide spectrum of
                 software-controlled power modes, most datacenters only
                 rely on Dynamic Voltage and Frequency Scaling (DVFS,
                 a.k.a. P-states) to achieve energy efficiency. This
                 paper argues that, in the case of datacenter workloads,
                 DVFS is not the only option for processor power
                 management. We make the case for per-core power gating
                 (PCPG) as an additional power management knob for
                 multi-core processors. PCPG is the ability to cut the
                 voltage supply to selected cores, thus reducing to
                 almost zero the leakage power for the gated cores.
                 Using a testbed based on a commercial 4-core chip and a
                 set of real-world application traces from enterprise
                 environments, we have evaluated the potential of PCPG.
                 We show that PCPG can significantly reduce a
                 processor's energy consumption (up to 40\%) without
                 significant performance overheads. When compared to
                 DVFS, PCPG is highly effective saving up to 30\% more
                 energy than DVFS. When DVFS and PCPG operate together
                 they can save up to almost 60\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Leverich, J (Reprint Author), Hewlett Packard Labs,
                 Mississauga, ON, Canada. Leverich, Jacob; Monchiero,
                 Matteo; Talwar, Vanish; Ranganathan, Partha, Hewlett
                 Packard Labs, Mississauga, ON, Canada. Leverich, Jacob;
                 Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
                 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; computer centres; Costs; data
                 center workloads; dynamic voltage and frequency
                 scaling; Dynamic voltage scaling; Energy consumption;
                 energy efficiency; Energy management; Energy-aware
                 systems; enterprise environments; Frequency;
                 integration and modeling; Jacobian matrices; leakage
                 power; microprocessor chips; Multicore processing;
                 multicore processors; per-core power gating; power
                 consumption; Power supplies; processor energy
                 consumption; processor power management;
                 software-controlled power modes; System architectures;
                 Testing",
  number-of-cited-references = "10",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "43",
  unique-id =    "Leverich:2009:PMD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Musoll:2009:PVA,
  author =       "Enric Musoll",
  title =        "A Process-Variation Aware Technique for Tile-Based,
                 Massive Multicore Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "52--55",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.48",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Process variations in advanced nodes introduce
                 significant core-to-core performance differences in
                 single-chip multicore architectures. Isolating each
                 core with its own frequency and voltage island helps
                 improving the performance of the multi-core
                 architecture by operating at the highest frequency
                 possible rather than operating all the cores at the
                 frequency of the slowest core. However, inter-core
                 communication suffers from additional
                 cross-clock-domain latencies that can offset the
                 performance benefits. This work proposes the concept of
                 the configurable, variable-size frequency and voltage
                 domain, and it is described in the context of a
                 tile-based, massive multi-core architecture.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; computer architecture; Context;
                 cross-clock-domain latency; Delay; Frequency; intercore
                 communication; massive multi-core; massive multicore
                 processors; Multi-core/single-chip multiprocessors;
                 multicore architecture; Multicore processing;
                 Network-on-a-chip; network-on-chip; On-chip
                 interconnection networks; Performance gain; Process
                 design; process-variation aware architecture;
                 process-variation aware technique; Runtime; single-chip
                 multicore architectures; tile-base architecture;
                 tile-based multicore processors; variable-size
                 frequency domain; Voltage; voltage domain",
  number-of-cited-references = "5",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Musoll:2009:PVA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Baldassin:2009:CEC,
  author =       "Alexandro Baldassin and Felipe Klein and Guido Araujo
                 and Rodolfo Azevedo and Paulo Centoducatte",
  title =        "Characterizing the Energy Consumption of Software
                 Transactional Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "56--59",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.47",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The well-known drawbacks imposed by lock-based
                 synchronization have forced researchers to devise new
                 alternatives for concurrent execution, of which
                 transactional memory is a promising one. Extensive
                 research has been carried out on Software Transaction
                 Memory (STM), most of all concentrated on program
                 performance, leaving unattended other metrics of great
                 importance like energy consumption. This letter
                 presents a thorough evaluation of energy consumption in
                 a state-of-the-art STM. We show that energy and
                 performance results do not always follow the same trend
                 and, therefore, it might be appropriate to consider
                 different strategies depending on the focus of the
                 optimization. We also introduce a novel strategy based
                 on dynamic voltage and frequency scaling for contention
                 managers, revealing important energy and energy-delay
                 product improvements in high-contended scenarios. This
                 work is a first study towards a better understanding of
                 the energy consumption behavior of STM systems, and
                 could prompt STM designers to research new
                 optimizations in this area, paving the way for an
                 energy-aware transactional memory.",
  acknowledgement = ack-nhfb,
  affiliation =  "Baldassin, A (Reprint Author), Univ Estadual Campinas,
                 Inst Comp, Campinas, SP, Brazil. Baldassin, Alexandro;
                 Klein, Felipe; Araujo, Guido; Azevedo, Rodolfo;
                 Centoducatte, Paulo, Univ Estadual Campinas, Inst Comp,
                 Campinas, SP, Brazil.",
  author-email = "alebal@ic.unicamp.br klein@ic.unicamp.br
                 guido@ic.unicamp.br rodolfo@ic.unicamp.br
                 ducatte@ic.unicamp.br",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "FAPESP [2005/02565-9]",
  funding-text = "Manuscript submitted: 02-Jul-2009. Manuscript
                 accepted: 23-Jul-2009. Final manuscript received:
                 05-Aug-2009. This work was supported in part by FAPESP
                 (2005/02565-9).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Concurrent computing; Concurrent Programming; Content
                 management; Costs; Dynamic voltage scaling; Energy
                 Consumption; Energy consumption; energy consumption;
                 Energy management; Energy-aware systems; energy-delay
                 product improvements; frequency scaling; Frequency
                 synchronization; Hardware; lock-based synchronization;
                 Measurement techniques; Memory management;
                  multiprocessing systems; Multiprocessor Systems;
                  multiprocessor systems;
                 Parallel Architectures; parallel architectures; Power
                 Management; Software performance; software
                 transactional memory; synchronisation; transaction
                 processing; Transactional Memory",
  number-of-cited-references = "13",
  ORCID-numbers = "Azevedo, Rodolfo/0000-0002-8803-0401",
  research-areas = "Computer Science",
  researcherid-numbers = "Azevedo, Rodolfo/F-3008-2012",
  times-cited =  "3",
  unique-id =    "Baldassin:2009:CEC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Balfour:2009:ORE,
  author =       "James Balfour and R. Curtis Harting and William J.
                 Dally",
  title =        "Operand Registers and Explicit Operand Forwarding",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "60--63",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.45",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Operand register files are small, inexpensive register
                 files that are integrated with function units in the
                 execute stage of the pipeline, effectively extending
                 the pipeline operand registers into register files.
                 Explicit operand forwarding lets software
                 opportunistically orchestrate the routing of operands
                 through the forwarding network to avoid writing
                 ephemeral values to registers. Both mechanisms let
                 software capture short-term reuse and locality close to
                 the function units, improving energy efficiency by
                 allowing a significant fraction of operands to be
                 delivered from inexpensive registers that are
                 integrated with the function units. An evaluation shows
                 that capturing operand bandwidth close to the function
                 units allows operand registers to reduce the energy
                 consumed in the register files and forwarding network
                 of an embedded processor by 61\%, and allows explicit
                 forwarding to reduce the energy consumed by 26\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Balfour, J (Reprint Author), Stanford Univ, Comp Syst
                 Lab, Stanford, CA 94305 USA. Balfour, James; Harting,
                 R. Curtis; Dally, William J., Stanford Univ, Comp Syst
                 Lab, Stanford, CA 94305 USA.",
  author-email = "jbalfour@cva.stanford.edu dally@cva.stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Code generation; Computer aided
                  instruction; Computer System Implementation; Computer
                  Systems Organization; embedded processor; Energy capture;
                 energy consumption; energy efficient register
                 organization; explicit operand forwarding; explicit
                 operand forwarding network; Fixed-point arithmetic;
                 impact of technology trends; Impact of VLSI on system
                 design; Laboratories; Logic; low-power programmable
                 processors; Memory hierarchy; microprocessor chips;
                 operand bandwidth; operand register files; operand
                 registers; Optimization; Physically aware
                 micro-architecture: power; Pipelines; Real-time and
                 embedded systems; Registers; Routing; software
                 reusability; thermal; VLSI Systems; Writing",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Balfour:2009:ORE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Chiou:2009:AFF,
  author =       "Derek Chiou and Hari Angepat and Nikhil A. Patil and
                 Dam Sunwoo",
  title =        "Accurate Functional-First Multicore Simulators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "64--67",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.44",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Fast and accurate simulation of multicore systems
                 requires a parallelized simulator. This paper describes
                 a novel method to build parallelizable and
                 cycle-accurate-capable functional-first simulators of
                 multicore targets.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chiou, D (Reprint Author), Univ Texas Austin, Dept
                 Elect \& Comp Engn, Austin, TX 78712 USA. Chiou, Derek;
                 Angepat, Hari; Patil, Nikhil A.; Sunwoo, Dam, Univ
                 Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712
                 USA.",
  author-email = "derek@ece.utexas.edu angepat@ece.utexas.edu
                 npatil@ece.utexas.edu sunwoo@ece.utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [0615352,
                 0747438]",
  funding-text = "This material is based upon work supported by the
                 National Science Foundation under Grants No. 0615352
                 and No. 0747438 and gifts from Intel and IBM. We thank
                 the anonymous reviewers for their comments.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "circuit simulation; Computational modeling; Computer
                 simulation; field programmable gate arrays;
                 FPGA-accelerated simulation technologies;
                 functional-first multicore simulators; Instruction
                 sets; integration and modeling; Microarchitecture;
                 Modeling and Visualization; Modeling of computer
                 architecture; Modeling techniques;
                 Multi-core/single-chip multiprocessors; Multicore
                 processing; multicore system simulation; Parallel;
                 Parallel Architectures; parallelized simulator;
                 Performance Analysis and Design Aids; Predictive
                 models; Simulation; Software prototyping; System
                 architectures; Timing; Virtual machining; Virtual
                 prototyping",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Chiou:2009:AFF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2009:Ab,
  author =       "Anonymous",
  title =        "{[Advertisement]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "68--68",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.52",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Ac,
  author =       "Anonymous",
  title =        "{[Advertisement]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "69--69",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.53",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Ad,
  author =       "Anonymous",
  title =        "{[Advertisement]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "70--70",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.55",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Ae,
  author =       "Anonymous",
  title =        "{[Advertisement]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "71--71",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.54",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Af,
  author =       "Anonymous",
  title =        "{[Advertisement]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "72--72",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.51",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:EBCb,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.57",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:FCb,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c1--c1",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.56",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:IAb,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.58",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:ICSb,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.59",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Patil:2010:URT,
  author =       "Shruti Patil and David J. Lilja",
  title =        "Using Resampling Techniques to Compute Confidence
                 Intervals for the Harmonic Mean of Rate-Based
                 Performance Metrics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Rate-based metrics such as floating point operations
                 per second, instructions per cycle and so forth are
                 commonly used to measure computer performance. In
                 addition to the average or mean performance of the
                 metric, indicating the precision of the mean using
                 confidence intervals helps to make informed decisions
                 and comparisons with the data. In this paper, we
                 discuss the determination of confidence intervals for
                  the harmonic mean of rate-based metrics using two
                  statistical resampling techniques, Jackknife and
                 Bootstrap. We show using Monte Carlo simulations that
                 resampling indeed works as expected, and can be used
                 for generating confidence intervals for harmonic
                 mean.",
  acknowledgement = ack-nhfb,
  affiliation =  "Patil, S (Reprint Author), Univ Minnesota Twin Cities,
                 Dept Elect \& Comp Engn, St Paul, MN USA. Patil,
                 Shruti; Lilja, David J., Univ Minnesota Twin Cities,
                 Dept Elect \& Comp Engn, St Paul, MN USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-0541162]",
  funding-text = "This work was supported in part by the National
                 Science Foundation grant no. CCF-0541162. Any opinions,
                 findings and conclusions or recommendations expressed
                 in this material are those of the authors and do not
                 necessarily reflect the views of the NSF. The authors
                 also thank the University of Minnesota Statistical
                 Consulting Service for their helpful insights.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arithmetic; bootstrap; bootstrap technique; Cities and
                 towns; Computer errors; Computer performance; computer
                 performance measurement; Confidence intervals;
                 confidence intervals; Electric variables measurement;
                 Equations; floating point operations; Harmonic
                 analysis; harmonic mean; jackknife; jackknife
                 technique; Monte Carlo methods; Monte Carlo
                 simulations; Nonparametric statistics; Performance
                 analysis; performance evaluation; Performance of
                 Systems; Probability distribution; rate-based
                 performance metrics; resampling; statistical analysis;
                 statistical resampling techniques; Statistics",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Patil:2010:URT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Seznec:2010:PCM,
  author =       "Andre Seznec",
  title =        "A Phase Change Memory as a Secure Main Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/prng.bib",
  abstract =     "Phase change memory (PCM) technology appears as more
                 scalable than DRAM technology. As PCM exhibits access
                 time slightly longer but in the same range as DRAMs,
                 several recent studies have proposed to use PCMs for
                 designing main memory systems. Unfortunately PCM
                 technology suffers from a limited write endurance;
                 typically each memory cell can only be written a
                 large but still limited number of times ($10^7$ to $10^9$
                 writes are reported for current technology). Till now,
                 research proposals have essentially focused their
                 attention on designing memory systems that will survive
                 to the average behavior of conventional applications.
                 However PCM memory systems should be designed to
                 survive worst-case applications, i.e., malicious
                 attacks targeting the physical destruction of the
                 memory through overwriting a limited number of memory
                 cells.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seznec, A (Reprint Author), INRIA Rennes Bretagne
                 Atlantique, Ctr Rech, Campus Beaulieu, F-35042 Rennes,
                 France. INRIA Rennes Bretagne Atlantique, Ctr Rech,
                 F-35042 Rennes, France.",
  author-email = "seznec@irisa.fr",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "European Commission [27648]",
  funding-text = "This work was partially supported by the European
                 Commission in the context of the SARC integrated
                 project \#27648 (FP6).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; DRAM technology; Energy
                 consumption; memory cells; Memory Structures; PCM
                 memory systems; Phase change materials; phase change
                 memories; phase change memory; Phase change memory;
                 Physics computing; Proposals; Random access memory;
                 Random number generation; Random processes;
                 Scalability; secure PCM-based main memory;
                 Semiconductor Memories",
  keywords-plus = "TECHNOLOGY",
  number-of-cited-references = "8",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "17",
  unique-id =    "Seznec:2010:PCM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Park:2010:EIP,
  author =       "Seon-yeong Park and Euiseong Seo and Ji-Yong Shin and
                 Seungryoul Maeng and Joonwon Lee",
  title =        "Exploiting Internal Parallelism of Flash-based
                 {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "For the last few years, the major driving force behind
                 the rapid performance improvement of SSDs has been the
                 increment of parallel bus channels between a flash
                 controller and flash memory packages inside the
                 solid-state drives (SSDs). However, there are other
                 internal parallelisms inside SSDs yet to be explored.
                 In order to improve performance further by utilizing
                 the parallelism, this paper suggests request
                 rescheduling and dynamic write request mapping.
                 Simulation results with real workloads have shown that
                 the suggested schemes improve the performance of the
                 SSDs by up to 15\% without any additional hardware
                 support.",
  acknowledgement = ack-nhfb,
  affiliation =  "Park, SY (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Taejon, South Korea. Park, Seon-yeong; Shin,
                 Ji-Yong; Maeng, Seungryoul, Korea Adv Inst Sci \&
                 Technol, Taejon, South Korea. Seo, Euiseong, Ulsan Natl
                 Inst Sci \& Technol, Ulsan, South Korea. Lee, Joonwon,
                 Sungkyunkwan Univ, Seoul, South Korea.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Korea government(MEST) [2009-0080381]",
  funding-text = "This work was supported by the Korea Science and
                 Engineering Foundation (KOSEF) grant funded by the
                 Korea government (MEST), (No. 2009-080381)",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Delay; Drives; exploiting internal parallelism; flash
                 based SSD; flash controller; flash memories; Flash
                 memory; flash memory packages; Force control; Hard
                 disks; I/O scheduling; Input/Output Devices; Packaging;
                 parallel bus channels; parallel processing; Parallel
                 systems; parallelism; pipeline processing; Pipeline
                 processing; Secondary storage; Simulation; Solid state
                 circuits; solid state drives; Solid-State Drives
                 (SSDs); Space technology; Storage Management; system
                 buses; Throughput",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
  times-cited =  "35",
  unique-id =    "Park:2010:EIP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Subramoni:2010:ISI,
  author =       "Hari Subramoni and Fabrizio Petrini and Virat Agarwal
                 and Davide Pasetto",
  title =        "Intra-Socket and Inter-Socket Communication in
                 Multi-core Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The increasing computational and communication demands
                 of the scientific and industrial communities require a
                 clear understanding of the performance trade-offs
                 involved in multi-core computing platforms. Such
                 analysis can help application and toolkit developers in
                 designing better, topology aware, communication
                 primitives intended to suit the needs of various high
                 end computing applications. In this paper, we take on
                 the challenge of designing and implementing a portable
                 intra-core communication framework for streaming
                 computing and evaluate its performance on some popular
                 multi-core architectures developed by Intel, AMD and
                 Sun. Our experimental results, obtained on the Intel
                 Nehalem, AMD Opteron and Sun Niagara 2 platforms, show
                 that we are able to achieve an intra-socket small
                 message latency between 120 and 271 nanoseconds, while
                 the inter-socket small message latency is between 218
                 and 320 nanoseconds. The maximum intra-socket
                 communication bandwidth ranges from 0.179 (Sun Niagara
                 2) to 6.5 (Intel Nehalem) Gbytes/second. We were also
                 able to obtain an inter-socket communication
                 performance of 1.2 and 6.6 Gbytes/second on the AMD
                 Opteron and Intel Nehalem, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Subramoni, H (Reprint Author), IBM TJ Watson, Yorktown
                 Hts, NY 10598 USA. Subramoni, Hari; Petrini, Fabrizio;
                 Agarwal, Virat, IBM TJ Watson, Yorktown Hts, NY 10598
                 USA. Pasetto, Davide, IBM Computat Sci Ctr, Dublin,
                 Ireland. Subramoni, Hari, Ohio State Univ, Columbus, OH
                 43210 USA.",
  author-email = "subramon@cse.ohio-state.edu fpetrin@us.ibm.com
                 viratagarwal@us.ibm.com pasetto\_davide@ie.ibm.com",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AMD Opteron; Bandwidth; Communication industry;
                 communication primitives; Communication Protocols;
                 Computer applications; Computer architecture; Computer
                 industry; Delay; General; Hardware; High Performance
                 Computing; industrial communities; Intel Nehalem;
                 intersocket communication; Intrasocket communication;
                 multicore architectures; Multicore Processors;
                 multicore systems; multiprocessing systems; parallel
                 architectures; Performance of Systems; Portable
                 computers; streaming computing; Sun; toolkit
                 developers; Topology; topology aware",
  keywords-plus = "NETWORK",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Subramoni:2010:ISI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hoang:2010:CAN,
  author =       "Giang Hoang and Chang Bae and John Lange and Lide
                 Zhang and Peter Dinda and Russ Joseph",
  title =        "A Case for Alternative Nested Paging Models for
                 Virtualized Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Address translation often emerges as a critical
                 performance bottleneck for virtualized systems and has
                 recently been the impetus for hardware paging
                 mechanisms. These mechanisms apply similar translation
                 models for both guest and host address translations. We
                 make an important observation that the model employed
                 to translate from guest physical addresses (GPAs) to
                 host physical addresses (HPAs) is in fact orthogonal to
                 the model used to translate guest virtual addresses
                 (GVAs) to GPAs. Changing this model requires VMM
                 cooperation, but has no implications for guest OS
                 compatibility. As an example, we consider a hashed page
                 table approach for GPA $\rightarrow$ HPA translation.
                 Nested paging, widely considered the most promising
                 approach, uses unhashed multi-level forward page tables
                 for both GVA $\rightarrow$ GPA and GPA $\rightarrow$
                 HPA translations, resulting in a potential $O(n^2)$
                 page walk cost on a TLB miss, for $n$-level page
                 tables. In contrast, the hashed page table
                 approach results in an expected $O(n)$ cost. Our
                 simulation results show that when a hashed page table
                 is used in the nested level, the performance of the
                 memory system is not worse, and sometimes even better
                 than a nested forward-mapped page table due to reduced
                 page walks and cache pressure. This showcases the
                 potential for alternative paging mechanisms.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hoang, GA (Reprint Author), Northwestern Univ,
                 Evanston, IL 60208 USA. Hoang, Giang; Bae, Chang;
                 Lange, John; Dinda, Peter; Joseph, Russ, Northwestern
                 Univ, Evanston, IL 60208 USA. Zhang, Lide, Univ
                 Michigan, Ann Arbor, MI 48109 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address translation; Computer Architecture; Computer
                 architecture; Computer Architecture; Computer displays;
                 Control systems; Costs; Emerging technologies; file
                 organisation; guest physical addresses; guest virtual
                 addresses; Hardware; hardware paging mechanisms;
                 Hardware/software interfaces; host physical addresses;
                 Instruction sets; Nested Paging; nested paging models;
                 Operating systems; OS compatibility; paged storage;
                 Platform virtualization; Software performance; storage
                 allocation; unhashed multilevel forward page tables;
                 virtual machine monitors; Virtual machine monitors;
                 virtual machines; Virtual Memory; Virtualization;
                 virtualized systems; VMM cooperation",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  researcherid-numbers = "Joseph, Russell/B-7230-2009 Dinda,
                 Peter/B-7142-2009",
  times-cited =  "5",
  unique-id =    "Hoang:2010:CAN",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Krimer:2010:SNT,
  author =       "Evgeni Krimer and Robert Pawlowski and Mattan Erez and
                 Patrick Chiang",
  title =        "{Synctium}: a Near-Threshold Stream Processor for
                 Energy-Constrained Parallel Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "While Moore's law scaling continues to double
                 transistor density every technology generation, supply
                 voltage reduction has essentially stopped, increasing
                 both power density and total energy consumed in
                 conventional microprocessors. Therefore, future
                 processors will require an architecture that can: (a)
                 take advantage of the massive amount of transistors
                 that will be available; and (b) operate these
                 transistors in the near-threshold supply domain,
                 thereby achieving near optimal energy/computation by
                 balancing the leakage and dynamic energy consumption.
                 Unfortunately, this optimality is typically achieved
                 while running at very low frequencies (i.e.,
                 0.1--10 MHz) and with only one computation executing per
                 cycle, such that performance is limited. Further,
                 near-threshold designs suffer from severe process
                 variability that can introduce extremely large delay
                 variations. In this paper, we propose a near
                 energy-optimal, stream processor family that relies on
                 massively parallel, near-threshold VLSI circuits and
                 interconnect, incorporating cooperative
                 circuit/architecture techniques to tolerate the
                 expected large delay variations. Initial estimations
                 from circuit simulations show that it is possible to
                 achieve greater than 1 Giga-Operations per second
                 (1GOP/s) with less than 1mW total power consumption,
                 enabling a new class of energy-constrained,
                 high-throughput computing applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Krimer, E (Reprint Author), UT Austin, ECE, Austin, TX
                 USA. Krimer, Evgeni; Erez, Mattan, UT Austin, ECE,
                 Austin, TX USA. Pawlowski, Robert; Chiang, Patrick,
                 Oregon State Univ, EECS, Corvallis, OR 97331 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Circuits; Computer architecture; conventional
                 microprocessors; Delay; double transistor density;
                 dynamic energy consumption; energy constrained parallel
                 applications; Energy consumption; etc.; Frequency;
                 impact of technology trends; Low-power design;
                 Microprocessors; Mobile processors; Moore's Law; near
                 threshold stream processor; optimisation; parallel
                 programming; Physically aware micro-architecture:
                 power; pipeline processing; Power generation; SIMD
                 processors; supply voltage reduction; Synctium;
                 thermal; Very large scale integration; VLSI circuits;
                 Voltage",
  keywords-plus = "CIRCUITS; TOLERANCE; CMOS",
  number-of-cited-references = "19",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "22",
  unique-id =    "Krimer:2010:SNT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hilton:2010:SDE,
  author =       "Andrew Hilton and Amir Roth",
  title =        "{SMT-Directory}: Efficient Load-Load Ordering for
                 {SMT}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Memory models like SC, TSO, and PC enforce load-load
                 ordering, requiring that loads from any single thread
                 appear to occur in program order to all other threads.
                 Out-of-order execution can violate load-load ordering.
                 Conventional multi-processors with out-of-order cores
                 detect load-load ordering violations by snooping an
                 age-ordered load queue on cache invalidations or
                 evictions---events that act as proxies for the completion
                 of remote stores. This mechanism becomes less efficient
                 in an SMT processor, as every completing store must
                 search the loads queue segments of all other threads.
                 This inefficiency exists because store completions from
                 other threads in the same core are not filtered by the
                 cache and coherence protocol: thread 0 observes all of
                 thread 1's stores, not only the first store to every
                 cache line. SMT-Directory eliminates this overhead by
                 implementing the filtering traditionally provided by
                 the cache in the cache itself. SMT-Directory adds a
                 per-thread ``read'' bit to every data cache line. When
                 a load executes, it sets the bit corresponding to its
                 thread. When a store completes and writes to the cache,
                 it checks the SMT-Directory bits of its cache line and
                 searches the load queue segments only of those threads
                 whose bits are set. As a result, local store
                 completions trigger searches only for data that is
                 actually shared.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hilton, A (Reprint Author), Univ Penn, Philadelphia,
                 PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn,
                 Philadelphia, PA 19104 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0541292]",
  funding-text = "We thank Arun Raghavan for the address traces and Milo
                 Martin for comments on early versions of this work. The
                 anonymous reviewers provided valuable feedback. This
                 work was supported by NSF award CCF-0541292.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "age-ordered load queue; Buffer storage; cache
                 invalidations; cache protocol; cache storage; coherence
                 protocol; consistency models; data cache line;
                 directory; Filtering; Load modeling; load queue search;
                 load queue segments; load-load ordering; Memory
                 hierarchy; multi-threading; multiprocessing systems;
                 Multithreaded processors; Multithreading; Out of order;
                 Protocols; Read-write memory; Simultaneous
                 multithreading; SMT processor; Surface-mount
                 technology; Writing",
  keywords-plus = "CONSISTENCY",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hilton:2010:SDE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hammoud:2010:DPA,
  author =       "Mohammad Hammoud and Sangyeun Cho and Rami G. Melhem",
  title =        "A Dynamic Pressure-Aware Associative Placement
                 Strategy for Large Scale Chip Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes dynamic pressure-aware
                 associative placement (DPAP), a novel distributed cache
                 management scheme for large-scale chip multiprocessors.
                 Our work is motivated by the large non-uniform
                 distribution of memory accesses across cache sets in
                 different L2 banks. DPAP decouples the physical
                 locations of cache blocks from their addresses for the
                 sake of reducing misses caused by destructive
                 interferences. Temporal pressure at the on-chip
                 last-level cache, is continuously collected at a group
                 (comprised of local cache sets) granularity, and
                 periodically recorded at the memory controller(s) to
                 guide the placement process. An incoming block is
                 consequently placed at a cache group that exhibits the
                 minimum pressure. Simulation results using a
                 full-system simulator demonstrate that DPAP outperforms
                 the baseline shared NUCA scheme by an average of 8.3\%
                 and by as much as 18.9\% for the benchmark programs we
                 examined. Furthermore, evaluations showed that DPAP
                 outperforms related cache designs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hammoud, M (Reprint Author), Univ Pittsburgh, Dept
                 Comp Sci, Pittsburgh, PA 15260 USA. Hammoud, Mohammad;
                 Cho, Sangyeun; Melhem, Rami G., Univ Pittsburgh, Dept
                 Comp Sci, Pittsburgh, PA 15260 USA.",
  author-email = "mhh@cs.pitt.edu cho@cs.pitt.edu melhem@cs.pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0952273]",
  funding-text = "This work was supported in part by NSF grant
                 CCF-0952273.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Aggregate Cache Sets; Aggregates; Associative
                 Placement; cache storage; Chip Multiprocessors;
                 Computer architecture; Computer science; destructive
                 interferences; distributed cache management; DPAP;
                 dynamic pressure aware associative placement strategy;
                 Interference; large scale chip multiprocessors;
                 Large-scale systems; Local Cache Sets; memory access
                 distribution; memory controllers; microprocessor chips;
                 Network-on-a-chip; NUCA scheme; Pressure control;
                 Pressure-Aware Placement; Random access memory",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Hammoud:2010:DPA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2010:LUC,
  author =       "Hyungjun Kim and Paul V. Gratz",
  title =        "Leveraging Unused Cache Block Words to Reduce Power in
                 {CMP} Interconnect",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power is of paramount importance in modern computer
                 system design. In particular, the cache interconnect in
                 future CMP designs is projected to consume up to half
                 of the system power for cache fills and spills [8].
                 Despite the power consumed by spills and fills, a
                 significant percentage of each cache line is unused
                 prior to eviction from the cache. If unused cache block
                 words can be identified, this information can be used
                 to improve CMP interconnect power and energy
                 consumption. We propose a new method of CMP
                 interconnect packet composition, leveraging unused data
                 to reduce power. These methods are well suited to
                 interconnection networks with high-bandwidth wires, and
                 do not require expensive multi-ported memory systems.
                 Assuming perfect prediction, our techniques achieve an
                 average of $\sim$37\% savings in total dynamic
                 link power consumption. With our current best
                 prediction mechanism, our techniques reduce dynamic
                 power consumption by $\sim$23\% on average.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, H (Reprint Author), Texas A\&M Univ, Dept Elect
                 \& Comp Engn, College Stn, TX 77843 USA. Kim, Hyungjun;
                 Gratz, Paul V., Texas A\&M Univ, Dept Elect \& Comp
                 Engn, College Stn, TX 77843 USA.",
  author-email = "hyungjuk@tamu.edu pgratz@tamu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; cache fills; cache interconnect; Cache
                 memories; cache spills; cache storage; CMP
                 interconnect; computer system design; Delay; dynamic
                 power; Energy consumption; energy consumption; flit
                 encoding; integrated circuit design; Interconnection
                 architectures; Low-power design; memory system;
                 microprocessor chips; Multicore; Multiprocessor
                 interconnection networks; Network-on-a-chip; NoC; power
                 aware computing; Power engineering computing; power
                 reduction; Power system interconnection; Random access
                 memory; total dynamic link power consumption; unused
                 cache block words; Very large scale integration;
                 Wires",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2010:LUC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2010:EBCa,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:FCa,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:IAa,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Skadron:2010:ELE,
  author =       "K. Skadron",
  title =        "Editorial: Letter from the {Editor-in-Chief}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "37--44",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Reviewer note: this entry appears to duplicate Skadron:2010:ELE
%%% above (identical DOI 10.1109/L-CA.2010.27, volume, number, and
%%% pages); retained under both keys pending verification against
%%% IEEE Xplore.
@Article{Skadron:2010:U,
  author =       "Kevin Skadron",
  title =        "Untitled",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "37--44",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Skadron:2010:U",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Iqbal:2010:POS,
  author =       "Syed Muhammad Zeeshan Iqbal and Yuchen Liang and Hakan
                 Grahn",
  title =        "{ParMiBench} --- an Open-Source Benchmark for Embedded
                 Multiprocessor Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "45--48",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Multicore processors are the main computing platform
                 in laptops, desktop, and servers today, and are making
                 their way into the embedded systems market also. Using
                 benchmarks is a common approach to evaluate the
                 performance of a system. However, benchmarks for
                 embedded systems have so far been either targeted for a
                 uni-processor environment, e.g., MiBench, or have been
                 commercial, e.g., MultiBench by EEMBC. In this paper,
                 we propose and implement an open source benchmark,
                 ParMiBench, targeted for multiprocessor-based embedded
                 systems. ParMiBench consists of parallel
                 implementations of seven compute intensive algorithms
                 from the uni-processor benchmark suite MiBench. The
                 applications are selected from four domains: Automation
                 and Industry Control, Network, Office, and Security.",
  acknowledgement = ack-nhfb,
  affiliation =  "Iqbal, SMZ (Reprint Author), Blekinge Inst Technol,
                 Sch Comp, SE-37179 Karlskrona, Sweden. Iqbal, Syed
                 Muhammad Zeeshan; Liang, Yuchen; Grahn, Hakan, Blekinge
                 Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden.",
  author-email = "mzeeshan01@gmail.com yuchen9760@gmail.com
                 hakan.grahn@bth.se",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "benchmark testing; Benchmark testing; Concurrent
                 Programming; desktop; embedded multiprocessor system;
                 Embedded system; embedded system market; embedded
                 systems; intensive algorithm; laptop; Load management;
                 Multicore processing; multiprocessing systems;
                 Multiprocessor Systems; open-source benchmark; parallel
                 architectures; parallel implementation; ParMiBench;
                  Performance Evaluation; Performance evaluation;
                  Program processors; public
                 domain software; Security; uniprocessor benchmark
                 suite",
  number-of-cited-references = "9",
  ORCID-numbers = "Grahn, Hakan/0000-0001-9947-1088",
  research-areas = "Computer Science",
  researcherid-numbers = "Grahn, Hakan/G-9720-2011",
  times-cited =  "32",
  unique-id =    "Iqbal:2010:POS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Fang:2010:BRP,
  author =       "Zhen Fang and Erik G. Hallnor and Bin Li and Michael
                 Leddige and Donglai Dai and Seung Eun Lee and Srihari
                 Makineni and Ravi Iyer",
  title =        "{Boomerang}: Reducing Power Consumption of Response
                 Packets in {NoCs} with Minimal Performance Impact",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Most power reduction mechanisms for NoC channel
                 buffers rely on on-demand wakeup to transition from a
                 low-power state to the active state. Two drawbacks of
                 on-demand wakeup limit its effectiveness: (1)
                 performance impact caused by wakeup delays, and (2)
                 energy and area cost of sleep circuitry itself. What
                 makes the problem harder to solve is that solutions to
                 either problem tend to exacerbate the other. For
                 example, faster wakeup from a power-gated state
                 requires greater charge/discharge current for the sleep
                 transistors while using nimbler sleep transistors
                 implies long wakeup delays. As a result, powerdowns
                 have to be conservatively prescribed, missing many
                 power-saving opportunities. We propose Boomerang, a
                 novel power-saving method that overcomes the above
                 drawbacks. Specifically, based on the observation that
                 a response is always preceded by a request, we let the
                 request trigger wakeup of the buffer that is to be used
                  by its response in the (near) future, instead of using
                 on-demand wakeups. Hiding the wakeup delay completely,
                 Boomerang allows us to employ aggressive sleep policies
                 and use low-cost power gating circuits on response
                 buffers.",
  acknowledgement = ack-nhfb,
  affiliation =  "Fang, Z (Reprint Author), Intel Corp, Santa Clara, CA
                 95051 USA. Fang, Zhen; Hallnor, Erik G.; Li, Bin;
                 Leddige, Michael; Dai, Donglai; Makineni, Srihari;
                 Iyer, Ravi, Intel Corp, Santa Clara, CA 95051 USA. Lee,
                 Seung Eun, Seoul Natl Univ Sci \& Technol, Seoul, South
                 Korea.",
  author-email = "zhen.fang@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Boomerang; buffer circuits; charge-discharge current;
                 Delay; Interconnection networks; Leakage currents;
                 leakage power; low-cost power gating circuits;
                 low-power design; Mobile communication;
                 network-on-chip; nimbler sleep transistors; NoC channel
                 buffers; packet-switching networks; power aware
                 computing; power consumption reduction mechanism;
                 power-gated state; power-saving method; response
                 packets; Routing; Switches; System-on-a-chip;
                 Transistors; wakeup delay",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Fang:2010:BRP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lyons:2010:ASF,
  author =       "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
                 and David Brooks",
  title =        "The Accelerator Store framework for high-performance,
                 low-power accelerator-based systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "53--56",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware acceleration can increase performance and
                 reduce energy consumption. To maximize these benefits,
                 accelerator-based systems that emphasize computation on
                 accelerators (rather than on general purpose cores)
                 should be used. We introduce the ``accelerator store,''
                 a structure for sharing memory between accelerators in
                 these accelerator-based systems. The accelerator store
                 simplifies accelerator I/O and reduces area by mapping
                 memory to accelerators when needed at runtime.
                 Preliminary results demonstrate a 30\% system area
                 reduction with no energy overhead and less than 1\%
                 performance overhead in contrast to conventional DMA
                 schemes.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lyons, MJ (Reprint Author), Harvard Univ, Sch Engn \&
                 Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael J.;
                 Brooks, David, Harvard Univ, Sch Engn \& Appl Sci,
                 Cambridge, MA 02138 USA.",
  author-email = "mjlyons@eecs.harvard.edu mhempstead@coe.drexel.edu
                 guyeon@eecs.harvard.edu dbrooks@eecs.harvard.edu",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [IIS-0926148];
                 Gigascale Systems Research Center",
  funding-text = "This material is based upon work supported by the
                 National Science Foundation under Grant No.
                 IIS-0926148. The authors acknowledge the support of the
                 Gigascale Systems Research Center, one of six research
                 centers funded under the Focus Center Research Program
                 (FCRP), a Semiconductor Research Corporation entity.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; accelerator store framework; energy
                 consumption; General; hardware acceleration;
                 Heterogeneous (hybrid) systems; high-performance
                 low-power accelerator-based system; low-power
                 electronics; memory architecture; Memory management;
                 memory mapping; memory sharing; Program processors;
                 Random access memory; Real time systems; Real-time and
                 embedded systems; shared memory systems; storage
                 management; Throughput; Transform coding",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "13",
  unique-id =    "Lyons:2010:ASF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Manevich:2010:CAR,
  author =       "Ran Manevich and Israel Cidon and Avinoam Kolodny and
                 Isask'har Walter",
  title =        "Centralized Adaptive Routing for {NoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "57--60",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As the number of applications and programmable units
                 in CMPs and MPSoCs increases, the Network-on-Chip (NoC)
                 encounters diverse and time dependent traffic loads.
                 This trend motivates the introduction of NoC
                 load-balanced, adaptive routing mechanisms that achieve
                 higher throughput as compared with traditional
                 oblivious routing schemes that are perceived better
                 suited for hardware implementations. However, an
                 efficient adaptive routing scheme should base its
                 decisions on the global state of the system rather than
                 on local or regional congestion signals as is common in
                 current adaptive routing schemes. In this paper we
                 introduce a novel paradigm of NoC centralized adaptive
                 routing, and a specific design for mesh topology. Our
                 scheme continuously monitors the global traffic load in
                 the network and modifies the routing of packets to
                 improve load balancing accordingly. In our specific
                 mesh-based design, XY or YX routes are adaptively
                 selected for each source-destination pair. We show that
                 while our implementation is scalable and lightweight in
                 hardware costs, it outperforms distributed adaptive
                 routing schemes in terms of load balancing and
                 throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Manevich, R (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Manevich, Ran; Cidon, Israel; Kolodny, Avinoam; Walter,
                 Isask'har, Technion Israel Inst Technol, Dept Elect
                 Engn, IL-32000 Haifa, Israel.",
  author-email = "ranman@tx.technion.ac.il cidon@ee.technion.ac.il
                 kolodny@ee.technion.ac.il zigi@tx.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adaptive routing; Adaptive systems; centralized
                 adaptive routing; Computer architecture; distributed
                 adaptive routing; global state; load balanced adaptive
                 routing; load balancing; Load control; Load management;
                 mesh based design; mesh topology; network on chip;
                 Network on Chip; network routing; Network-on-Chip;
                 network-on-chip; NoC; packet routing; programmable
                 unit; regional congestion signal; routing algorithms;
                 Routing protocols; Telecommunication traffic;
                 Throughput; time dependent traffic load",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Manevich:2010:CAR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhang:2010:FCA,
  author =       "Meng Zhang and Alvin R. Lebeck and Daniel J. Sorin",
  title =        "Fractal Consistency: Architecting the Memory System to
                 Facilitate Verification",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "61--64",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "One of the most challenging problems in developing a
                 multicore processor is verifying that the design is
                 correct, and one of the most difficult aspects of
                 pre-silicon verification is verifying that the memory
                 system obeys the architecture's specified memory
                 consistency model. To simplify the process of
                 pre-silicon design verification, we propose a system
                 model called the Fractally Consistent Model (FCM). We
                 prove that systems that adhere to the FCM can be
                 verified to obey the memory consistency model in three
                 simple, scalable steps. The procedure for verifying FCM
                 systems contrasts sharply with the difficult,
                 non-scalable procedure required to verify non-FCM
                 systems. We show that FCM systems do not necessarily
                 sacrifice performance, compared to non-FCM systems,
                 despite being simpler to verify.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhang, M (Reprint Author), Duke Univ, Dept Elect \&
                 Comp Engn, Durham, NC 27706 USA. Zhang, Meng; Sorin,
                 Daniel J., Duke Univ, Dept Elect \& Comp Engn, Durham,
                 NC 27706 USA. Lebeck, Alvin R., Duke Univ, Dept Comp
                 Sci, Durham, NC 27706 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-0702434,
                 CCF-0811290]",
  funding-text = "This material is based upon work supported by the
                 National Science Foundation under grants CCF-0702434
                 and CCF-0811290.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arithmetic and Logic Structures; Coherence;
                 Computational modeling; Computer architecture; Computer
                 Reliability; Fault-Tolerance; FCM systems; Formal
                 verification; fractal consistent model; Fractals;
                 Hardware; Memory; memory architecture; Memory
                 Consistency; memory consistency model; Memory
                 hierarchy; memory system architecture;
                 Micro-architecture implementation considerations;
                 microprocessor chips; Multicore; multicore processor;
                 multiprocessing systems; Performance Analysis and
                 Design Aids; presilicon verification; Processor
                 Architectures; Protocols; Testing; Validation;
                 Verification",
  number-of-cited-references = "10",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Zhang:2010:FCA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2010:AIT,
  author =       "Anonymous",
  title =        "Advertisement --- {{\booktitle{IEEE Transactions on
                 Computers}}} Celebrates 60 Years",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "65--65",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.29",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSb,
  author =       "Anonymous",
  title =        "2011 {IEEE Computer Society} Simulator Design
                 Competition",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "66--66",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ASS,
  author =       "Anonymous",
  title =        "Advertisement --- Special Student Offer",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "67--67",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.30",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ADY,
  author =       "Anonymous",
  title =        "Advertisement --- Distinguish Yourself With the
                 {CSDP}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "68--68",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:CPS,
  author =       "Anonymous",
  title =        "{Conference Proceedings Services (CPS)}
                 [advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "69--69",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSc,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} Jobs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "70--70",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.28",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ASC,
  author =       "Anonymous",
  title =        "Advertisement --- Stay Connected to the {IEEE Computer
                 Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "71--71",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.31",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ACS,
  author =       "Anonymous",
  title =        "Advertisement --- {Computer Society Digital Library}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "72--72",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:EBCb,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:FCb,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c1--c1",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:IAb,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSd,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Volume 10, number 1 (January/June 2011) begins here.
%%% NOTE(review): this entry and Skadron:2011:U carry the same DOI
%%% (10.1109/L-CA.2011.13), volume, number, and pages (1--3); they
%%% appear to be duplicate records of the same editorial --- confirm
%%% against the publisher record and merge under a single key.
@Article{Skadron:2011:ELE,
  author =       "K. Skadron",
  title =        "Editorial: Letter from the {Editor-in-Chief}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "1--3",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% NOTE(review): same DOI (10.1109/L-CA.2011.13), volume, number, and
%%% pages (1--3) as entry Skadron:2011:ELE --- apparently a duplicate
%%% record of the same editorial, imported with a placeholder title
%%% ``Untitled'' (the extra fields such as doc-delivery-number and
%%% times-cited suggest a Web-of-Science export --- verify); confirm
%%% and merge the two entries.
@Article{Skadron:2011:U,
  author =       "Kevin Skadron",
  title =        "Untitled",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "1--3",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Skadron:2011:U",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter, pp. 4--7: fairness metrics for multi-threaded
%%% processors.
%%% NOTE(review): restored the acute accent in the second author's
%%% name (Andre -> Andr{\'e} Seznec), using the brace-wrapped TeX
%%% accent form required by this ASCII-only file; the raw affiliation
%%% and author-email fields below are left exactly as received.
@Article{Vandierendonck:2011:FMM,
  author =       "Hans Vandierendonck and Andr{\'e} Seznec",
  title =        "Fairness Metrics for Multi-Threaded Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "4--7",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multi-threaded processors execute multiple threads
                 concurrently in order to increase overall throughput.
                 It is well documented that multi-threading affects
                 per-thread performance but, more importantly, some
                 threads are affected more than others. This is
                 especially troublesome for multi-programmed workloads.
                 Fairness metrics measure whether all threads are
                 affected equally. However defining equal treatment is
                 not straightforward. Several fairness metrics for
                 multi-threaded processors have been utilized in the
                 literature, although there does not seem to be a
                 consensus on what metric does the best job of measuring
                 fairness. This paper reviews the prevalent fairness
                 metrics and analyzes their main properties. Each metric
                 strikes a different trade-off between fairness in the
                 strict sense and throughput. We categorize the metrics
                 with respect to this property. Based on experimental
                 data for SMT processors, we suggest using the minimum
                 fairness metric in order to balance fairness and
                 throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Vandierendonck, H (Reprint Author), Univ Ghent, Dept
                 Elect \& Informat Syst, Ghent, Belgium. Vandierendonck,
                 Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent,
                 Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.",
  author-email = "hans.vandierendonck@elis.ugent.be
                 Andre.Seznec@inria.fr",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Correlation; fairness; fairness metrics; Harmonic
                 analysis; Instruction sets; measurement; Measurement;
                 multi-programming; Multi-threaded processors;
                 multi-threading; multiprocessing systems;
                 multiprogrammed workloads; multithreaded processors;
                 Parallel Architectures; Performance of Systems;
                 quality-of-service; resource allocation; SMT
                 processors; software metrics; System-on-a-chip;
                 Throughput",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "13",
  unique-id =    "Vandierendonck:2011:FMM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter, pp. 8--11: energy efficiency of data prefetching
%%% in embedded mobile systems.
@Article{Tang:2011:PEM,
  author =       "Jie Tang and Shaoshan Liu and Zhimin Gu and Chen Liu
                 and Jean-Luc Gaudiot",
  title =        "Prefetching in Embedded Mobile Systems Can Be
                 Energy-Efficient",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "8--11",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Data prefetching has been a successful technique in
                 high-performance computing platforms. However, the
                 conventional wisdom is that they significantly increase
                 energy consumption, and thus not suitable for embedded
                 mobile systems. On the other hand, as modern mobile
                 applications pose an increasing demand for high
                 performance, it becomes essential to implement
                 high-performance techniques, such as prefetching, in
                 these systems. In this paper, we study the impact of
                 prefetching on the performance and energy consumption
                 of embedded mobile systems. Contrary to the
                 conventional wisdom, our findings demonstrate that as
                 technology advances, prefetching can be
                 energy-efficient while improving performance.
                 Furthermore, we have developed a simple but effective
                 analytical model to help system designers to identify
                 the conditions for energy efficiency.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tang, J (Reprint Author), Beijing Inst Technol,
                 Beijing 100081, Peoples R China. Tang, Jie; Gu, Zhimin,
                 Beijing Inst Technol, Beijing 100081, Peoples R China.
                 Liu, Shaoshan, Microsoft Corp, Redmond, WA 98052 USA.
                 Liu, Chen, Florida Int Univ, Miami, FL 33199 USA.
                 Gaudiot, Jean-Luc, Univ Calif Irvine, Irvine, CA USA.",
  author-email = "tangjie.bit@gmail.com shaoliu@microsoft.com
                 zmgu@x263.net chen.liu@fiu.edu gaudiot@uci.edu",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "data prefetching; embedded mobile systems; embedded
                 systems; energy consumption; energy efficiency
                 condition; energy-efficient prefetching;
                 high-performance computing platform; Low power
                 electronics; Low-power design; Memory management;
                 Memory Structures; mobile computing; Mobile computing;
                 Mobile Computing; storage management",
  number-of-cited-references = "11",
  ORCID-numbers = "Liu, Chen/0000-0003-1558-6836",
  research-areas = "Computer Science",
  times-cited =  "19",
  unique-id =    "Tang:2011:PEM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter, pp. 12--15: DCC, a dependable cache coherence
%%% multicore architecture combining directory coherence with
%%% execution migration.
@Article{Khan:2011:DDC,
  author =       "Omer Khan and Mieszko Lis and Yildiz Sinangil and
                 Srinivas Devadas",
  title =        "{DCC}: a Dependable Cache Coherence Multicore
                 Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "12--15",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cache coherence lies at the core of
                 functionally-correct operation of shared memory
                 multicores. Traditional directory-based hardware
                 coherence protocols scale to large core counts, but
                 they incorporate complex logic and directories to track
                 coherence states. Technology scaling has reached
                 miniaturization levels where manufacturing
                 imperfections, device unreliability and occurrence of
                 hard errors pose a serious dependability challenge.
                 Broken or degraded functionality of the coherence
                 protocol can lead to a non-operational processor or
                 user visible performance loss. In this paper, we
                 propose a dependable cache coherence architecture (DCC)
                 that combines the traditional directory protocol with a
                 novel execution-migration-based architecture to ensure
                 dependability that is transparent to the programmer.
                 Our architecturally redundant execution migration
                 architecture only permits one copy of data to be cached
                 anywhere in the processor: when a thread accesses an
                 address not locally cached on the core it is executing
                 on, it migrates to the appropriate core and continues
                 execution there. Both coherence mechanisms can co-exist
                 in the DCC architecture and we present architectural
                 extensions to seamlessly transition between the
                 directory and execution migration protocols.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khan, O (Reprint Author), MIT, 77 Massachusetts Ave,
                 Cambridge, MA 02139 USA. Khan, Omer; Lis, Mieszko;
                 Sinangil, Yildiz; Devadas, Srinivas, MIT, Cambridge, MA
                 02139 USA. Khan, Omer, Univ Massachusetts, Lowell, MA
                 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecturally redundant execution migration
                 architecture; B.3.4 Reliability, Testing, and
                 Fault-Tolerance; B.8 Performance and Reliability;
                 broken functionality; C.4.b Fault tolerance; cache
                 coherence; cache storage; Coherence; coherence
                 mechanisms; coherence states; DCC architecture;
                 degraded functionality; dependability challenge;
                 Dependable architecture; dependable cache coherence
                 architecture; dependable cache coherence multicore
                 architecture; device unreliability; directory protocol;
                 directory-based hardware coherence protocols;
                 execution-migration-based architecture;
                 functionally-correct operation; Hardware; incorporate
                 complex logic; Instruction sets; large core counts;
                 manufacturing imperfections; memory architecture;
                 memory protocols; microprocessor chips; miniaturization
                 levels; Multicore processing; multicores;
                 nonoperational processor; Protocols; shared memory
                 multicores; shared memory systems; System-on-a-chip;
                 technology scaling; user visible performance loss",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Khan:2011:DDC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter, pp. 16--19: DRAMSim2 cycle-accurate memory-system
%%% simulator; flagged as highly cited in the source database
%%% (esi-highly-cited-paper = "Y", times-cited = "270").
@Article{Rosenfeld:2011:DCA,
  author =       "Paul Rosenfeld and Elliott Cooper-Balis and Bruce
                 Jacob",
  title =        "{DRAMSim2}: a Cycle Accurate Memory System Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "16--19",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper we present DRAMSim2, a cycle accurate
                 memory system simulator. The goal of DRAMSim2 is to be
                 an accurate and publicly available DDR2/3 memory system
                 model which can be used in both full system and
                 trace-based simulations. We describe the process of
                 validating DRAMSim2 timing against manufacturer Verilog
                 models in an effort to prove the accuracy of simulation
                 results. We outline the combination of DRAMSim2 with a
                 cycle-accurate x86 simulator that can be used to
                 perform full system simulations. Finally, we discuss
                 DRAMVis, a visualization tool that can be used to graph
                 and compare the results of DRAMSim2 simulations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rosenfeld, P (Reprint Author), Univ Maryland, Dept
                 Elect \& Comp Engn, College Pk, MD 20742 USA.
                 Rosenfeld, Paul; Cooper-Balis, Elliott; Jacob, Bruce,
                 Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD
                 20742 USA.",
  author-email = "prosenf1@umd.edu ecc17@umd.edu blj@umd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  eissn =        "1556-6064",
  esi-highly-cited-paper = "Y",
  esi-hot-paper = "N",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; cycle accurate memory system
                 simulator; DDR2/3 memory system model; DRAM; DRAM
                 chips; DRAMSim2 simulation; DRAMSim2 timing; Driver
                 circuits; Hardware design languages; Load modeling;
                 memory architecture; memory cards; Object oriented
                 modeling; Primary memory; Random access memory;
                 Simulation; Timing; trace-based simulation; Verilog
                 model; visualization tool",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "270",
  unique-id =    "Rosenfeld:2011:DCA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter, pp. 20--23: exploiting SPMD horizontal locality
%%% to improve DRAM efficiency on manycore accelerators (GPUs).
@Article{Gou:2011:ESH,
  author =       "Chunyang Gou and Georgi N. Gaydadjiev",
  title =        "Exploiting {SPMD} Horizontal Locality",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "20--23",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we analyze a particular spatial
                 locality case (called horizontal locality) inherent to
                 manycore accelerator architectures employing barrel
                 execution of SPMD kernels, such as GPUs. We then
                 propose an adaptive memory access granularity framework
                 to exploit and enforce the horizontal locality in order
                 to reduce the interferences among accelerator cores
                 memory accesses and hence improve DRAM efficiency. With
                 the proposed technique, DRAM efficiency grows by 1.42X
                 on average, resulting in 12.3\% overall performance
                 gain, for a set of representative memory intensive
                 GPGPU applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gou, C (Reprint Author), Delft Univ Technol, NL-2600
                 AA Delft, Netherlands. Gou, Chunyang; Gaydadjiev,
                 Georgi N., Delft Univ Technol, NL-2600 AA Delft,
                 Netherlands.",
  author-email = "c.gou@tudelft.nl g.n.gaydadjiev@tudelft.nl",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator core memory access; adaptive memory access
                 granularity; Bandwidth; barrel execution; DRAM chips;
                 DRAM efficiency; GPU; Graphics processing unit;
                 Instruction sets; interference; Kernel; manycore
                 accelerator architecture; Memory hierarchy;
                 microprocessor chips; Multi-core/single-chip
                 multiprocessors; parallel architectures; Pipelines;
                 Proposals; Random access memory; SIMD processors;
                 single program multiple data; spatial locality; SPMD
                 horizontal locality; SPMD kernel",
  number-of-cited-references = "13",
  ORCID-numbers = "Gaydadjiev, Georgi/0000-0002-3678-7007",
  research-areas = "Computer Science",
  researcherid-numbers = "Gaydadjiev, Georgi/F-1488-2010",
  times-cited =  "1",
  unique-id =    "Gou:2011:ESH",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter, pp. 24--27: GCMS, a global contention management
%%% scheme for hardware transactional memory.
@Article{Wang:2011:GGC,
  author =       "Xiaoqun Wang and Zhenzhou Ji and Chen Fu and Mingzeng
                 Hu",
  title =        "{GCMS}: a Global Contention Management Scheme in
                 Hardware Transactional Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "24--27",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware Transactional Memory (HTM) is a promising
                 Transactional Memory (TM) implementation because of its
                 strong atomicity and high performance. Unfortunately,
                 most contention management approaches in HTMs are
                 dedicated to specific transaction conflict scenarios
                 and it is hard to choose a universal strategy for
                 different workloads. In addition, HTM performance
                 degrades sharply when there are severe transaction
                 conflicts. In this paper, we present a Global
                 Contention Management Scheme (GCMS) to resolve severe
                 transaction conflicts in HTMs. Our scheme depends on a
                 Deadlock and Livelock Detection Mechanism (DLDM) and a
                 Global Contention Manager (GCM) to resolve severe
                 transaction conflicts. This scheme is orthogonal to the
                 rest of the contention management policies. We have
                 incorporated GCMS into different HTMs and compared the
                 performance of the enhanced systems with that of the
                 original HTMs with the STAMP benchmark suite. The
                 results demonstrate that the performance of the
                 enhanced HTMs is improved.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, XQ (Reprint Author), Harbin Inst Technol, Sch
                 Comp Sci, Harbin 150006, Peoples R China. Wang,
                 Xiaoqun; Ji, Zhenzhou; Fu, Chen; Hu, Mingzeng, Harbin
                 Inst Technol, Sch Comp Sci, Harbin 150006, Peoples R
                 China.",
  author-email = "wxiaoqun@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bioinformatics; Concurrent Programming; Contention
                 Management; deadlock-and-livelock detection mechanism;
                 GCMS scheme; Genomics; global contention management
                 scheme; global contention manager; Hardware; Hardware
                 Transactional Memory; hardware transactional memory;
                 Multi-core/single-chip multiprocessors; Multicore
                 Processors; Parallel Programming; Program processors;
                 Radiation detectors; storage management; System
                 recovery; transaction conflict; transaction
                 processing",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Wang:2011:GGC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Front matter, v10 n1: annual list of 2010 reviewers (page 28).
@Article{Anonymous:2011:RL,
  author =       "Anonymous",
  title =        "2010 Reviewers List",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "28--28",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "IEEE publishing",
}

%%% Front matter, v10 n1: 2010 annual index.
%%% NOTE(review): page range is recorded as the ``??--??'' placeholder
%%% --- TODO: fill in from the publisher's issue table of contents.
@Article{Anonymous:2011:AI,
  author =       "Anonymous",
  title =        "2010 Annual Index",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "??--??",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter, v10 n1: inside front cover (page c2).
@Article{Anonymous:2011:Ca,
  author =       "Anonymous",
  title =        "Cover 2",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter, v10 n1: inside back cover (page c3).
@Article{Anonymous:2011:Cb,
  author =       "Anonymous",
  title =        "Cover 3",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter, v10 n1: back cover (page c4).
@Article{Anonymous:2011:Cc,
  author =       "Anonymous",
  title =        "Cover 4",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter, v10 n1: front cover (page c1).
@Article{Anonymous:2011:FCa,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Volume 10, number 2 (July/December 2011) begins here.
%%% Research letter, pp. 29--32: exploiting heterogeneity in
%%% ``homogeneous'' warehouse-scale computers.
@Article{Mars:2011:HHW,
  author =       "Jason Mars and Lingjia Tang and Robert Hundt",
  title =        "Heterogeneity in {``Homogeneous''} Warehouse-Scale
                 Computers: a Performance Opportunity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "29--32",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The class of modern datacenters recently coined as
                 ``warehouse scale computers'' (WSCs) has traditionally
                 been embraced as homogeneous computing platforms.
                 However, due to frequent machine replacements and
                 upgrades, modern WSCs are in fact composed of diverse
                 commodity microarchitectures and machine
                 configurations. Yet, current WSCs are designed with an
                 assumption of homogeneity, leaving a potentially
                 significant performance opportunity unexplored. In this
                 paper, we investigate the key factors impacting the
                 available heterogeneity in modern WSCs, and the benefit
                 of exploiting this heterogeneity to maximize overall
                 performance. We also introduce a new metric,
                 opportunity factor, which can be used to quantify an
                 application's sensitivity to the heterogeneity in a
                 given WSC. For applications that are sensitive to
                 heterogeneity, we observe a performance improvement of
                 up to 70\% when employing our approach. In a WSC
                 composed of state-of-the-art machines, we can improve
                 the overall performance of the entire datacenter by
                 16\% over the status quo.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mars, J (Reprint Author), Univ Virginia,
                 Charlottesville, VA 22903 USA. Mars, Jason; Tang,
                 Lingjia, Univ Virginia, Charlottesville, VA 22903
                 USA.",
  author-email = "jom5x@cs.virginia.edu lt8f@cs.virginia.edu
                 rhundt@google.com",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; computer
                 centres; datacenters; Design studies; Distributed
                 architectures; diverse commodity microarchitectures;
                 Heterogeneous (hybrid) systems; homogeneous
                 warehouse-scale computers; integration and modeling;
                 machine configurations; mainframes; Microarchitecture;
                 Optimization; Scheduling and task partitioning; Super
                 (very large) computers; System architectures",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "22",
  unique-id =    "Mars:2011:HHW",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper on packet chaining (single-cycle allocation for
%%% on-chip networks), IEEE CAL 10(2), July/December 2011, pp. 33--36.
@Article{Michelogiannakis:2011:PCE,
  author =       "George Michelogiannakis and Nan Jiang and Daniel U.
                 Becker and William J. Dally",
  title =        "Packet Chaining: Efficient Single-Cycle Allocation for
                 On-Chip Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "33--36",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper introduces packet chaining, a simple and
                 effective method to increase allocator matching
                 efficiency and hence network performance, particularly
                 suited to networks with short packets and short cycle
                 times. Packet chaining operates by chaining packets
                 destined to the same output together, to reuse the
                 switch connection of a departing packet. This allows an
                 allocator to build up an efficient matching over a
                 number of cycles, like incremental allocation, but not
                 limited by packet length. For a 64-node 2D mesh at
                 maximum injection rate and with single-flit packets,
                 packet chaining increases network throughput by 15\%
                 compared to a conventional single-iteration separable
                 iSLIP allocator, outperforms a wavefront allocator, and
                 gives comparable throughput with an augmenting paths
                 allocator. Packet chaining achieves this performance
                 with a cycle time comparable to a single-iteration
                 separable allocator. Packet chaining also reduces
                 average network latency by 22.5\% compared to iSLIP.
                 Finally, packet chaining increases IPC up to 46\% (16\%
                 average) for application benchmarks because short
                 packets are critical in a typical cache-coherent CMP.
                 These are considerable improvements given the maturity
                 of network-on-chip routers and allocators.",
  acknowledgement = ack-nhfb,
  affiliation =  "Michelogiannakis, G (Reprint Author), Stanford Univ,
                 Stanford, CA 94305 USA. Michelogiannakis, George;
                 Jiang, Nan; Becker, Daniel U.; Dally, William J.,
                 Stanford Univ, Stanford, CA 94305 USA.",
  author-email = "mihelog@stanford.edu njiang37@stanford.edu
                 dub@stanford.edu dally@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-0702341];
                 National Security Agency [H98230-08-C-0272-P007];
                 Robert Bosch Fellowship; Prof. Michael Farmwald
                 Fellowship; Prof. Michael J. Flynn Stanford Graduate
                 Fellowship",
  funding-text = "This work was supported in part by the National
                 Science Foundation under Grant CCF-0702341, in part by
                 the National Security Agency under Contract
                 H98230-08-C-0272-P007 and in part by the Robert Bosch,
                 Prof. Michael Farmwald and Prof. Michael J. Flynn
                 Stanford Graduate Fellowships.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "allocator matching efficiency; Benchmark testing;
                 Interconnection architectures; network performance;
                 network-on-chip; network-on-chip routers; On-chip
                 interconnection networks; on-chip networks; packet
                 chaining; Resource management; single-iteration
                 separable iSLIP allocator; System-on-a-chip;
                 Throughput",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Michelogiannakis:2011:PCE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper on the interaction between device wear-out faults and
%%% security, IEEE CAL 10(2), July/December 2011, pp. 37--40.
%%% NOTE(review): the abstract's "appearing in the filed" and "wear-out
%%% creates" look like typos, but abstracts in this file are reproduced
%%% verbatim from the publisher; confirm against the IEEE Xplore record
%%% before correcting.
@Article{Ho:2011:EIB,
  author =       "Chen-Han Ho and Garret Staus and Aaron Ulmer and
                 Karthikeyan Sankaralingam",
  title =        "Exploring the Interaction Between Device Lifetime
                 Reliability and Security Vulnerabilities",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "37--40",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As technology scales, device reliability is becoming a
                 fundamental problem. Even though manufacture test can
                 guarantee product quality, due to various types of
                 wearout and failure modes, permanent faults appearing
                 in the filed is becoming an increasingly important and
                 real problem. Such types of wear-out creates permanent
                 faults in devices after release to the user during
                 their lifetime. In this paper, we perform a formal
                 investigation of the impact of permanent faults on
                 security, examine empirical evidence, and demonstrate a
                 real attack. Our results show that permanent stuck-at
                 faults may leave security holes in microprocessors. We
                 show that an adversary with knowledge of a fault can
                 launch attacks which can obtain critical secrets such
                 as a private key in 30 seconds.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ho, CH (Reprint Author), Univ Wisconsin, Madison, WI
                 53706 USA. Ho, Chen-Han; Staus, Garret; Ulmer, Aaron;
                 Sankaralingam, Karthikeyan, Univ Wisconsin, Madison, WI
                 53706 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arithmetic and Logic Structures; Circuit faults;
                 Computer bugs; Control Structures and Microprogramming;
                 Cryptography; device lifetime reliability; failure
                 mode; fault tolerant computing; Hardware reliability;
                 Logic programming; microprocessor chips;
                 microprocessors; Permanent Fault; permanent fault;
                 private key; product quality; Program processors;
                 public key cryptography; Reliability; Reliability
                 engineering; Security; security vulnerability; wear-out
                 type; wearout mode",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Ho:2011:EIB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper on fault-tolerant TSV vertical links for 3D stacking,
%%% IEEE CAL 10(2), July/December 2011, pp. 41--44.
%%% Fix: the keywords list contained the exact string "Fault Tolerance"
%%% twice (keyword-merge dedup failure); one copy removed.  The
%%% case-variant entries ("fault tolerance", "Fault tolerant systems")
%%% are distinct keys under this file's merge convention and are kept.
@Article{Hernandez:2011:FTV,
  author =       "Carles Hernandez and Antoni Roca and Jose Flich and
                 Federico Silla and Jose Duato",
  title =        "Fault-Tolerant Vertical Link Design for Effective {3D}
                 Stacking",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "41--44",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Recently, 3D stacking has been proposed to alleviate
                 the memory bandwidth limitation arising in chip
                 multiprocessors (CMPs). As the number of integrated
                 cores in the chip increases the access to external
                 memory becomes the bottleneck, thus demanding larger
                 memory amounts inside the chip. The most accepted
                 solution to implement vertical links between stacked
                 dies is by using Through Silicon Vias (TSVs). However,
                 TSVs are exposed to misalignment and random defects
                 compromising the yield of the manufactured 3D chip. A
                 common solution to this problem is by
                 over-provisioning, thus impacting on area and cost. In
                 this paper, we propose a fault-tolerant vertical link
                 design. With its adoption, fault-tolerant vertical
                 links can be implemented in a 3D chip design at low
                 cost without the need of adding redundant TSVs (no
                 over-provision). Preliminary results are very promising
                 as the fault-tolerant vertical link design increases
                 switch area only by 6.69\% while the achieved
                 interconnect yield tends to 100\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hernandez, C (Reprint Author), Univ Politecn Valencia,
                 C Cami de Vera S-N, Valencia 46022, Spain. Hernandez,
                 Carles; Roca, Antoni; Flich, Jose; Silla, Federico;
                 Duato, Jose, Univ Politecn Valencia, Valencia 46022,
                 Spain.",
  author-email = "carherlu@gap.upv.es",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish MEC; MICINN; European Commission
                 [CSD2006-00046, TIN2009-14475-C04]; NaNoC [248972]",
  funding-text = "This work was supported by the Spanish MEC and MICINN,
                 as well as European Commission FEDER funds, under
                 Grants CSD2006-00046 and TIN2009-14475-C04. It was also
                 partly supported by the project NaNoC (project label
                 248972) which is funded by the European Commission
                 within the Research Programme FP7.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D chip; 3D Stacking; 3D stacking; chip
                 multiprocessors; CMP; effective 3D stacking; external
                 memory; Fault Tolerance; fault tolerance; Fault
                 tolerant systems; fault-tolerant vertical link design;
                 memory bandwidth limitation; Memory management;
                 microprocessor chips; network-on-chip; NoC; Stacking;
                 storage management chips; Three dimensional displays;
                 three-dimensional integrated circuits; through silicon
                 vias; TSV",
  number-of-cited-references = "20",
  oa =           "Green Published",
  ORCID-numbers = "Silla, Federico/0000-0002-6435-1200 Hernandez,
                 Carles/0000-0001-5393-3195",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Hernandez:2011:FTV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper on distributed shared cache profiling/optimization on
%%% Tilera's Tile processor, IEEE CAL 10(2), July/December 2011,
%%% pp. 45--48.
@Article{Choi:2011:EID,
  author =       "Inseok Choi and Minshu Zhao and Xu Yang and Donald
                 Yeung",
  title =        "Experience with Improving Distributed Shared Cache
                 Performance on {Tilera}'s {Tile} Processor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "45--48",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes our experience with profiling and
                 optimizing physical locality for the distributed shared
                 cache (DSC) in Tilera's Tile multicore processor. Our
                 approach uses the Tile Processor's hardware performance
                 measurement counters (PMCs) to acquire page-level
                 access pattern profiles. A key problem we address is
                 imprecise PMC interrupts. Our profiling tools use
                 binary analysis to correct for interrupt ``skid'', thus
                 pinpointing individual memory operations that incur
                 remote DSC slice references and permitting us to sample
                 their access patterns. We use our access pattern
                 profiles to drive page homing optimizations for both
                 heap and static data objects. Our experiments show we
                 can improve physical locality for 5 out of 11 SPLASH2
                 benchmarks running on 32 cores, enabling 32.9\%-77.9\%
                 of DSC references to target the local DSC slice. To our
                 knowledge, this is the first work to demonstrate page
                 homing optimizations on a real system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Choi, I (Reprint Author), Univ Maryland, Dept Elect \&
                 Comp Engn, College Pk, MD 20742 USA. Choi, Inseok;
                 Zhao, Minshu; Yang, Xu; Yeung, Donald, Univ Maryland,
                 Dept Elect \& Comp Engn, College Pk, MD 20742 USA.",
  author-email = "inseok@umd.edu mszhao@umd.edu yangxu@umd.edu
                 yeung@umd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; binary analysis; cache storage;
                 Computer architecture; Data streams; Design
                 methodology; Design studies; distributed shared cache
                 performance; hardware performance measurement counters;
                 microprocessor chips; Multi-core/single-chip
                 multiprocessors; Multicore processing; Multiple Data
                 Stream Architectures (Multiprocessors); multiprocessing
                 systems; Multiprocessing systems; page homing
                 optimization; page-level access pattern profile; PMC
                 interrupt; profiling tool; Tilera tile multicore
                 processor",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Choi:2011:EID",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper on analytical multilevel cache modeling for CMPs,
%%% IEEE CAL 10(2), July/December 2011, pp. 49--52.  Case-variant
%%% keyword pairs (e.g., "cache storage" / "Cache storage") reflect the
%%% merge of author keywords with indexer terms and are intentional.
@Article{Prieto:2011:MCM,
  author =       "Pablo Prieto and Valentin Puente and Jose-Angel
                 Gregorio",
  title =        "Multilevel Cache Modeling for Chip-Multiprocessor
                 Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper presents a simple analytical model for
                 predicting on-chip cache hierarchy effectiveness in
                 chip multiprocessors (CMP) for a state-of-the-art
                 architecture. Given the complexity of this type of
                 systems, we use rough approximations, such as the
                 empirical observation that the re-reference timing
                 pattern follows a power law and the assumption of a
                 simplistic delay model for the cache, in order to
                 provide a useful model for the memory hierarchy
                 responsiveness. This model enables the analytical
                 determination of average access time, which makes
                 design space pruning useful before sweeping the vast
                 design space of this class of systems. The model is
                 also useful for predicting cache hierarchy behavior in
                 future systems. The fidelity of the model has been
                 validated using a state-of-the-art, full-system
                 simulation environment, on a system with up to sixteen
                 out-of-order processors with cache-coherent caches and
                 using a broad spectrum of applications, including
                 complex multithread workloads. This simple model can
                 predict a near-to-optimal, on-chip cache distribution
                 while also estimating how future systems running future
                 applications might behave.",
  acknowledgement = ack-nhfb,
  affiliation =  "Prieto, P (Reprint Author), Univ Cantabria, Cantabria,
                 Spain. Prieto, Pablo; Puente, Valentin; Gregorio,
                 Jose-Angel, Univ Cantabria, Cantabria, Spain.",
  author-email = "prietop@unican.es vpuente@unican.es
                 monaster@unican.es",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Science and Innovation
                 [TIN2010-18159]; HiPEAC2 European Network of
                 Excellence",
  funding-text = "This work has been supported by the Spanish Ministry
                 of Science and Innovation, under contracts
                 TIN2010-18159, and by the HiPEAC2 European Network of
                 Excellence. The authors would like to thank the
                 reviewers for their valuable comments.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "approximation theory; cache hierarchy behavior
                 prediction; cache storage; Cache storage;
                 cache-coherent caches; chip-multiprocessor systems;
                 complex multithread workloads; Complexity theory;
                 Computational modeling; design space; integrated
                 circuit design; Memory hierarchy; memory hierarchy
                 responsiveness; microprocessor chips;
                 Multi-core/single-chip multiprocessors; multilevel
                 cache modeling; multiprocessing systems;
                 Multiprocessing systems; near-to-optimal on-chip cache
                 distribution; on-chip cache hierarchy effectiveness
                 prediction; power law; re-reference timing pattern;
                 rough approximations; simplistic delay model
                 assumption; Software tools; Thermal analysis; Thermal
                 sensors",
  number-of-cited-references = "13",
  ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente,
                 Valentin/0000-0002-6904-3282 Gregorio, Jose
                 Angel/0000-0003-2214-303X",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Prieto:2011:MCM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper on rapid hierarchical thermal analysis, IEEE CAL
%%% 10(2), July/December 2011, pp. 53--56.
@Article{Siozios:2011:SRT,
  author =       "Kostas Siozios and Dimitrios Rodopoulos and Dimitrios
                 Soudris",
  title =        "On Supporting Rapid Thermal Analysis",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "53--56",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Detailed thermal analysis is usually performed
                 exclusively at design time since it is a
                 computationally intensive task. In this paper, we
                 introduce a novel methodology for fast, yet accurate,
                 thermal analysis. The introduced methodology is
                 software supported by a new open source tool that
                 enables hierarchical thermal analysis with adaptive
                 levels of granularity. Experimental results prove the
                 efficiency of our approach since it leads to average
                 reduction of the execution overhead up to 70\% with a
                 penalty in accuracy ranging between 2\% and 8\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Siozios, K (Reprint Author), Natl Tech Univ Athens,
                 Sch ECE, GR-10682 Athens, Greece. Siozios, Kostas;
                 Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
                 Univ Athens, Sch ECE, GR-10682 Athens, Greece.",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Complexity theory; Computational modeling; Computer
                 Systems Organization; Design Methodologies; General;
                 Hardware; hierarchical thermal analysis; Modeling
                 techniques; Monitoring; open source tool; Performance
                 of Systems; Power Management; public domain software;
                 rapid thermal analysis; Reconfigurable Hardware;
                 Reconfigurable hardware; Reliability; software
                 engineering; software supported; Software tools;
                 thermal analysis; Thermal analysis; Thermal Monitoring;
                 Thermal sensors",
  number-of-cited-references = "8",
  ORCID-numbers = "Siozios, Kostas/0000-0002-0285-2202 Soudris,
                 Dimitrios/0000-0002-6930-6847",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/I-5252-2014 Siozios,
                 Kostas/F-9726-2011 Soudris, Dimitrios/O-8843-2019",
  times-cited =  "3",
  unique-id =    "Siozios:2011:SRT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal boilerplate: inside back cover ("Cover 3") of IEEE CAL
%%% 10(2); pages "c3--c3" is this file's convention for cover pages.
@Article{Anonymous:2011:Cd,
  author =       "Anonymous",
  title =        "Cover 3",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.30",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Journal boilerplate: front cover of IEEE CAL 10(2).  The braces in
%%% the title protect the bracketed editorial label from recasing.
@Article{Anonymous:2011:FCb,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "c1--c1",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.28",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Journal boilerplate: IEEE Computer Society information page (back
%%% cover, "c4") of IEEE CAL 10(2).
@Article{Anonymous:2011:ICS,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [society information]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.31",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Journal boilerplate: publication-information page (inside front
%%% cover, "c2") of IEEE CAL 10(2).
@Article{Anonymous:2011:PI,
  author =       "Anonymous",
  title =        "Publication information",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.29",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Letters paper proposing hybrid discrete-continuous architectures,
%%% IEEE CAL 11(1), January/June 2012, pp. 1--4.
%%% Fix: the keywords list contained exact duplicates -- "Computer
%%% architecture" twice and "computer architecture" twice (keyword-merge
%%% dedup failure); one copy of each removed.  The remaining
%%% case-variant pair is intentional under this file's merge convention.
@Article{Sethumadhavan:2012:CHD,
  author =       "Simha Sethumadhavan and Ryan Roberts and Yannis
                 Tsividis",
  title =        "A Case for Hybrid Discrete-Continuous Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Current technology trends indicate that power- and
                 energy-efficiency will limit chip throughput in the
                 future. Current solutions to these problems, either in
                 the way of programmable or fixed-function digital
                 accelerators will soon reach their limits as
                 microarchitectural overheads are successively trimmed.
                 A significant departure from current computing methods
                 is required to carry forward computing advances beyond
                 digital accelerators. In this paper we describe how the
                 energy-efficiency of a large class of problems can be
                 improved by employing a hybrid of the discrete and
                 continuous models of computation instead of the
                 ubiquitous, traditional discrete model of computation.
                 We present preliminary analysis of domains and
                 benchmarks that can be accelerated with the new model.
                 Analysis shows that machine learning, physics and up to
                 one-third of SPEC, RMS and Berkeley suite of
                 applications can be accelerated with the new hybrid
                 model.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sethumadhavan, S (Reprint Author), Columbia Univ, New
                 York, NY 10027 USA. Sethumadhavan, Simha; Roberts,
                 Ryan; Tsividis, Yannis, Columbia Univ, New York, NY
                 10027 USA.",
  author-email = "simha@cs.columbia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "DARPA; AFRL [FA8750-10-2-0253,
                 FA9950-09-1-0389]; NSF",
  funding-text = "Sethumadhavan's research is funded by grants from
                 DARPA, AFRL (FA8750-10-2-0253, FA9950-09-1-0389), the
                 NSF CAREER program, gifts from Microsoft Research and
                 Columbia University, and software donations from
                 Synopsys and Wind River. Roberts conducted this
                 research as a GRA in Sethumadhavan's Lab.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Approximation algorithms; Benchmark testing; Berkeley
                 suite; Computational modeling; Computer architecture;
                 computer architecture; computing methods; continuous
                 models; cryptography; Design studies; Differential
                 equations; discrete model; discrete models; domains
                 analysis; energy conservation; energy-efficiency;
                 fixed-function digital accelerators; forward computing
                 advances; hybrid discrete-continuous architectures;
                 Hybrid systems; machine learning; Mathematical model;
                 microarchitectural overheads; microprocessor chips;
                 power-efficiency; Processor architectures; RMS; SPEC;
                 Very large scale integration",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Sethumadhavan:2012:CHD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kong:2012:ASF,
  author =       "Ji Kong and Peilin Liu and Yu Zhang",
  title =        "Atomic Streaming: A Framework of On-Chip Data Supply
                 System for Task-Parallel {MPSoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "State of the art fabrication technology for
                 integrating numerous hardware resources such as
                 Processors/DSPs and memory arrays into a single chip
                 enables the emergence of Multiprocessor System-on-Chip
                 (MPSoC). Stream programming paradigm based on MPSoC is
                 highly efficient for single functionality scenario due
                 to its dedicated and predictable data supply system.
                 However, when memory traffic is heavily shared among
                 parallel tasks in applications with multiple
                 interrelated functionalities, performance suffers
                 through task interferences and shared memory
                 congestions which lead to poor parallel speedups and
                 memory bandwidth utilizations. This paper proposes a
                 framework of stream processing based on-chip data
                 supply system for task-parallel MPSoCs. In this
                 framework, stream address generations and data
                 computations are decoupled and parallelized to allow
                 full utilization of on-chip resources. Task
                 granularities are dynamically tuned to jointly optimize
                 the overall application performance. Experiments show
                 that proposed framework as well as the tuning scheme
                 are effective for joint optimization in task-parallel
                 MPSoCs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kong, J (Reprint Author), Shanghai Jiao Tong Univ, Sch
                 Elect Informat \& Elect Engn, Shanghai 200030, Peoples
                 R China. Kong, Ji; Liu, Peilin, Shanghai Jiao Tong
                 Univ, Sch Elect Informat \& Elect Engn, Shanghai
                 200030, Peoples R China.",
  author-email = "johnhophen@sjtu.edu.cn liupeilin@sjtu.edu.cn
                 zhyu@cn.ibm.com",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "IBM Research-China under the IBM",
  funding-text = "This work has been partially supported by IBM
                 Research-China under the IBM Ph.D. Fellowship program
                 for the 2010-2011 academic year.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application studies resulting in better
                 multiple-processor systems; atomic streaming;
                 Bandwidth; data computations; Memory hierarchy;
                 Multi-core/single-chip multiprocessors; Multicore
                 processing; Multiple Data Stream Architectures
                 (Multiprocessors); Multiprocessing systems;
                 multiprocessor system-on-chip; on-chip data supply
                 system; Prefetching; shared memory congestions; shared
                 memory systems; stream address generations; stream
                 programming paradigm; Streaming media;
                 System-on-a-chip; system-on-chip; task interferences;
                 task-parallel MPSoC; Throughput",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kong:2012:ASF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Deb:2012:HSC,
  author =       "Abhishek Deb and Josep Maria Codina and Antonio
                 Gonzalez",
  title =        "A {HW\slash SW} Co-designed Programmable Functional
                 Unit",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "In this paper, we propose a novel programmable
                 functional unit (PFU) to accelerate general purpose
                 application execution on a modern out-of-order x86
                 processor. Code is transformed and instructions are
                 generated that run on the PFU using a co-designed
                 virtual machine (Cd-VM). Results presented in this
                 paper show that this HW/SW co-designed approach
                 produces average speedups in performance of 29\% in
                 SPECFP and 19\% in SPECINT, and up-to 55\%, over modern
                 out-of-order processor.",
  acknowledgement = ack-nhfb,
  affiliation =  "Deb, A (Reprint Author), Univ Politecn Cataluna, C
                 Jordi Girona 1-3, Barcelona, Spain. Deb, Abhishek;
                 Gonzalez, Antonio, Univ Politecn Cataluna, Barcelona,
                 Spain. Maria Codina, Josep; Gonzalez, Antonio, Intel
                 Res Labs Barcelona, Barcelona, Spain.",
  author-email = "abhishek@ac.upc.edu josep.m.codina@intel.com
                 antonio@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; hardware-software codesign;
                 Hardware/software interfaces; hw/sw co-designed;
                 Interface states; Load modeling; Micro-architecture
                 implementation considerations; Microarchitecture;
                 Processor Architectures; programmable functional unit;
                 Programmable functional units; Registers; virtual
                 machine",
  number-of-cited-references = "13",
  ORCID-numbers = "Gonzalez, Antonio/0000-0002-0009-0996",
  research-areas = "Computer Science",
  researcherid-numbers = "Gonzalez, Antonio/I-2961-2014",
  times-cited =  "0",
  unique-id =    "Deb:2012:HSC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Piscitelli:2012:HLP,
  author =       "Roberta Piscitelli and Andy D. Pimentel",
  title =        "A High-Level Power Model for {MPSoC} on {FPGA}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper presents a framework for high-level power
                 estimation of multiprocessor systems-on-chip (MPSoC)
                 architectures on FPGA. The technique is based on
                 abstract execution profiles, called event signatures.
                 As a result, it is capable of achieving good evaluation
                 performance, thereby making the technique highly useful
                 in the context of early system-level design space
                 exploration. We have integrated the power estimation
                 technique in a system-level MPSoC synthesis framework.
                 Using this framework, we have designed a range of
                 different candidate MPSoC architectures and compared
                 our power estimation results to those from real
                 measurements on a Virtex-6 FPGA board.",
  acknowledgement = ack-nhfb,
  affiliation =  "Piscitelli, R (Reprint Author), Univ Amsterdam, Inst
                 Informat, NL-1012 WX Amsterdam, Netherlands.
                 Piscitelli, Roberta; Pimentel, Andy D., Univ Amsterdam,
                 Inst Informat, NL-1012 WX Amsterdam, Netherlands.",
  author-email = "r.piscitelli@uva.nl a.d.pimentel@uva.nl",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "MADNESS STREP",
  funding-text = "This work has been partially supported by the MADNESS
                 STREP-FP7 European Project.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstract execution profiles; Computational modeling;
                 Computer architecture; Estimation; event signatures;
                 Field programmable gate arrays; field programmable gate
                 arrays; Field programmable gate arrays; Formal models;
                 High-level power estimation; high-level power
                 estimation framework; high-level power model;
                 integrated circuit design; Mathematical model;
                 Microprocessors; MPSoC on FPGA; multiprocessing
                 systems; multiprocessor systems-on-chip architectures;
                 Performance Analysis and Design Aids; performance
                 evaluation; power aware computing; Power demand; power
                 estimation technique; Simulation; system-level design
                 space exploration; system-level MPSoC design space
                 exploration; system-level MPSoC synthesis framework;
                 system-on-chip; Virtex-6 FPGA board",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Piscitelli:2012:HLP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Finlayson:2012:OSP,
  author =       "Ian Finlayson and Gang-Ryung Uh and David Whalley and
                 Gary Tyson",
  title =        "An Overview of Static Pipelining",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A new generation of mobile applications requires
                 reduced energy consumption without sacrificing
                 execution performance. In this paper, we propose to
                 respond to these conflicting demands with an innovative
                 statically pipelined processor supported by an
                 optimizing compiler. The central idea of the approach
                 is that the control during each cycle for each portion
                 of the processor is explicitly represented in each
                 instruction. Thus the pipelining is in effect
                 statically determined by the compiler. The benefits of
                 this approach include simpler hardware and that it
                 allows the compiler to perform optimizations that are
                 not possible on traditional architectures. The initial
                 results indicate that static pipelining can
                 significantly reduce power consumption without
                 adversely affecting performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Finlayson, I (Reprint Author), Florida State Univ,
                 Dept Comp Sci, Tallahassee, FL 32306 USA. Finlayson,
                 Ian; Whalley, David; Tyson, Gary, Florida State Univ,
                 Dept Comp Sci, Tallahassee, FL 32306 USA. Uh,
                 Gang-Ryung, Boise State Univ, Dept Comp Sci, Boise, ID
                 83725 USA.",
  author-email = "finlayso@cs.fsu.edu uh@cs.boisestate.edu
                 whalley@cs.fsu.edu tyson@cs.fsu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CNS-0964413, CNS-0915926]",
  funding-text = "We thank the anonymous reviewers for their
                 constructive comments and suggestions. This research
                 was supported in part by NSF grants CNS-0964413 and
                 CNS-0915926.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; Energy
                 consumption; energy consumption reduction; execution
                 performance; General; mobile applications; optimising
                 compilers; Optimization; optimizing compiler; Pipeline
                 processing; pipeline processing; Pipeline processors;
                 power aware computing; Radio frequency; Registers;
                 statically pipelined processor",
  number-of-cited-references = "14",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Finlayson:2012:OSP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wu:2012:CID,
  author =       "Lisa Wu and Martha A. Kim and Stephen A. Edwards",
  title =        "Cache Impacts of Datatype Acceleration",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware acceleration is a widely accepted solution
                 for performance and energy efficient computation
                 because it removes unnecessary hardware for general
                 computation while delivering exceptional performance
                 via specialized control paths and execution units. The
                 spectrum of accelerators available today ranges from
                 coarse-grain off-load engines such as GPUs to
                 fine-grain instruction set extensions such as SSE. This
                 research explores the benefits and challenges of
                 managing memory at the data-structure level and
                 exposing those operations directly to the ISA. We call
                 these instructions Abstract Datatype Instructions
                 (ADIs). This paper quantifies the performance and
                 energy impact of ADIs on the instruction and data cache
                 hierarchies. For instruction fetch, our measurements
                 indicate that ADIs can result in 21-48\% and 16-27\%
                 reductions in instruction fetch time and energy
                 respectively. For data delivery, we observe a 22-40\%
                 reduction in total data read/write time and 9-30\% in
                 total data read/write energy.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wu, L (Reprint Author), Columbia Univ, Dept Comp Sci,
                 New York, NY 10027 USA. Wu, Lisa; Kim, Martha A.;
                 Edwards, Stephen A., Columbia Univ, Dept Comp Sci, New
                 York, NY 10027 USA.",
  author-email = "lisa@cs.columbia.edu martha@cs.columbia.edu
                 sedwards@cs.columbia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstract data types; abstract datatype instruction;
                 Accelerators; ADI; cache hierarchy; Cache Hierarchy;
                 cache hierarchy; Cache memories; cache storage; coarse
                 grain off-load engine; data read-write energy; data
                 structure level; Data Structures; energy conservation;
                 energy efficient computation; energy impact; execution
                 unit; fine grain instruction set extension; hardware
                 acceleration; Hardware acceleration; hardware
                 acceleration; Hardware/software interfaces; Instruction
                 fetch; instruction fetch energy; instruction fetch
                 time; Instruction Set Extensions; instruction sets;
                 ISA; Memory hierarchy; memory management; Memory
                 Structures; Multicore processing; power aware
                 computing; Program processors; Support vector machines;
                 Vectors",
  number-of-cited-references = "15",
  ORCID-numbers = "Edwards, Stephen/0000-0003-2609-4861",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Wu:2012:CID",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2012:RL,
  author =       "Anonymous",
  title =        "2011 Reviewers List",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "25--26",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Lists the reviewers who contributed to IEEE Computer
                 Architecture Letters in 2011.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "IEEE publishing",
}

@Article{Anonymous:2012:TNQ,
  author =       "Anonymous",
  title =        "There now is a quick and easy way to find out about
                 our collection of {{\booktitle{Transactions}}}
                 [Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "26--26",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement: Visit http://www.computer.org/whats-new
                 today!",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2012:ACP,
  author =       "Anonymous",
  title =        "Advertisement --- {Conference Publishing Services
                 (CPS)}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "28--28",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "IEEE Conference Publishing Services (CPS)
                 advertisement.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2012:AI,
  author =       "Anonymous",
  title =        "2011 Annual Index",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "??--??",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This index covers all technical items --- papers,
                 correspondence, reviews, etc. --- that appeared in this
                 periodical during the year, and items from previous
                 years that were commented upon or corrected in this
                 year. Departments and other items may also be covered
                 if they have been judged to have archival value. The
                 Author Index contains the primary entry for each item,
                 listed under the first author's name. The primary entry
                 includes the co-authors' names, the title of the paper
                 or other item, and its location, specified by the
                 publication abbreviation, year, month, and inclusive
                 pagination. The Subject Index contains entries
                 describing the item under all appropriate subject
                 headings, plus the first author's name, the publication
                 abbreviation, month, and year, and inclusive pages.
                  Note that the item title is found only under the primary
                 entry in the Author Index.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

@Article{Anonymous:2012:Ca,
  author =       "Anonymous",
  title =        "{[Cover2]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2012:Cb,
  author =       "Anonymous",
  title =        "{[Cover3]}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2012:FCT,
  author =       "Anonymous",
  title =        "{[Front} cover and table of contents]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the table of contents for this issue of the
                 periodical.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2012:ICS,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Back cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current committee members and
                 society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Davis:2012:IVL,
  author =       "John D. Davis and Suzanne Rivoire and Moises
                 Goldszmidt and Ehsan K. Ardestani",
  title =        "Including Variability in Large-Scale Cluster Power
                 Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "29--32",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Studying the energy efficiency of large-scale computer
                 systems requires models of the relationship between
                 resource utilization and power consumption. Prior work
                 on power modeling assumes that models built for a
                 single node will scale to larger groups of machines.
                 However, we find that inter-node variability in
                 homogeneous clusters leads to substantially different
                 models for different nodes. Moreover, ignoring this
                 variability will result in significant prediction
                 errors when scaled to the cluster level. We report on
                 inter-node variation for four homogeneous five-node
                 clusters using embedded, laptop, desktop, and server
                 processors. The variation is manifested quantitatively
                 in the prediction error and qualitatively on the
                 resource utilization variables (features) that are
                 deemed relevant for the models. These results
                 demonstrate the need to sample multiple machines in
                 order to produce accurate cluster models.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rivoire, Suzanne, Sonoma State Univ, Rohnert Pk, CA
                 94928 USA. Ardestani, Ehsan K., Univ CA, Santa Cruz, CA
                 USA.",
  author-email = "john.d@microsoft.com suzanne.rivoire@sonoma.edu
                 moises@microsoft.com eka@soe.ucsc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Data models; evaluation;
                 Measurement; modeling; Power demand; Power Management;
                 Power measurement; Predictive models; Radiation
                 detectors; Servers; simulation of multiple-processor
                 systems",
  number-of-cited-references = "26",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Davis:2012:IVL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lakshminarayana:2012:DSP,
  author =       "Nagesh B. Lakshminarayana and Jaekyu Lee and Hyesoon
                 Kim and Jinwoo Shin",
  title =        "{DRAM} Scheduling Policy for {GPGPU} Architectures
                 Based on a Potential Function",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "33--36",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.32",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "GPGPU architectures (applications) have several
                 different characteristics compared to traditional CPU
                 architectures (applications): highly multithreaded
                 architectures and SIMD-execution behavior are the two
                 important characteristics of GPGPU computing. In this
                 paper, we propose a potential function that models the
                 DRAM behavior in GPGPU architectures and a DRAM
                 scheduling policy, alpha-SJF policy to minimize the
                 potential function. The scheduling policy essentially
                 chooses between SJF and FR-FCFS at run-time based on
                 the number of requests from each thread and whether the
                 thread has a row buffer hit.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lakshminarayana, NB (Reprint Author), Georgia Inst
                 Technol, Sch Comp Sci, Atlanta, GA 30332 USA.
                 Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon;
                 Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci,
                 Atlanta, GA 30332 USA.",
  author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu
                 hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; DRAM chips;
                 DRAM scheduling; DRAM scheduling policy; dynamic random
                 access memory; Equations; general-purpose graphics
                 processing unit; GPGPU; GPGPU architecture; graphics
                 processing units; Instruction sets; Mathematical model;
                 multi-threading; multithreaded architecture; Potential
                 function; potential function; Potential function;
                 Processor scheduling; Random access memory; row buffer
                 hit; scheduling; SIMD-execution behavior",
  number-of-cited-references = "5",
  research-areas = "Computer Science",
  researcherid-numbers = "Shin, Jinwoo/M-5389-2013",
  times-cited =  "7",
  unique-id =    "Lakshminarayana:2012:DSP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wang:2012:ISA,
  author =       "Yaohua Wang and Shuming Chen and Kai Zhang and
                 Jianghua Wan and Xiaowen Chen and Hu Chen and Haibo
                 Wang",
  title =        "Instruction Shuffle: Achieving {MIMD}-like Performance
                 on {SIMD} Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "37--40",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.34",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "SIMD architectures are less efficient for applications
                 with the diverse control-flow behavior, which can be
                 mainly attributed to the requirement of the identical
                 control-flow. In this paper, we propose a novel
                 instruction shuffle scheme that features an efficient
                 control-flow handling mechanism. The cornerstones are
                 composed of a shuffle source instruction buffer array
                 and an instruction shuffle unit. The shuffle unit can
                 concurrently deliver instructions of multiple distinct
                 control-flows from the instruction buffer array to
                 eligible SIMD lanes. Our instruction shuffle scheme
                 combines the best attributes of both the SIMD and MIMD
                 execution paradigms. Experimental results show that, an
                 average performance improvement of 86\% can be
                 achieved, at a cost of only 5.8\% area overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, YH (Reprint Author), Natl Univ Def Technol, Sch
                 Comp Sci, Changsha, Hunan, Peoples R China. Wang,
                 Yaohua; Chen, Shuming; Zhang, Kai; Wan, Jianghua; Chen,
                 Xiaowen; Chen, Hu; Wang, Haibo, Natl Univ Def Technol,
                 Sch Comp Sci, Changsha, Hunan, Peoples R China.",
  author-email = "nudtyh@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Natural Science Foundation of
                 China [61070036, 61133007]; National 863 Program of
                 China [2009AA011704]",
  funding-text = "The work is partially supported by the National
                 Natural Science Foundation of China (No. 61070036), the
                 National Natural Science Foundation of China (No.
                 61133007), the National 863 Program of China (No.
                 2009AA011704).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arrays; data dependent control-flow; diverse
                 control-flow behavior; identical control-flow behavior;
                 instruction buffer array; Instruction sets; instruction
                 shuffle; instruction shuffle unit; Kernel; MIMD
                 execution paradigm; MIMD-like performance; multiple
                 instruction multiple data; parallel processing; Process
                 control; Resource management; Scalability; shuffle
                 source instruction buffer array; SIMD; SIMD
                 architecture; SIMD execution paradigm; single
                 instruction multiple data; Vectors",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  researcherid-numbers = "Chen, Shuming/Q-1147-2018",
  times-cited =  "6",
  unique-id =    "Wang:2012:ISA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Panda:2012:BFB,
  author =       "Reena Panda and Paul V. Gratz and Daniel A.
                 Jim{\'e}nez",
  title =        "{B-Fetch}: Branch Prediction Directed Prefetching for
                 In-Order Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "41--44",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.33",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Computer architecture is beset by two opposing trends.
                 Technology scaling and deep pipelining have led to high
                 memory access latencies; meanwhile, power and energy
                 considerations have revived interest in traditional
                 in-order processors. In-order processors, unlike their
                 superscalar counterparts, do not allow execution to
                 continue around data cache misses. In-order processors,
                 therefore, suffer a greater performance penalty in the
                 light of the current high memory access latencies.
                 Memory prefetching is an established technique to
                 reduce the incidence of cache misses and improve
                 performance. In this paper, we introduce B-Fetch, a new
                 technique for data prefetching which combines branch
                 prediction based lookahead deep path speculation with
                 effective address speculation, to efficiently improve
                 performance in in-order processors. Our results show
                 that B-Fetch improves performance 38.8\% on SPEC
                 CPU2006 benchmarks, beating a current, state-of-the-art
                  prefetcher design at approximately 1/3 the hardware
                 overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Panda, R (Reprint Author), Texas A\&M Univ, Dept Elect
                 \& Comp Engn, CESG, College Stn, TX 77843 USA. Panda,
                 Reena; Gratz, Paul V., Texas A\&M Univ, Dept Elect \&
                 Comp Engn, CESG, College Stn, TX 77843 USA. Jimenez,
                 Daniel A., Univ Texas San Antonio, Dept Comp Sci, San
                 Antonio, TX USA.",
  author-email = "reena.panda@tamu.edu pgratz@tamu.edu dj@cs.utsa.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address speculation; B-fetch; Benchmark testing;
                 Branch Prediction; branch prediction based lookahead
                 deep path speculation; branch prediction directed
                 prefetching; Cache memory; computer architecture;
                 Computer architecture; data cache; Data Cache
                 Prefetching; deep pipelining; energy consideration;
                 Hardware; in-order processor; In-order Processors;
                 memory access latency; memory prefetching; Memory
                 Systems; Pipelines; power aware computing; power
                 consideration; Prefetching; Process control; Registers;
                 storage management; superscalar processor; technology
                 scaling; Value Prediction",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Panda:2012:BFB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Miller:2012:MEP,
  author =       "Timothy N. Miller and Renji Thomas and Radu
                 Teodorescu",
  title =        "Mitigating the Effects of Process Variation in
                 Ultra-low Voltage Chip Multiprocessors using Dual
                 Supply Voltages and Half-Speed Units",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "45--48",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.36",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Energy efficiency is a primary concern for
                 microprocessor designers. One very effective approach
                 to improving processor energy efficiency is to lower
                 its supply voltage to very near to the transistor
                 threshold voltage. This reduces power consumption
                 dramatically, improving energy efficiency by an order
                 of magnitude. Low voltage operation, however, increases
                 the effects of parameter variation resulting in
                 significant frequency heterogeneity between (and
                 within) otherwise identical cores. This heterogeneity
                 severely limits the maximum frequency of the entire
                 CMP. We present a combination of techniques aimed at
                 reducing the effects of variation on the performance
                 and energy efficiency of near-threshold, many-core
                 CMPs. Dual Voltage Rail (DVR), mitigates core-to-core
                 variation with a dual-rail power delivery system that
                 allows post-manufacturing assignment of different
                 supply voltages to individual cores. This speeds up
                 slow cores by assigning them to a higher voltage and
                 saves power on fast cores by assigning them to a lower
                 voltage. Half-Speed Unit (HSU) mitigates within-core
                 variation by halving the frequency of select functional
                 blocks with the goal of boosting the frequency of
                 individual cores, thus raising the frequency ceiling
                 for the entire CMP. Together, these variation-reduction
                 techniques result in almost 50\% improvement in CMP
                 performance for the same power consumption over a mix
                 of workloads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Miller, TN (Reprint Author), Ohio State Univ, Dept
                 Comp Sci \& Engn, Columbus, OH 43210 USA. Miller,
                 Timothy N.; Thomas, Renji; Teodorescu, Radu, Ohio State
                 Univ, Dept Comp Sci \& Engn, Columbus, OH 43210 USA.",
  author-email = "millerti@cse.ohio-state.edu thomasr@cse.ohio-state.edu
                 teodores@cse.ohio-state.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-1117799]",
  funding-text = "This work was supported in part by the National
                 Science Foundation under grant CCF-1117799 and an
                 allocation of computing time from the Ohio
                 Supercomputer Center. The authors would like to thank
                 the anonymous reviewers for their suggestions and
                 feedback.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; chip multiprocessors; Clocks; CMP
                 frequency ceiling; CMP performance; Computer
                 architecture; core-to-core variation; Delay; dual
                 supply voltage; dual voltage rail; dual-rail power
                  delivery system; energy conservation; Energy
                  efficiency; energy efficiency;
                 frequency heterogeneity; half-speed unit; low voltage
                 operation; microprocessor chips; microprocessor design;
                 Multiprocessing systems; near-threshold voltage;
                 parameter variation; power aware computing; power
                 consumption; Power demand; process variation; process
                 variation effect; Rails; supply voltage assignment;
                 Threshold voltage; transistor threshold voltage;
                 ultra-low voltage chip multiprocessors; within-core
                 variation",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Miller:2012:MEP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Li:2012:LSS,
  author =       "Yong Li and Rami Melhem and Alex K. Jones",
  title =        "Leveraging Sharing in Second Level
                 Translation-Lookaside Buffers for Chip
                 Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.35",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Traversing page table during virtual to physical
                 address translation causes significant pipeline stalls
                 when misses occur in the translation-lookaside buffer
                 (TLB). To mitigate this penalty, we propose a fast,
                 scalable, multi-level TLB organization that leverages
                 page sharing behaviors and performs efficient TLB entry
                 placement. Our proposed partial sharing TLB (PSTLB)
                 reduces TLB misses by around 60\%. PSTLB also improves
                 TLB performance by nearly 40\% compared to traditional
                 private TLBs and 17\% over the state of the art
                 scalable TLB proposal.",
  acknowledgement = ack-nhfb,
  affiliation =  "Li, Y (Reprint Author), Univ Pittsburgh, Dept Elect \&
                 Comp Engn, Pittsburgh, PA 15261 USA. Li, Yong, Univ
                 Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA
                 15261 USA.",
  author-email = "yol26@pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0702452]",
  funding-text = "This work is supported by NSF award CCF-0702452.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; buffer storage; chip
                 multiprocessor; CMPs; Fluids; microprocessor chips;
                 multilevel TLB organization; multiprocessing systems;
                 Oceans; page sharing behavior; Partial Sharing; partial
                 sharing TLB; Prefetching; private TLB; program
                 interpreters; Runtime; second level
                 translation-lookaside buffers; Tiles; TLB entry
                 placement; TLBs; Virtual private networks;
                 virtual-to-physical address translation",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Li:2012:LSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Delimitrou:2012:DDS,
  author =       "Christina Delimitrou and Sriram Sankar and Kushagra
                 Vaid and Christos Kozyrakis",
  title =        "Decoupling Datacenter Storage Studies from Access to
                 Large-Scale Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "53--56",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.37",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Suboptimal storage design has significant cost and
                 power impact in large-scale datacenters (DCs).
                 Performance, power and cost-optimized systems require
                 deep understanding of target workloads, and mechanisms
                 to effectively model different storage design choices.
                 Traditional benchmarking is invalid in cloud
                 data-stores, representative storage profiles are hard
                 to obtain, while replaying applications in different
                 storage configurations is impractical both in cost and
                 time. Despite these issues, current workload generators
                 are not able to reproduce key aspects of real
                 application patterns (e.g., spatial/temporal locality,
                 I/O intensity). In this paper, we propose a modeling
                 and generation framework for large-scale storage
                 applications. As part of this framework we use a state
                 diagram-based storage model, extend it to a
                 hierarchical representation, and implement a tool that
                 consistently recreates DC application I/O loads. We
                 present the principal features of the framework that
                 allow accurate modeling and generation of storage
                 workloads, and the validation process performed against
                 ten original DC application traces. Finally, we explore
                 two practical applications of this methodology: SSD
                 caching and defragmentation benefits on enterprise
                 storage. Since knowledge of the workload's spatial and
                 temporal locality is necessary to model these use
                 cases, our framework was instrumental in quantifying
                 their performance benefits. The proposed methodology
                 provides detailed understanding of the storage activity
                 of large-scale applications, and enables a wide
                 spectrum of storage studies, without the requirement to
                 access application code and full application
                 deployment.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delimitrou, C (Reprint Author), Stanford Univ,
                 Stanford, CA 94305 USA. Delimitrou, Christina;
                 Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
                 USA. Sankar, Sriram; Vaid, Kushagra, Microsoft Corp,
                 Seattle, WA USA.",
  author-email = "cdel@stanford.edu srsankar@microsoft.com
                 kvaid@microsoft.com kozyraki@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cloud data-store; Computational modeling; computer
                 centres; cost impact; datacenter storage; Electronic
                 mail; enterprise storage defragmentation; Generators;
                 large-scale datacenter; Load modeling; Mass storage;
                 Modeling of computer architecture; Modeling techniques;
                 power impact; SSD caching; state diagram-based storage
                 model; Storage area networks; storage design choice;
                 storage management; storage profile; storage workload;
                 suboptimal storage design; Super (very large)
                 computers; Throughput; Very large scale integration;
                 workload spatial locality; workload temporal locality",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Delimitrou:2012:DDS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Chen:2012:NPD,
  author =       "Jie Chen and Guru Venkataramani and Gabriel Parmer",
  title =        "The Need for Power Debugging in the Multi-Core
                 Environment",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "57--60",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Debugging an application for power has a wide array of
                 benefits ranging from minimizing the thermal hotspots
                 to reducing the likelihood of CPU malfunction. In this
                 work, we justify the need for power debugging, and show
                 that performance debugging of a parallel application
                 does not automatically guarantee power balance across
                 multiple cores. We perform experiments and show our
                 results using two case study benchmarks, Volrend from
                 Splash-2 and Bodytrack from Parsec-1.0.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, J (Reprint Author), George Washington Univ,
                 Washington, DC 20052 USA. Chen, Jie; Venkataramani,
                 Guru; Parmer, Gabriel, George Washington Univ,
                 Washington, DC 20052 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-1117243]",
  funding-text = "This material is based upon work supported in part by
                 the National Science Foundation under Grant No.
                 CCF-1117243.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Bodytrack; Debugging; Instruction
                 sets; Multi-cores; multicore environment; Multicore
                 processing; multiprocessing systems; parallel
                 application; parallel programming; Parsec-1.0;
                 performance debugging; power aware computing; power
                  balance; Power Debugging; power debugging; Power
                  demand; Power Imbalance; program
                 debugging; Splash-2; Volrend",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Chen:2012:NPD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Meza:2012:EES,
  author =       "Justin Meza and Jichuan Chang and HanBin Yoon and Onur
                 Mutlu and Parthasarathy Ranganathan",
  title =        "Enabling Efficient and Scalable Hybrid Memories Using
                 Fine-Granularity {DRAM} Cache Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "61--64",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hybrid main memories composed of DRAM as a cache to
                 scalable non-volatile memories such as phase-change
                 memory (PCM) can provide much larger storage capacity
                 than traditional main memories. A key challenge for
                 enabling high-performance and scalable hybrid memories,
                 though, is efficiently managing the metadata (e.g.,
                 tags) for data cached in DRAM at a fine granularity.
                 Based on the observation that storing metadata off-chip
                 in the same row as their data exploits DRAM row buffer
                 locality, this paper reduces the overhead of
                 fine-granularity DRAM caches by only caching the
                 metadata for recently accessed rows on-chip using a
                 small buffer. Leveraging the flexibility and efficiency
                 of such a fine-granularity DRAM cache, we also develop
                 an adaptive policy to choose the best granularity when
                 migrating data into DRAM. On a hybrid memory with a
                 512MB DRAM cache, our proposal using an 8KB on-chip
                 buffer can achieve within 6\% of the performance of,
                 and 18\% better energy efficiency than, a conventional
                 8MB SRAM metadata store, even when the energy overhead
                 due to large SRAM metadata storage is not considered.",
  acknowledgement = ack-nhfb,
  affiliation =  "Meza, J (Reprint Author), Carnegie Mellon Univ,
                 Pittsburgh, PA 15213 USA. Meza, Justin; Yoon, HanBin;
                 Mutlu, Onur, Carnegie Mellon Univ, Pittsburgh, PA 15213
                 USA. Chang, Jichuan; Ranganathan, Parthasarathy,
                 Hewlett Packard Labs, Palo Alto, CA USA.",
  author-email = "meza@cmu.edu jichuan.chang@hp.com hanbinyoon@cmu.edu
                 onur@cmu.edu partha.ranganathan@hp.com",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF CAREER [CCF-0953246]; NSF EAGER
                 [CCF-1147397]; Gigascale Systems Research Center",
  funding-text = "We thank the members of the SAFARI research group and
                 the anonymous reviewers for their comments and
                 suggestions. We gratefully acknowledge the support of
                 an NSF CAREER Award CCF-0953246, NSF EAGER Grant
                 CCF-1147397, and the Gigascale Systems Research Center.
                 Part of this work was done while Justin Meza and HanBin
                 Yoon were interns at Hewlett-Packard Labs.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Buffer storage; Cache memories; Cache
                 memory; cache storage; data migration; DRAM chips; DRAM
                 row buffer locality; dynamic random access memory;
                 fine-granularity DRAM cache management; hybrid main
                 memories; hybrid main memory; Indexes; Memory
                 management; meta data; metadata caching; metadata
                 management; metadata storage; non-volatile memories;
                 Phase change materials; phase-change memory; Random
                 access memory; scalable hybrid memory;
                 System-on-a-chip; tag storage",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "35",
  unique-id =    "Meza:2012:EES",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zidenberg:2012:MHS,
  author =       "Tsahee Zidenberg and Isaac Keslassy and Uri Weiser",
  title =        "{MultiAmdahl}: How Should {I} Divide My Heterogeneous
                 Chip?",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "65--68",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Future multiprocessor chips will integrate many
                 different units, each tailored to a specific
                 computation. When designing such a system, a chip
                 architect must decide how to distribute the available
                 limited system resources, such as area and power, among
                 all the computational units. In this paper, we
                 introduce MultiAmdahl, an analytical optimization
                 technique for resource sharing among heterogeneous
                 units. MultiAmdahl takes into account the workload, the
                 performance of each computational unit, and the total
                 available resource. The results obtained by MultiAmdahl
                 allow us, for example, to provide a closed-form
                 solution for an optimal asymmetric-offload chip, and to
                 analyze the impact of different design constraints on
                 an optimal chip architecture.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zidenberg, T (Reprint Author), Technion Israel Inst
                 Technol, EE Dept, Haifa, Israel. Zidenberg, Tsahee;
                 Keslassy, Isaac; Weiser, Uri, Technion Israel Inst
                 Technol, EE Dept, Haifa, Israel.",
  author-email = "tsahee@tx.technion.ac.il isaac@ee.technion.ac.il
                 weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "European Research Council [210389]; Intel
                 Heterogeneous Computing research grant",
  funding-text = "This work was partly supported by the European
                 Research Council Starting Grant No. 210389 and by the
                 Intel Heterogeneous Computing research grant.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "area resource; asymmetric-offload chip; Central
                 Processing Unit; Chip Multiprocessors; Computational
                 modeling; computational unit; Computer architecture;
                 design constraint; heterogeneous chip; heterogeneous
                 unit; Mathematical model; microprocessor chips;
                 Modeling of computer architecture; MultiAmdahl
                 analytical optimization technique; multiprocessing
                 systems; multiprocessor chip; optimal chip
                 architecture; Optimization; power resource; Program
                 processors; resource allocation; Resource management;
                 resource sharing",
  keywords-plus = "AMDAHLS LAW",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "12",
  unique-id =    "Zidenberg:2012:MHS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2012:BC,
  author =       "Anonymous",
  title =        "{[Back} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.38",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2012:BIC,
  author =       "Anonymous",
  title =        "{[Back} inside cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.37",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2012:FIC,
  author =       "Anonymous",
  title =        "{[Front} inside cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.36",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Skadron:2013:INE,
  author =       "Kevin Skadron",
  title =        "Introducing the New {Editor-in-Chief} of the
                 {{\booktitle{IEEE Computer Architecture Letters}}}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "1--1",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The out-going Editor-in-Chief introduces Jose F.
                 Mart{\'\i}nez as the new Editor-in-Chief (EIC) of the
                 IEEE Computer Architecture Letters (CAL). A brief
                 professional biography is included. In addition, it is
                 noted that CAL aims to provide fast-turnaround for
                 early work with outstanding promise. The majority of
                 decisions are returned within one month, nearly all
                 within six weeks, and all decisions are rendered within
                 two months. The overall acceptance rate has
                 consistently run at about 25\%. Many papers first
                 published in CAL go on to become full papers in premier
                 conferences and journals, and CAL's impact factor
                 continues to increase. CAL has been a valuable addition
                 to the publishing landscape in computer architecture
                 and under Prof. Martinez's leadership, we can look
                 forward to even greater impact in the future. I would
                 like to take this opportunity to thank all of the CAL
                 Associate Editors, authors, readers, and reviewers for
                 their great help and support.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Skadron:2013:INE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2013:AI,
  author =       "Anonymous",
  title =        "2012 Annual Index",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This index covers all technical items - papers,
                 correspondence, reviews, etc. - that appeared in this
                 periodical during the year, and items from previous
                 years that were commented upon or corrected in this
                 year. Departments and other items may also be covered
                 if they have been judged to have archival value. The
                 Author Index contains the primary entry for each item,
                 listed under the first author's name. The primary entry
                 includes the co-authors' names, the title of the paper
                 or other item, and its location, specified by the
                 publication abbreviation, year, month, and inclusive
                 pagination. The Subject Index contains entries
                 describing the item under all appropriate subject
                 headings, plus the first author's name, the publication
                 abbreviation, month, and year, and inclusive pages.
                 Note that the item title is found only under the
                 primary entry in the Author Index.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

@Article{Eeckhout:2013:MNE,
  author =       "Lieven Eeckhout",
  title =        "A Message from the New {Editor-in-Chief} and
                 Introduction of New {Associate Editors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "2--2",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  affiliation =  "Eeckhout, L (Reprint Author), Univ Ghent, B-9000
                 Ghent, Belgium.",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Eeckhout:2013:MNE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Martinez:2013:MNE,
  author =       "J. Martinez",
  title =        "A Message from the New {Editor-in-Chief} and
                 Introduction of New {Associate} Editors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "2--4",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The incoming Editor-in-Chief states that his goal
                 during his tenure with IEEE Computer Architecture
                 Letters (CAL) will be to further increase its
                 visibility in our research community, and to attract
                 more submissions from computer architecture leaders.
                 The {``Best} of {CAL''} session at HPCA, which has taken
                 place for the last couple of years, is a good step in
                 this direction. He is also committed to continue
                 improving the coordination with authors and conference
                 program chairs, and to consolidate CAL's unique place
                 in the publication pipeline as the prime venue for
                 quick dissemination of high-quality novel ideas and
                 early results.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Tavakkol:2013:NSS,
  author =       "Arash Tavakkol and Mohammad Arjomand and Hamid
                 Sarbazi-Azad",
  title =        "{Network-on-SSD}: a Scalable and High-Performance
                 Communication Design Paradigm for {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In recent years, flash memory solid state disks (SSDs)
                 have shown a great potential to change storage
                 infrastructure because of its advantages of high speed
                 and high throughput random access. This promising
                 storage, however, greatly suffers from performance loss
                 because of frequent ``erase-before-write'' and
                 ``garbage collection'' operations. Thus, novel
                 circuit-level, architectural, and algorithmic
                 techniques are currently explored to address these
                 limitations. In parallel with others, current study
                 investigates replacing shared buses in multi-channel
                 architecture of SSDs with an interconnection network to
                 achieve scalable, high throughput, and reliable SSD
                 storage systems. Roughly speaking, such a communication
                 scheme provides superior parallelism that allows us to
                 compensate the main part of the performance loss
                 related to the aforementioned limitations through
                 increasing data storage and retrieval processing
                 throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tavakkol, A (Reprint Author), Sharif Univ Technol,
                 Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol,
                 Arash; Arjomand, Mohammad; Sarbazi-Azad, Hamid, Sharif
                 Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran.
                 Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
                 Comp Sci, Tehran, Iran.",
  author-email = "tavakkol@ce.sharif.edu arjomand@ce.sharif.edu
                 azad@sharif.edu",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "algorithmic technique; architectural technique;
                 Bandwidth; Buffer storage; circuit-level technique;
                 Complexity theory; Data storage systems; data storage
                 throughput; flash memories; Flash memory; flash memory
                 solid state disks; frequent erase-before-write
                 operations; garbage collection operations; high speed
                 random access; high throughput random access;
                 high-performance communication design paradigm;
                 integrated circuit design; integrated circuit
                 reliability; Inter-package parallelism; interconnection
                 network; Interconnection network; interconnection
                 network; Interconnections (Subsystems); Mass storage;
                 memory architecture; multichannel architecture;
                 multiprocessor interconnection networks;
                 network-on-chip; network-on-SSD; parallel memories;
                 Parallel processing; parallel storage; performance
                 evaluation; performance loss; retrieval processing
                 throughput; scalable communication design paradigm;
                 Solid state disk; SSD storage system reliability;
                 storage infrastructure; storage management; system
                 buses; Throughput",
  keywords-plus = "MEMORY",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Tavakkol:2013:NSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Sun:2013:NWC,
  author =       "Guang Sun and Chia-Wei Chang and Bill Lin",
  title =        "A New Worst-Case Throughput Bound for Oblivious
                 Routing in Odd Radix Mesh Network",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "1/2 network capacity is often believed to be the limit
                 of worst-case throughput for mesh networks. However,
                 this letter provides a new worst-case throughput bound,
                 which is higher than 1/2 network capacity, for odd
                 radix two-dimensional mesh networks. In addition, we
                 propose a routing algorithm called U2TURN that can
                 achieve this worst-case throughput bound. U2TURN
                 considers all routing paths with at most 2 turns and
                 distributes the traffic loads uniformly in both X and Y
                 dimensions. Theoretical analysis and simulation results
                 show that U2TURN outperforms existing routing
                 algorithms in worst-case throughput. Moreover, U2TURN
                 achieves good average-throughput at the expense of
                 approximately 1.5x minimal average hop count.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sun, G (Reprint Author), Tsinghua Univ, Beijing,
                 Peoples R China. Sun, Guang, Tsinghua Univ, Beijing,
                 Peoples R China. Chang, Chia-Wei; Lin, Bill, Univ Calif
                 San Diego, San Diego, CA 92103 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; average-case
                 throughput; Computer architecture; Interconnection
                 architectures; mesh; Mesh networks; network capacity;
                 network-on-chip; Networks-on-Chip (NoC); oblivious
                 routing; odd radix mesh network; odd radix
                 two-dimensional mesh network; On-chip interconnection
                 networks; Parallel algorithms; Routing; routing;
                 Routing; Routing protocols; Throughput; traffic load;
                 U2TURN; Worst-case analysis; worst-case throughput;
                 worst-case throughput bound",
  number-of-cited-references = "10",
  ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
  research-areas = "Computer Science",
  researcherid-numbers = "Lin, Binshan/A-9772-2009",
  times-cited =  "1",
  unique-id =    "Sun:2013:NWC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Karsli:2013:EDT,
  author =       "I. Burak Karsli and Pedro Reviriego and M. Fatih Balli
                 and O{\u{g}}uz Ergin and J. A. Maestro",
  title =        "Enhanced Duplication: a Technique to Correct Soft
                 Errors in Narrow Values",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Soft errors are transient errors that can alter the
                 logic value of a register bit causing data corruption.
                 They can be caused by radiation particles such as
                 neutrons or alpha particles. Narrow values are commonly
                 found in the data consumed or produced by processors.
                 Several techniques have recently been proposed to
                 exploit the unused bits in narrow values to protect
                 them against soft errors. These techniques replicate
                 the narrow value over the unused register bits such
                 that errors can be detected when the value is
                 duplicated and corrected when the value is tripled. In
                 this letter, a technique that can correct errors when
                 the narrow value is only duplicated is presented. The
                 proposed approach stores a modified duplicate of the
                 narrow value such that errors on the original value and
                 the duplicate can be distinguished and therefore
                 corrected. The scheme has been implemented at the
                 circuit level to evaluate its speed and also at the
                 architectural level to assess the benefits in
                 correcting soft errors. The results show that the
                 scheme is significantly faster than a parity check and
                 can improve substantially the number of soft errors
                 that are corrected compared to existing techniques.",
  acknowledgement = ack-nhfb,
  affiliation =  "Karsli, IB (Reprint Author), TOBB Univ Econ \&
                 Technol, Ankara, Turkey. Karsli, I. Burak; Balli, M.
                 Fatih; Ergin, O{\u{g}}uz, TOBB Univ Econ \& Technol,
                 Ankara, Turkey. Reviriego, Pedro; Maestro, J. A., Univ
                 Antonio de Nebrija, Madrid, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Science and Education
                 [AYA2009-13300-C03]; Scientific and Technological
                 Research Council of Turkey (TUBITAK) [112E004]",
  funding-text = "This work was supported in part by the Spanish
                 Ministry of Science and Education under Grant
                 AYA2009-13300-C03 and by the Scientific and
                 Technological Research Council of Turkey (TUBITAK)
                 under Grant 112E004. The work is a collaboration in the
                 framework of COST ICT Action 1103 ``Manufacturable and
                 Dependable Multicore Architectures at Nanoscale''.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "alpha particles; architectural level; Benchmark
                 testing; computer architecture; Data Cache; data
                 corruption; Data processing; enhanced duplication;
                 Error correction; Error Correction; Error correction;
                 Error-checking; Logic gates; logic value;
                 microprocessor chips; narrow values; Narrow Values;
                 narrow values; neutrons; Parity check codes;
                 processors; Program processors; radiation hardening
                 (electronics); radiation particles; Redundant design;
                 register bit; Registers; soft errors; Soft Errors; soft
                 errors",
  number-of-cited-references = "11",
  ORCID-numbers = "Sousa, Leonel/0000-0002-8066-221X Ergin,
                 O{\u{g}}uz/0000-0003-2701-3787 Maestro, Juan
                 Antonio/0000-0001-7133-9026 Reviriego,
                 Pedro/0000-0001-6805-6519",
  research-areas = "Computer Science",
  researcherid-numbers = "Sousa, Leonel/B-2749-2009 Ergin,
                 O{\u{g}}uz/E-5717-2010 Maestro, Juan
                 Antonio/L-6091-2014 Reviriego, Pedro/B-8353-2009",
  times-cited =  "2",
  unique-id =    "Karsli:2013:EDT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lyons:2013:SFF,
  author =       "Michael Lyons and Gu-Yeon Wei and David Brooks",
  title =        "{Shrink-Fit}: a Framework for Flexible Accelerator
                 Sizing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "RTL design complexity discouraged adoption of
                 reconfigurable logic in general purpose systems,
                 impeding opportunities for performance and energy
                 improvements. Recent improvements to HLS compilers
                 simplify RTL design and are easing this barrier. A new
                 challenge will emerge: managing reconfigurable
                 resources between multiple applications with custom
                 hardware designs. In this paper, we propose a method to
                 ``shrink-fit'' accelerators within widely varying fabric
                 budgets. Shrink-fit automatically shrinks existing
                 accelerator designs within small fabric budgets and
                 grows designs to increase performance when larger
                 budgets are available. Our method takes advantage of
                 current accelerator design techniques and introduces a
                 novel architectural approach based on fine-grained
                 virtualization. We evaluate shrink-fit using a
                 synthesized implementation of an IDCT for decoding
                 JPEGs and show the IDCT accelerator can shrink by a
                 factor of 16x with minimal performance and area
                 overheads. Using shrink-fit, application designers can
                 achieve the benefits of hardware acceleration with
                 single RTL designs on FPGAs large and small.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lyons, M (Reprint Author), Harvard Univ, Sch Engn \&
                 Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael; Wei,
                 Gu-Yeon; Brooks, David, Harvard Univ, Sch Engn \& Appl
                 Sci, Cambridge, MA 02138 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accelerators; computational complexity; Computer
                 applications; custom hardware design; Decoding;
                 discrete cosine transforms; fabric budget; field
                 programmable gate arrays; Field programmable gate
                 arrays; fine grained virtualization; flexible
                 accelerator sizing; FPGA; general purpose computers;
                 general purpose system; hardware acceleration;
                 Heterogeneous (hybrid) systems; HLS compiler; IDCT
                 accelerator; inverse transforms; JPEG decoding; program
                 compilers; Program processors; reconfigurable
                 architectural approach; reconfigurable architectures;
                 Reconfigurable hardware; reconfigurable logic;
                 reconfigurable resource management; RTL design
                 complexity; Runtime; shrink fit accelerator;
                 Special-Purpose and Application-Based Systems; temporal
                 logic; virtual machines; virtualisation",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Lyons:2013:SFF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Duong:2013:CAS,
  author =       "Nam Duong and Alexander V. Veidenbaum",
  title =        "Compiler-Assisted, Selective Out-Of-Order Commit",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes an out-of-order instruction commit
                 mechanism using a novel compiler/architecture
                 interface. The compiler creates instruction ``blocks''
                 guaranteeing some commit conditions and the processor
                 uses the block information to commit certain
                 instructions out of order. Micro-architectural support
                 for the new commit mode is made on top of the standard,
                 ROB-based processor and includes out-of-order
                 instruction commit with register and load queue entry
                 release. The commit mode may be switched multiple times
                 during execution. Initial results for a 4-wide
                 processor show that, on average, 52\% instructions are
                 committed out of order resulting in 10\% to 26\%
                 speedups over in-order commit, with minimal hardware
                 overhead. The performance improvement is a result of an
                 effectively larger instruction window that allows more
                 cache misses to be overlapped for both L1 and L2
                 caches.",
  acknowledgement = ack-nhfb,
  affiliation =  "Duong, N (Reprint Author), Univ Calif Irvine, Dept
                 Comp Sci, Irvine, CA 92717 USA. Duong, Nam; Veidenbaum,
                 Alexander V., Univ Calif Irvine, Dept Comp Sci, Irvine,
                 CA 92717 USA.",
  author-email = "nlduong@ics.uci.edu alexv@ics.uci.edu",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecture/compiler co-design; Benchmark testing;
                 block information; cache misses; cache storage; Cache
                 storage; cache storage; Cache storage; commit
                 conditions; compiler-architecture interface;
                 compiler-assisted selective out-of-order commit;
                 computer architecture; Computer architecture; computer
                 architecture; dynamically-scheduled and
                 statically-scheduled implementation; Hardware/software
                 interfaces; instruction blocks; instruction sets; L1
                 cache; L2 cache; load queue entry release;
                 microarchitectural support; minimal hardware overhead;
                 Out of order instruction; Out-of-order commit;
                 out-of-order instruction commit mechanism; overlapping
                 cache misses; performance evaluation; performance
                 improvement; Pipeline implementation; Pipeline
                 processors; program compilers; Program processors;
                 register; resource release; RISC/CISC; ROB-based
                 processor; Superscalar; VLIW architectures; Von Neumann
                 architectures",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Duong:2013:CAS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Nilakantan:2013:MES,
  author =       "Siddharth Nilakantan and Steven Battle and Mark
                 Hempstead",
  title =        "Metrics for Early-Stage Modeling of Many-Accelerator
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The term ``Dark Silicon'' has been coined to describe
                 the threat to microprocessor performance caused by
                 increasing transistor power density. Improving energy
                 efficiency is now the primary design goal for all
                 market segments of microprocessors from mobile to
                 server. Specialized hardware accelerators, designed to
                 run only a subset of workloads with orders of magnitude
                 energy efficiency improvement, are seen as a potential
                 solution. Selecting an ensemble of accelerators to best
                 cover the workloads run on a platform remains a
                 challenge. We propose metrics for accelerator selection
                 derived from a detailed communication-aware performance
                 model and present an automated methodology to populate
                 this model. Employing a combination of characterized
                 RTL and our selection metrics, we evaluate a set of
                 accelerators for a sample application and compare
                 performance to selections based on execution time and
                 Pollack's rule. We find that the architecture selected
                 by our communication-aware metric shows improved
                 performance over architectures selected based on
                 execution time and Pollack's rule, as they do not
                 account for speedup being limited by communication.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nilakantan, S (Reprint Author), Drexel Univ, Dept
                 Elect \& Comp Engn, Philadelphia, PA 19104 USA.
                 Nilakantan, Siddharth; Battle, Steven; Hempstead, Mark,
                 Drexel Univ, Dept Elect \& Comp Engn, Philadelphia, PA
                 19104 USA.",
  author-email = "sn446@drexel.edu sjb328@drexel.edu mdh77@drexel.edu",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accelerators; Code Profiling; communication-aware
                 performance model; Computer architecture; computer
                 architecture; Computer Systems Organization; dark
                 silicon; General; hardware accelerators; Heterogeneous
                 (hybrid) systems; Heterogeneous Architectures;
                 magnitude energy efficiency improvement;
                 many-accelerator architectures; microprocessor;
                 microprocessor chips; Modeling; Modeling of computer
                 architecture; modelling; Multiprocessing systems; Other
                 Architecture Styles; performance evaluation; Pollack
                 rule; Processor Architectures; Program processors; RTL;
                 transistor power density; transistors",
  number-of-cited-references = "16",
  ORCID-numbers = "Nilakantan, Siddharth/0000-0003-1067-700X",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Nilakantan:2013:MES",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Delimitrou:2013:NCD,
  author =       "Christina Delimitrou and Christos Kozyrakis",
  title =        "The {Netflix} Challenge: Datacenter Edition",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The hundreds of thousands of servers in modern
                 warehouse scale systems make performance and efficiency
                 optimizations pressing design challenges. These systems
                 are traditionally considered homogeneous. However, that
                 is not typically the case. Multiple server generations
                 compose a heterogeneous environment, whose performance
                 opportunities have not been fully explored since
                 techniques that account for platform heterogeneity
                 typically do not scale to the tens of thousands of
                 applications hosted in large-scale cloud providers. We
                 present ADSM, a scalable and efficient recommendation
                 system for application-to-server mapping in large-scale
                 datacenters (DCs) that is QoS-aware. ADSM overcomes the
                 drawbacks of previous techniques, by leveraging robust
                 and computationally efficient analytical methods to
                 scale to tens of thousands of applications with minimal
                 overheads. It is also QoS-aware, mapping applications
                 to platforms while enforcing strict QoS guarantees.
                 ADSM is derived from validated analytical models, has
                 low and bounded prediction errors, is simple to
                 implement and scales to thousands of applications
                 without significant changes to the system. Over 390
                 real DC workloads, ADSM improves performance by 16\% on
                 average and up to 2.5x and efficiency by 22\% in a DC
                 with 10 different server configurations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delimitrou, C (Reprint Author), Stanford Univ,
                 Stanford, CA 94305 USA. Delimitrou, Christina;
                 Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
                 USA.",
  author-email = "cdel@stanford.edu kozyraki@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ADSM; application mapping; Application studies
                 resulting in better multiple-processor systems;
                 application-to-server mapping; Computer architecture;
                 computer centres; Computer System Implementation;
                 Computer Systems Organization; Data centers;
                 datacenter; design challenge; Design studies;
                 evaluation; Heterogeneous (hybrid) systems; Large and
                 Medium ( Mainframe ) Computers; Large-scale systems;
                 Measurement; modeling; Multiprocessing systems; Netflix
                 challenge; Other Architecture Styles; Parallel
                 Architectures; Performance of Systems; Processor
                 Architectures; QoS-aware; quality of service;
                 Scheduling; Scheduling and task partitioning; server
                 generation; simulation of multiple-processor systems;
                 Special-Purpose and Application-Based Systems; Super
                 (very large) computers; warehouse-scale system",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Delimitrou:2013:NCD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2013:RL,
  author =       "Anonymous",
  title =        "2012 reviewers list",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "33--34",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The publication offers a note of thanks and lists its
                 reviewers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "IEEE publishing",
}

@Article{Anonymous:2013:IOAa,
  author =       "Anonymous",
  title =        "{IEEE} Open Access Publishing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "35--35",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement: This publication offers open access
                 options for authors. IEEE open access publishing.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2013:ITN,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Transactions}}} Newsletter",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "36--36",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement: Stay connected with the IEEE Computer
                 Society Transactions by signing up for our new
                 Transactions Connection newsletter. It is free and
                 contains valuable information.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Martinez:2013:E,
  author =       "J. F. Martinez",
  title =        "Editorial",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "37--38",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.32",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jian:2013:HPE,
  author =       "Xun Jian and John Sartori and Henry Duwe and Rakesh
                 Kumar",
  title =        "High Performance, Energy Efficient Chipkill Correct
                 Memory with Multidimensional Parity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "39--42",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "It is well-known that a significant fraction of server
                 power is consumed in memory; this is especially the
                 case for servers with chipkill correct memories. We
                 propose a new chipkill correct memory organization that
                 decouples correction of errors due to local faults that
                 affect a single symbol in a word from correction of
                 errors due to device-level faults that affect an entire
                 column, sub-bank, or device. By using a combination of
                 two codes that separately target these two fault modes,
                 the proposed chipkill correct organization reduces code
                 overhead by half as compared to conventional chipkill
                 correct memories for the same rank size. Alternatively,
                 this allows the rank size to be reduced by half while
                 maintaining roughly the same total code overhead.
                 Simulations using PARSEC and SPEC benchmarks show that,
                 compared to a conventional double chipkill correct
                 baseline, the proposed memory organization, by
                 providing double chipkill correct at half the rank
                 size, reduces power by up to 41\%, 32\% on average over
                 a conventional baseline with the same chipkill correct
                 strength and access granularity that relies on linear
                 block codes alone, at only 1\% additional code
                 overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jian, X (Reprint Author), Univ Illinois, Urbana, IL
                 USA. Jian, Xun; Sartori, John; Duwe, Henry; Kumar,
                 Rakesh, Univ Illinois, Urbana, IL USA.",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "block codes; chipkill correct; chipkill correct memory
                 organization; code overhead reduction; Computer
                 architecture; device level fault; DRAM; DRAM chips;
                 error correction; error correction codes; fault mode;
                 fault tolerant computing; granular computing;
                 granularity access; linear block code; linear codes;
                 low power; Low power electronics; PARSEC; Random access
                 memory; rank size; reliable memory; server power
                 consumption; Servers; SPEC; storage management",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Jian:2013:HPE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Maddah:2013:DDS,
  author =       "Rakan Maddah and Sangyeun Cho and Rami Melhem",
  title =        "Data Dependent Sparing to Manage Better-Than-Bad
                 Blocks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "43--46",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We forecast that proper handling of unreliable storage
                 blocks (e.g., ``bad block management'' in solid-state
                 drives) will remain critical for future systems built
                 with advanced and emerging memory technologies. This
                 paper argues that the conventional block retirement and
                 sparing approach --- a block is retired as soon as it
                 shows faulty behavior --- is overly conservative and
                 inefficient. We observe that it is highly unlikely that
                 all faulty bits in a storage block manifest errors.
                 Consequently, we propose data dependent sparing, a
                 relaxed block retirement and sparing approach that
                 recycles faulty storage blocks. At small management
                 cost and with less than 1\% sparing, data dependent
                 sparing achieves the same lifetime as the conventional
                 approach with 20\% sparing.",
  acknowledgement = ack-nhfb,
  affiliation =  "Maddah, R (Reprint Author), Univ Pittsburgh, Dept Comp
                 Sci, Pittsburgh, PA 15260 USA. Maddah, Rakan; Cho,
                 Sangyeun; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci,
                 Pittsburgh, PA 15260 USA.",
  author-email = "rmaddah@cs.pitt.edu cho@cs.pitt.edu
                 melhem@cs.pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-1064976, CCF-1059283,
                 CNS-1012070]",
  funding-text = "This work is supported in part by NSF grants
                 CCF-1064976, CCF-1059283, and CNS-1012070.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "better-than-bad block management; data dependent
                 sparing; data dependent sparing approach; Data storage
                 systems; fault tolerant computing; faulty bits; faulty
                 storage blocks; flash memory; Flash memory; flash
                 memory; management cost; memory technologies; phase
                 change memories; phase-change memory; phase-change
                 memory (PCM); relaxed block retirement approach;
                 solid-state drive; solid-state drive (SSD); Solid-state
                 drives; solid-state drives; Sparing; sparing; storage
                 block; storage management; stuck-at faults; unreliable
                 storage block handling",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Maddah:2013:DDS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2013:CFC,
  author =       "Hanjoon Kim and Yonggon Kim and John Kim",
  title =        "Clumsy Flow Control for High-Throughput Bufferless
                 On-Chip Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "47--50",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Bufferless on-chip networks are an alternative type of
                 on-chip network organization that can improve the
                 cost-efficiency of an on-chip network by removing
                 router input buffers. However, bufferless on-chip
                 network performance degrades at high load because of
                 the increased network contention and large number of
                 deflected packets. The energy benefit of bufferless
                 network is also reduced because of the increased
                 deflection. In this work, we propose a novel flow
                 control for bufferless on-chip networks in
                 high-throughput manycore accelerator architectures to
                 reduce the impact of deflection routing. By using a
                 clumsy flow control (CFC), instead of the per-hop flow
                 control that is commonly used in buffered on-chip
                 networks, we are able to reduce the amount of
                 deflection by up to 92\% on high-throughput workloads.
                 As a result, on average, CFC can approximately match
                 the performance of a baseline buffered router while
                 reducing the energy consumption by approximately
                 39\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, H (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon, South Korea. Kim,
                 Hanjoon; Kim, Yonggon; Kim, John, Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon, South Korea.",
  author-email = "hanj@kaist.ac.kr ilios@kaist.ac.kr jjk12@kaist.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "MKE, Korea, under the ITRC
                 [NIPA-2012-H0301-12-1011]; BST program through the NRF
                 of Korea; MEST [2012-0003579]",
  funding-text = "This research was supported in part by the MKE, Korea,
                 under the ITRC support program supervised by the NIPA
                 (NIPA-2012-H0301-12-1011) and in part by BST program
                 through the NRF of Korea funded by the
                  MEST (2012-0003579).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bufferless NoC; bufferless router; CFC; clumsy flow
                 control; computer architecture; Computer architecture;
                 Computer Systems Organization; cost-efficiency
                 improvement; Data processing; deflection routing;
                 deflection routing impact reduction; energy benefit;
                 energy consumption reduction; flow control;
                 high-throughput bufferless on-chip networks;
                 high-throughput manycore accelerator architectures;
                 high-throughput workloads; Interconnection
                 architectures; microprocessor chips; Multiple Data
                 Stream Architectures (Multiprocessors); Multiprocessing
                 systems; network contention; network routing;
                 network-on-chip; On-chip interconnection networks;
                 on-chip network organization; on-chip networks;
                 Parallel architectures; Parallel Architectures;
                 performance evaluation; Processor Architectures; router
                 input buffer removal; System-on-chip",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  researcherid-numbers = "Kim, John/C-1792-2011",
  times-cited =  "7",
  unique-id =    "Kim:2013:CFC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kai:2013:GRP,
  author =       "Yi Kai and Yi Wang and Bin Liu",
  title =        "{GreenRouter}: Reducing Power by Innovating Router's
                 Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "51--54",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "High speed routers in Internet are becoming more
                 powerful, as well as more energy hungry. In this paper,
                 we present a new architecture of router named
                 GreenRouter which separates a line-card into two parts:
                 network interface card (DB) and packet processing card
                 (MB), connected by a two-stage switch fabric in traffic
                 flows' ingress and egress direction respectively.
                 Traffic from all DBs shares all the MBs in GreenRouter,
                 thus can be aggregated to a few active MBs on demand
                 and other MBs can be shut down to save power. Several
                 key issues to this new architecture are addressed. We
                 evaluate the power saving efficiency and give
                 preliminary simulation results. GreenRouter can well
                  adapt to the traffic fluctuation, and real trace
                  evaluations over one week show that up to 63.7\% power
                 saving can be achieved while QoS constraints are
                 guaranteed.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, B (Reprint Author), Tsinghua Univ, Dept Comp Sci
                 \& Technol, Beijing 100084, Peoples R China. Kai, Yi;
                 Wang, Yi; Liu, Bin, Tsinghua Univ, Dept Comp Sci \&
                 Technol, Beijing 100084, Peoples R China.",
  author-email = "kaiyi02@gmail.com pig020623@gmail.com
                 lmyujie@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSFC [61073171]; Tsinghua University
                 Initiative Scientific Research Program [20121080068];
                 Specialized Research Fund for the Doctoral Program of
                 Higher Education of China [20100002110051]",
  funding-text = "This work is supported by NSFC (61073171), Tsinghua
                 University Initiative Scientific Research Program
                 (20121080068), Specialized Research Fund for the
                 Doctoral Program of Higher Education of China
                 (20100002110051).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; DB; Energy efficiency;
                 energy-aware system; green computing; Green design;
                 GreenRouter; High-speed networks; Internet; line-card;
                 low power design; MB; network interface card; packet
                 processing card; power reduction; power saving
                 efficiency; QoS constraints; router; router
                 architecture innovation; Routers; telecommunication
                 network routing; Telecommunication traffic;
                 telecommunication traffic; traffic flow egress
                 direction; traffic flow ingress direction; traffic
                 fluctuation; two-stage switch fabric",
  number-of-cited-references = "6",
  ORCID-numbers = "Wang, Yi/0000-0002-9095-6879",
  research-areas = "Computer Science",
  researcherid-numbers = "Wang, Yi/A-8884-2015",
  times-cited =  "1",
  unique-id =    "Kai:2013:GRP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Joo:2013:HPS,
  author =       "Yongsoo Joo and Sangsoo Park",
  title =        "A Hybrid {PRAM} and {STT--RAM} Cache Architecture for
                 Extending the Lifetime of {PRAM} Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "55--58",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "To extend the lifetime of phase change RAM (PRAM)
                 caches, we propose a hybrid cache architecture that
                 integrates a relatively small capacity of spin transfer
                 torque RAM (STT--RAM) write buffer with a PRAM cache.
                 Our hybrid cache improves the endurance limitation of
                 the PRAM cache by judiciously redirecting the write
                 traffic from an upper memory layer to the STT--RAM
                 write buffer. We have demonstrated through simulation
                 that the proposed hybrid cache outperforms existing
                 write-traffic reduction schemes with the same area
                 overhead. Moreover, our approach is orthogonal to the
                 existing schemes, providing an effective way of
                 investing die area for cache lifetime extension by
                 being used in combination with them.",
  acknowledgement = ack-nhfb,
  affiliation =  "Joo, Y (Reprint Author), Ewha Womans Univ, Dept Comp
                 Sci \& Engn, Seoul 120750, South Korea. Joo, Yongsoo;
                 Park, Sangsoo, Ewha Womans Univ, Dept Comp Sci \& Engn,
                 Seoul 120750, South Korea.",
  author-email = "ysjoo@ewha.ac.kr sangsoo.park@ewha.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Ewha Womans University",
  funding-text = "We thank Guangyu Sun and Cong Xu for their helpful
                 comments on NVRAM characteristics. This research was
                 supported by RP-Grant 2010 of Ewha Womans University.
                 Sangsoo Park is the corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache; cache lifetime extension; Cache memories; Cache
                 storage; cache storage; Computer architecture;
                 concurrency theory; Design Styles; endurance; Fault
                 tolerance; Hardware; hybrid cache architecture; hybrid
                 PRAM caches; investing die area; lifetime; memory
                 layer; Memory Structures; phase change memories; phase
                 change RAM; PRAM; Random access memory; Redundancy;
                 Redundant design; Reliability; spin transfer torque
                 RAM; STT RAM cache architecture; STT RAM write buffer;
                 STT--RAM; Testing and Fault-Tolerance; write traffic
                 reduction schemes",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "10",
  unique-id =    "Joo:2013:HPS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Blem:2013:MMA,
  author =       "Emily Blem and Hadi Esmaeilzadeh and Renee {St Amant}
                 and Karthikeyan Sankaralingam and Doug Burger",
  title =        "Multicore Model from Abstract Single Core Inputs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "59--62",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes a first order multicore model to
                 project a tighter upper bound on performance than
                 previous Amdahl's Law based approaches. The speedup
                 over a known baseline is a function of the core
                 performance, microarchitectural features, application
                 parameters, chip organization, and multicore topology.
                 The model is flexible enough to consider both CPU and
                 GPU like organizations as well as modern topologies
                 from symmetric to aggressive heterogeneous (asymmetric,
                 dynamic, and fused) designs. This extended model
                  incorporates first order effects --- exposing more
                  bottlenecks than previous applications of Amdahl's
                  Law --- while remaining simple and flexible enough to be
                 adapted for many applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Blem, E (Reprint Author), Univ Wisconsin, Madison, WI
                 53706 USA. Blem, Emily; Sankaralingam, Karthikeyan,
                 Univ Wisconsin, Madison, WI 53706 USA. Esmaeilzadeh,
                 Hadi, Univ Washington, Seattle, WA 98195 USA. St Amant,
                 Renee, Univ Texas Austin, Austin, TX 78712 USA.",
  author-email = "blem@cs.wisc.edu hadianeh@cs.washington.edu
                 stamant@cs.utexas.edu karu@cs.wisc.edu
                 dburger@microsoft.com",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstract single core inputs; aggressive heterogeneous
                 designs; Amdahl law based approach; application
                 parameters; chip organization; Computer Systems
                 Organization; CPU like organizations; first order
                 multicore model; General; GPU like organizations;
                 graphics processing units; microarchitectural features;
                 Modeling of computer architecture; multicore topology;
                 multicores; Multiple Data Stream Architectures
                 (Multiprocessors); multiprocessing systems; network
                 topology; parallelism; performance evaluation;
                 Performance modeling; Processor Architectures",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Blem:2013:MMA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (12(2):63--66, Jul/Dec 2013): shows the weighted speedup and
%%% harmonic mean of speedups are inconsistent multicore throughput
%%% metrics, while the harmonic mean of IPCs is consistent.
@Article{Michaud:2013:DMT,
  author =       "Pierre Michaud",
  title =        "Demystifying Multicore Throughput Metrics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "63--66",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Several different metrics have been proposed for
                 quantifying the throughput of multicore processors.
                 There is no clear consensus about which metric should
                 be used. Some studies even use several throughput
                 metrics. We show that there exists a relation between
                 single-thread average performance metrics and
                 throughput metrics, and that throughput metrics inherit
                 the meaning or lack of meaning of the corresponding
                 single-thread metric. We show that two popular
                 throughput metrics, the weighted speedup and the
                 harmonic mean of speedups, are inconsistent: they do
                 not give equal importance to all benchmarks. Moreover
                 we demonstrate that the weighted speedup favors
                 unfairness. We show that the harmonic mean of IPCs, a
                 seldom used throughput metric, is actually consistent
                 and has a physical meaning. We explain under which
                 conditions the arithmetic mean or the harmonic mean of
                 IPCs can be used as a strong indicator of throughput
                 increase.",
  acknowledgement = ack-nhfb,
  affiliation =  "Michaud, P (Reprint Author), INRIA Rennes, Rennes,
                 France. INRIA Rennes, Rennes, France.",
  author-email = "Pierre.Michaud@inria.fr",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Computer Systems Organization;
                 evaluation; Measurement; Modeling; modeling;
                 Multi-core/single-chip multiprocessors; Multicore
                 processing; multicore processors; multicore throughput;
                 multicore throughput metrics; multiprocessing systems;
                 Parallel Architectures; Parallel architectures;
                 Performance evaluation; performance metric; Performance
                 of Systems; Processor Architectures; Program
                 processors; simulation of multiple-processor systems;
                 single thread metric; software metrics",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Michaud:2013:DMT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (12(2):67--70, Jul/Dec 2013): software mechanisms that vary
%%% per-core SMT thread counts to shift power within a power cap.
@Article{Tembey:2013:SSS,
  author =       "Priyanka Tembey and Augusto Vega and Alper
                 Buyuktosunoglu and Dilma {Da Silva} and Pradip Bose",
  title =        "{SMT} Switch: Software Mechanisms for Power Shifting",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "67--70",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Simultaneous multithreading (SMT) as a processor
                 design to achieve higher levels of system and
                 application throughput is a well-accepted and deployed
                 technique in most desktop and server processors. We
                 study the power implications of varying SMT levels
                 i.e., thread counts per core for various multi-threaded
                 applications on a real SMT multicore platform, and
                 introduce a novel software mechanism of changing SMT
                 level of a core to tune platform power. Power-shifting
                 policies by varying per core SMT levels for performance
                 benefits within a power cap are introduced. Projected
                 power savings (of 15\%) for a streaming parallel
                 benchmark can be attained using SMT-level power
                 shifting mechanisms.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA
                 30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA
                 30332 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application throughput; Computer architecture;
                 Computer Systems Organization; Hardware;
                 multi-threading; Multicore platforms; multiprocessing
                 systems; Multithreaded processors; Multithreading;
                 Operating Systems; Other Architecture Styles; Parallel
                 processing; power aware computing; Power Management;
                 Power shifting; Power system management; Process
                 Management; Processor Architectures; processor design;
                 Program processors; Scheduling; simultaneous
                 multithreading; SMT; SMT multicore platform; SMT
                 switch; SMT-level power shifting mechanism; Software
                 engineering; software mechanisms; Software/Software
                 Engineering; streaming parallel benchmark; tune
                 platform power",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Tembey:2013:SSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal front-matter item: IEEE Open Access publishing notice,
%%% 12(2):71, Jul/Dec 2013 (hence the sparse field set).
@Article{Anonymous:2013:IOAb,
  author =       "Anonymous",
  title =        "{IEEE} Open Access Publishing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "71--71",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.33",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Journal front-matter item: IEEE Computer Society announcement,
%%% 12(2):72, Jul/Dec 2013.
@Article{Anonymous:2013:SCI,
  author =       "Anonymous",
  title =        "Stay Connected to the {IEEE Computer Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "72--72",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.34",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover-matter item: back cover of issue 12(2), Jul/Dec 2013
%%% (page number ``c4'' denotes a cover page).
@Article{Anonymous:2013:BC,
  author =       "Anonymous",
  title =        "{[Back} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.31",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover-matter item: back inside cover of issue 12(2), Jul/Dec 2013
%%% (page number ``c3'' denotes a cover page).
@Article{Anonymous:2013:BIC,
  author =       "Anonymous",
  title =        "{[Back} inside cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.30",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover-matter item: front cover of issue 12(2), Jul/Dec 2013
%%% (page number ``c1'' denotes a cover page).
@Article{Anonymous:2013:FC,
  author =       "Anonymous",
  title =        "{[Front} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c1--c1",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.28",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover-matter item: front inside cover of issue 12(2), Jul/Dec 2013
%%% (page number ``c2'' denotes a cover page).
@Article{Anonymous:2013:FIC,
  author =       "Anonymous",
  title =        "{[Front} inside cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.29",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Letter (13(1):1--4, Jan/Jun 2014): value-aware caches using
%%% Huffman-based compression with a pipelined canonical-codeword
%%% decoder; note the volume-13 entries begin here.
@Article{Arelakis:2014:CVA,
  author =       "Angelos Arelakis and Per Stenstr{\"o}m",
  title =        "A Case for a Value-Aware Cache",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.31",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Replication of values causes poor utilization of
                 on-chip cache memory resources. This paper addresses
                 the question: How much cache resources can be
                 theoretically and practically saved if value
                 replication is eliminated? We introduce the concept of
                 value-aware caches and show that a sixteen times
                 smaller value-aware cache can yield the same miss rate
                 as a conventional cache. We then make a case for a
                 value-aware cache design using Huffman-based
                 compression. Since the value set is rather stable
                 across the execution of an application, one can afford
                 to reconstruct the coding tree in software. The
                 decompression latency is kept short by our proposed
                 novel pipelined Huffman decoder that uses canonical
                 codewords. While the (loose) upper-bound compression
                 factor is 5.2X, we show that, by eliminating
                 cache-block alignment restrictions, it is possible to
                 achieve a compression factor of 3.4X for practical
                 designs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Arelakis, A (Reprint Author), Chalmers, Gothenburg,
                 Sweden. Arelakis, Angelos; Stenstrom, Per, Chalmers,
                 Gothenburg, Sweden.",
  author-email = "angelos@chalmers.se per.stenstrom@chalmers.se",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Swedish Research Council",
  funding-text = "This research is supported by the Swedish Research
                 Council. The simulations ran on the resources provided
                 by the Swedish National Infrastructure for Computing
                 (SNIC) at C3SE.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.b Cache memories; cache storage;
                 cache-block alignment restriction elimination; Clocks;
                 coding tree reconstruction; data compression; data
                 handling; Decoding; decompression latency; E Data; E.4
                 Coding and Information Theory; E.4.a Data compaction
                 and compression; Engines; Huffman codes; Huffman
                 coding; Huffman-based compression; Indexes; on-chip
                 cache memory resources; System-on-a-chip; tree codes;
                 value replication; value-aware cache design",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Arelakis:2014:CVA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (13(1):5--8, Jan/Jun 2014): DMesh, a diagonal mesh topology
%%% for optical networks-on-chip that reduces waveguide crossings.
%%% (Review fix: removed byte-identical duplicate tokens from the
%%% keywords field; case-variant keyword pairs are kept, matching the
%%% file's convention of merging keyword lists from multiple sources.)
@Article{Chen:2014:PEC,
  author =       "Zheng Chen and Huaxi Gu and Yintang Yang and Luying
                 Bai and Hui Li",
  title =        "A Power Efficient and Compact Optical Interconnect for
                 Network-on-Chip",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Optical interconnect is a promising alternative to
                 substitute the electrical interconnect for intra-chip
                 communications. The topology of optical Network-on-Chip
                 (ONoC) has a great impact on the network performance.
                 However, the size of ONoC is limited by the power
                 consumption and crosstalk noise, which are mainly
                 resulted from the waveguide crossings in the topology.
                 In this paper, a diagonal Mesh topology (DMesh) is
                 proposed to relieve the limitation of scalability by
                 reducing the number of waveguide crossing, which is
                 only 20\% that of Mesh. In addition, the number of
                 optical routers in DMesh is less than half of that in
                 Mesh-based ONoC. Due to its compact architecture and
                 favorable scalability, DMesh topology is suitable for
                 large-scale ONoC design.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, Z (Reprint Author), Xidian Univ Xian, State Key
                 Lab Integrated Serv Networks, Xian, Peoples R China.
                 Chen, Zheng; Gu, Huaxi; Bai, Luying; Li, Hui, Xidian
                 Univ Xian, State Key Lab Integrated Serv Networks,
                 Xian, Peoples R China. Yang, Yintang, Xidian Univ Xian,
                 Inst Microelect, Xian, Peoples R China.",
  author-email = "chenzheng8331@stu.xidian.edu.cn hxgu@xidian.edu.cn
                 ytyang@xidian.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation of China
                 [61070046, 60803038]; State Key Lab [ISN1104001];
                 Fundamental Research Funds for the Central Universities
                 [K5051301003]; 111 Project [B08038]",
  funding-text = "This work is supported by the National Science
                 Foundation of China Grant No. 61070046 and 60803038,
                 the special fund from State Key Lab Grant No.
                 ISN1104001, the Fundamental Research Funds for the
                 Central Universities Grant No. K5051301003, the 111
                 Project Grant No. B08038.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "compact optical interconnect; crosstalk noise;
                 diagonal mesh topology; DMesh topology; integrated
                 optoelectronics; intra-chip communications; large-scale
                 ONoC design; mesh-based ONoC; multiprocessors; network
                 performance; Network topology; network-on-chip; optical
                 interconnections; Optical interconnects; optical
                 network-on-chip; optical router; Optical routers;
                 optical routers; power consumption; power efficient
                 interconnect; Topology; topology; waveguide crossings;
                 wavelength division multiplexing; Wavelength division
                 multiplexing",
  number-of-cited-references = "9",
  ORCID-numbers = "Gu, Huaxi/0000-0002-6409-2229",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Chen:2014:PEC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (13(1):9--12, Jan/Jun 2014): reusing on-die accelerator
%%% memory blocks as NUCA cache slices for GP-CPU cores.
%%% (Review fix: removed a byte-identical duplicate of the keyword
%%% ``accelerator architectures''; the remaining case-variant pair is
%%% kept, matching the file's merged-keyword-list convention.)
@Article{Cota:2014:AMR,
  author =       "Emilio G. Cota and Paolo Mantovani and Michele
                 Petracca and Mario R. Casu and Luca P. Carloni",
  title =        "Accelerator Memory Reuse in the Dark Silicon Era",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.29",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Accelerators integrated on-die with General-Purpose
                 CPUs (GP-CPUs) can yield significant performance and
                 power improvements. Their extensive use, however, is
                 ultimately limited by their area overhead; due to their
                 high degree of specialization, the opportunity cost of
                 investing die real estate on accelerators can become
                 prohibitive, especially for general-purpose
                 architectures. In this paper we present a novel
                 technique aimed at mitigating this opportunity cost by
                 allowing GP-CPU cores to reuse accelerator memory as a
                 non-uniform cache architecture (NUCA) substrate. On a
                 system with a last level-2 cache of 128kB, our
                 technique achieves on average a 25\% performance
                 improvement when reusing four 512 kB accelerator memory
                 blocks to form a level-3 cache. Making these blocks
                 reusable as NUCA slices incurs on average in a 1.89\%
                 area overhead with respect to equally-sized ad hoc
                 cache slices.",
  acknowledgement = ack-nhfb,
  affiliation =  "Cota, EG (Reprint Author), Columbia Univ, New York, NY
                 10027 USA. Cota, Emilio G.; Mantovani, Paolo; Carloni,
                 Luca P., Columbia Univ, New York, NY 10027 USA.
                 Petracca, Michele, Cadence Design Syst Inc, San Jose,
                 CA USA. Casu, Mario R., Politecn Torino, Turin,
                 Italy.",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [1018236,
                 1219001]; ONR Young Investigator Award; Gigascale
                 Systems Research Center; Focus Center Research Program
                 (FCRP), a Semiconductor Research Corporation entity",
  funding-text = "This research is partially supported by the National
                 Science Foundation under Awards \#: 1018236 and
                 1219001, an ONR Young Investigator Award, and the
                 Gigascale Systems Research Center, one of six research
                 centers funded under the Focus Center Research Program
                 (FCRP), a Semiconductor Research Corporation entity.
                 The authors thank John Demme and the anonymous
                 reviewers for their insightful comments.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; accelerator architectures; Accelerator
                 architectures; accelerator memory reuse; cache
                 formation; Cache memory; cache slice; cache storage;
                 dark silicon era; general purpose CPU; general-purpose
                 architecture; GP-CPU; Memory management; nonuniform
                 cache architecture; NUCA substrate; Power demand;
                 Silicon; Transform coding",
  keywords-plus = "CACHES",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Cota:2014:AMR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (13(1):13--16, Jan/Jun 2014): DVFS scheduling at the
%%% base-case-reached point of irregular parallel divide-and-conquer
%%% algorithms, evaluated on the Intel SCC.
@Article{Chou:2014:EPE,
  author =       "Yu-Liang Chou and Shaoshan Liu and Eui-Young Chung and
                 Jean-Luc Gaudiot",
  title =        "An Energy and Performance Efficient {DVFS} Scheme for
                 Irregular Parallel Divide-and-Conquer Algorithms on the
                 {Intel SCC}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The divide-and-conquer paradigm can be used to express
                 many computationally significant problems, but an
                 important subset of these applications is inherently
                 load-imbalanced. Load balancing is a challenge for
                 irregular parallel divide-and-conquer algorithms and
                 efficiently solving these applications will be a key
                 requirement for future many-core systems. To address
                 the load imbalance issue, instead of attempting to
                 dynamically balancing the workloads, this paper
                 proposes an energy and performance efficient Dynamic
                 Voltage and Frequency Scaling (DVFS) scheduling scheme,
                 which takes into account the load imbalance behavior
                 exhibited by these applications. More specifically, we
                 examine the core of the divide-and-conquer paradigm and
                 determine that the base-case-reached point where
                 recursion stops is a suitable place in a
                 divide-and-conquer paradigm to apply the proposed DVFS
                 scheme. To evaluate the proposed scheme, we implement
                 four representative irregular parallel
                 divide-and-conquer algorithms, tree traversal,
                 quicksort, finding primes, and n-queens puzzle, on the
                 Intel Single-chip Cloud Computer (SCC) many-core
                 machine. We demonstrate that, on average, the proposed
                 scheme can improve performance by 41\% while reducing
                 energy consumption by 36\% compared to the baseline
                 running the whole computation with the default
                 frequency configuration (400MHz).",
  acknowledgement = ack-nhfb,
  affiliation =  "Chou, YL (Reprint Author), Univ Calif Irvine, Irvine,
                 CA 92697 USA. Chou, Yu-Liang; Gaudiot, Jean-Luc, Univ
                 Calif Irvine, Irvine, CA 92697 USA. Liu, Shaoshan,
                 Microsoft Corp, Redmond, WA 98052 USA. Chung,
                 Eui-Young, Yonsei Univ, Seoul 120749, South Korea.",
  author-email = "d943010010@gmail.com shaoliu@microsoft.com
                 eychung@yonsei.ac.kr gaudiot@uci.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [CCF-1065448]; National Research Foundation of Korea
                 (NRF) [2012S1A2A1A01031420]; Ministry of Education,
                 Science and Technology [2012-047670]; National Science
                 Council [NSC 101-2917-I-564-079]",
  funding-text = "This work is partly supported by the US National
                 Science Foundation under Grant No. CCF-1065448, by the
                 National Research Foundation of Korea (NRF) under Grant
                 No. 2012S1A2A1A01031420, by the Ministry of Education,
                 Science and Technology under Grant No. 2012-047670, and
                 by the National Science Council under Grant No. NSC
                 101-2917-I-564-079. Any opinions, findings, and
                 conclusions expressed in this material are those of the
                 authors and do not necessarily reflect the views of
                 these sponsors.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "base-case-reached point; D Software/Software
                 Engineering; D.4 Operating Systems; D.4 Operating
                 Systems < D.4.7 Organization and Design; D.4.7.b
                 Distributed systems; D.4.7.f Parallel systems; D.4.8
                 Performance < D.4.8.a Measurements < Distributed
                 processing; divide and conquer methods;
                 Divide-and-conquer; DVFS; dynamic voltage and frequency
                 scaling; energy conservation; energy consumption
                 reduction; energy efficient DVFS scheme; finding
                 primes; frequency 400 MHz; Intel SCC; Intel single-chip
                 cloud computer; irregular parallel divide-and-conquer
                 algorithms; Load Imbalance; load imbalance behavior;
                 many-core machine; microprocessor chips;
                 multiprocessing systems; n-queens puzzle; Operating
                 systems; parallel algorithms; Parallel processing;
                 performance efficient DVFS scheme; Performance
                 evaluation; power aware computing; processor
                 scheduling; quicksort; recursion stops; resource
                 allocation; Software engineering; tree traversal",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Chou:2014:EPE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (13(1):17--20, Jan/Jun 2014): block-unification IF-conversion
%%% for vectorizing compilers targeting GPU SIMD architectures.
@Article{Rotem:2014:BUI,
  author =       "Nadav Rotem and Yosi {Ben Asher}",
  title =        "Block Unification {IF}-conversion for High Performance
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.28",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Graphics Processing Units accelerate data-parallel
                 graphic calculations using wide SIMD vector units.
                 Compiling programs to use the GPU's SIMD architectures
                 require converting multiple control flow paths into a
                 single stream of instructions. IF-conversion is a
                 compiler transformation, which converts control
                 dependencies into data dependencies, and it is used by
                 vectorizing compilers to eliminate control flow and
                 enable efficient code generation. In this work we
                 enhance the IF-conversion transformation by using a
                 block unification method to improve the currently used
                 block flattening method. Our experimental results
                 demonstrate that our IF-conversion method is effective
                 in reducing the number of predicated instructions and
                 in boosting kernel execution speed.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rotem, N (Reprint Author), Univ Haifa, Dept Comp Sci,
                 IL-31999 Haifa, Israel. Rotem, Nadav; Ben Asher, Yosi,
                 Univ Haifa, Dept Comp Sci, IL-31999 Haifa, Israel.",
  author-email = "rotemn@cs.haifa.ac.il yosi@cs.haifa.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "block flattening method; block unification
                 IF-conversion; block unification method; code
                 generation; Code generation; compiler transformation;
                 Compilers; Computer architecture; data-parallel graphic
                 calculations; GPU SIMD architectures; Graphics
                 processing unit; graphics processing units; high
                 performance architectures; Kernel; Merging; multiple
                 control flow paths; parallel processing; Processors;
                 program compilers; Programming Languages; Registers;
                 Software/Software Engineering; vectorizing compilers;
                 Vectors; wide SIMD vector units",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Rotem:2014:BUI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Ilic:2014:CAR,
  author =       "Aleksandar Ilic and Frederico Pratas and Leonel
                 Sousa",
  title =        "Cache-aware {Roofline} model: Upgrading the loft",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The Roofline model graphically represents the
                 attainable upper bound performance of a computer
                 architecture. This paper analyzes the original Roofline
                 model and proposes a novel approach to provide a more
                 insightful performance modeling of modern architectures
                 by introducing cache-awareness, thus significantly
                 improving the guidelines for application optimization.
                 The proposed model was experimentally verified for
                 different architectures by taking advantage of built-in
                 hardware counters with a curve fitness above 90\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ilic, A (Reprint Author), Univ Tecn Lisboa, INESC ID
                 IST, Lisbon, Portugal. Ilic, Aleksandar; Pratas,
                 Frederico; Sousa, Leonel, Univ Tecn Lisboa, INESC ID
                 IST, Lisbon, Portugal.",
  author-email = "ilic@inesc-id.pt fcpp@inesc-id.pt las@inesc-id.pt",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "national funds through FCT (Fundacao para a
                 Ciencia e a Tecnologia) [PTDC/EEI-ELC/3152/2012,
                 PEst-OE/EEI/LA0021/2011, PTDC/EEA-ELC/117329/2010]; FCT
                 [SFRH/BPD/87734/2012]",
  funding-text = "This work was supported by national funds through FCT
                 (Fundacao para a Ciencia e a Tecnologia), under
                 projects PTDC/EEI-ELC/3152/2012,
                 PEst-OE/EEI/LA0021/2011, and PTDC/EEA-ELC/117329/2010.
                 F. Pratas also acknowledges the FCT scholarship
                 SFRH/BPD/87734/2012.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application optimization; application optimization;
                  built-in hardware counters;
                 C.0.d Modeling of computer architecture < C.0 General <
                 C Computer Systems Organization; C.0.e System
                 architectures; C.4.d Modeling techniques < C.4
                 Performance of Systems < C Computer Systems
                 Organization; C.4.g Measurement; cache storage;
                 cache-aware Roofline model; cache-awareness; computer
                 architecture; computer architecture upper bound
                 performance; curve fitness; evaluation; integration and
                 modeling < C.0 General < C Computer Systems
                 Organization; Modeling; modeling; Multicore computer
                 architectures; Multiprocessing systems; multiprocessing
                 systems; Performance evaluation; Performance modeling;
                 Simulation; simulation of multiple-processor systems <
                 C.4 Performance of Systems < C Computer Syst",
  number-of-cited-references = "10",
  ORCID-numbers = "Ilic, Aleksandar/0000-0002-8594-3539 Sousa,
                 Leonel/0000-0002-8066-221X",
  research-areas = "Computer Science",
  researcherid-numbers = "Ilic, Aleksandar/L-1943-2014 Sousa,
                 Leonel/B-2749-2009",
  times-cited =  "24",
  unique-id =    "Ilic:2014:CAR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Efraim:2014:EAR,
  author =       "Rotem Efraim and Ran Ginosar and C. Weiser and Avi
                 Mendelson",
  title =        "Energy Aware Race to Halt: A Down to {EARtH} Approach
                 for Platform Energy Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.32",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The EARtH algorithm finds the optimal voltage and
                 frequency operational point of the processor in order
                 to achieve minimum energy of the computing platform.
                 The algorithm is based on a theoretical model employing
                 a small number of parameters, which are extracted from
                 real systems using off-line and run-time methods. The
                 model and algorithm have been validated on real systems
                 using 45nm, 32nm and 22nm Intel (R) Core processors.
                 The algorithm can save up to 44\% energy compared with
                 the commonly used fixed frequency policies.",
  acknowledgement = ack-nhfb,
  affiliation =  "Efraim, R (Reprint Author), Intel Corp, Santa Clara,
                 CA 95051 USA. Efraim, Rotem, Intel Corp, Santa Clara,
                 CA 95051 USA. Ginosar, Ran; Weiser, C.; Mendelson, Avi,
                 Technion Israeli Inst Technol, Haifa, Israel.",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; B Hardware; B.9 Power
                 Management; B.9.2 Energy-aware systems; C Computer
                 Systems Organization; C.4 Performance of Systems; C.5
                 Computer System Implementation; C.5.4 VLSI Systems;
                 C.5.5 Servers; Computational modeling; Earth; EARtH
                 algorithm; energy aware race to halt; Energy
                 management; Energy measurement; fixed frequency
                 policies; Frequency measurement; frequency operational
                 point; Heterogeneous cores; Intel core processors;
                 microprocessor chips; off-line methods; optimal
                 voltage; platform energy management; power aware
                 computing; Power Management; run-time methods; size 22
                 nm; size 32 nm; size 45 nm; Voltage measurement",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Efraim:2014:EAR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Cakmakci:2014:EVA,
  author =       "Yaman {\c{C}}akmak{\c{c}}i and O{\u{g}}uz Ergin",
  title =        "Exploiting Virtual Addressing for Increasing
                 Reliability",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A novel method to protect a system against errors
                 resulting from soft errors occurring in the virtual
                 address (VA) storing structures such as translation
                 lookaside buffers (TLB), physical register file (PRF)
                 and the program counter (PC) is proposed in this paper.
                 The work is motivated by showing how soft errors impact
                 the structures that store virtual page numbers (VPN). A
                 solution is proposed by employing linear block encoding
                 methods to be used as a virtual addressing scheme at
                 link time. Using the encoding scheme to assign VPNs for
                 VAs, it is shown that the system can tolerate soft
                 errors using software with the help of the discussed
                 decoding techniques applied to the page fault handler.
                 The proposed solution can be used on all of the
                 architectures using virtually indexed addressing. The
                 main contribution of this paper is the decreasing of
                 AVF for data TLB by 42.5\%, instruction TLB by 40.3\%,
                 PC by 69.2\% and PRF by 33.3\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), TOBB Univ
                 Econ \& Technol, Dept Comp Engn, Ankara, Turkey.
                 {\c{C}}akmak{\c{c}}i, Yaman; Ergin, O{\u{g}}uz, TOBB
                 Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey.",
  author-email = "ycakmakci@etu.edu.tr oergin@etu.edu.tr",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Scientific and Technological Research
                 Council of Turkey (TUBITAK) [112E004]",
  funding-text = "This work was supported in part by the Scientific and
                 Technological Research Council of Turkey (TUBITAK)
                 under Grant 112E004. The work is in the framework of
                 COST ICT Action 1103 Manufacturable and Dependable
                 Multicore Architectures at Nanoscale.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AVF; B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.h Virtual memory; B.3.4 Reliability,
                 Testing and Fault-Tolerance; buffer storage; decoding
                 techniques; encoding; Fault tolerance; Hardware; linear
                 block encoding methods; Memory management; page fault
                 handler; PC; physical register file; PRF; program
                 counter; soft errors; TLB; translation lookaside
                 buffers; virtual address storing structures; virtual
                 addressing; virtual addressing scheme; Virtual memory;
                 virtual page numbers; virtually indexed addressing;
                 VPN",
  keywords-plus = "SOFT ERRORS",
  number-of-cited-references = "10",
  ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787",
  research-areas = "Computer Science",
  researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010",
  times-cited =  "1",
  unique-id =    "Cakmakci:2014:EVA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhu:2014:EWC,
  author =       "Yuhao Zhu and Aditya Srikanth and Jingwen Leng and
                 Vijay Janapa Reddi",
  title =        "Exploiting Webpage Characteristics for
                 Energy-Efficient Mobile {Web} Browsing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.33",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Web browsing on mobile devices is undoubtedly the
                 future. However, with the increasing complexity of
                 webpages, the mobile device's computation capability
                 and energy consumption become major pitfalls for a
                 satisfactory user experience. In this paper, we propose
                 a mechanism to effectively leverage processor frequency
                 scaling in order to balance the performance and energy
                 consumption of mobile web browsing. This mechanism
                 explores the performance and energy tradeoff in webpage
                 loading, and schedules webpage loading according to the
                 webpages' characteristics, using the different
                 frequencies. The proposed solution achieves 20.3\%
                 energy saving compared to the performance mode, and
                 improves webpage loading performance by 37.1\% compared
                 to the battery saving mode.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhu, YH (Reprint Author), Univ Texas Austin, Dept
                 Elect \& Comp Engn, Austin, TX 78712 USA. Zhu, Yuhao;
                 Srikanth, Aditya; Leng, Jingwen; Reddi, Vijay Janapa,
                 Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX
                 78712 USA.",
  author-email = "yzhu@utexas.edu aditya.srik@utexas.edu
                 jingwen@utexas.edu vj@ece.utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "C Computer Systems Organization; C.2
                 Communication/Networking and Information Technology;
                 C.2.8 Mobile Computing; Cascading style sheets; Cutoff;
                 EDP; Energy; energy conservation; energy consumption;
                 Energy consumption; energy-efficient mobile Web
                 browsing; HTML; Internet; Load modeling; Loading;
                 Market research; Mobile communication; mobile
                 computing; mobile device computation capability;
                 Performance; power aware computing; processor frequency
                 scaling; user experience; Web page characteristics; Web
                 page loading performance; Webpages",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Zhu:2014:EWC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Morad:2014:GMO,
  author =       "Amir Morad and Tomer Y. Morad and Leonid Yavits and
                 Ran Ginosar and Uri Weiser",
  title =        "Generalized {MultiAmdahl}: Optimization of
                 Heterogeneous Multi-Accelerator {SoC}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "37--40",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.34",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Consider a workload comprising a consecutive sequence
                 of program execution segments, where each segment can
                 either be executed on general purpose processor or
                 offloaded to a hardware accelerator. An analytical
                 optimization framework based on MultiAmdahl framework
                 and Lagrange multipliers, for selecting the optimal set
                 of accelerators and for allocating resources among them
                 under constrained area is proposed. Due to the
                 practical implementation of accelerators, the optimal
                 architecture under area constraints may exclude some of
                 the accelerators. As the fraction of the workload that
                 can be accelerated decreases, resources (e.g. area) may
                 shift from accelerators into the general purpose
                 processor. The framework can be extended in a number of
                 ways, spanning from SoC partitioning, bandwidth to
                 power distribution, energy and other constrained
                 resources.",
  acknowledgement = ack-nhfb,
  affiliation =  "Morad, A (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Morad, Amir; Morad, Tomer Y.; Yavits, Leonid; Ginosar,
                 Ran; Weiser, Uri, Technion Israel Inst Technol, Dept
                 Elect Engn, IL-32000 Haifa, Israel.",
  author-email = "amirm@tx.technion.ac.il tomerm@tx.technion.ac.il
                 yavits@tx.technion.ac.il ran@ee.technion.ac.il
                 uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; analytical optimization framework; Chip
                 Multiprocessors; general purpose processor; generalized
                  MultiAmdahl framework; Hardware; hardware accelerator;
                 heterogeneous multiaccelerator SoC partitioning;
                 Lagrange multiplier; Mathematical model; Modeling of
                 computer architecture; MultiAmdahl; Multicore
                 processing; optimisation; Optimization; power
                 distribution bandwidth; program execution segment;
                 resource allocation; Resource management;
                 System-on-a-chip; system-on-chip",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Morad:2014:GMO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kvatinsky:2014:MBM,
  author =       "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion
                 and Eby G. Friedman and Avinoam Kolodny and Uri C.
                 Weiser",
  title =        "Memristor-Based Multithreading",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "41--44",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Switch on Event Multithreading (SoE MT, also known as
                 coarse-grained MT and block MT) processors run multiple
                 threads on a pipeline machine, while the pipeline
                 switches threads on stall events (e.g., cache miss).
                 The thread switch penalty is determined by the number
                 of stages in the pipeline that are flushed of in-flight
                 instructions. In this paper, Continuous Flow
                 Multithreading (CFMT), a new architecture of SoE MT, is
                 introduced. In CFMT, a multistate pipeline register
                 (MPR) holds the microarchitectural state of multiple
                 different threads within the execution pipeline stages,
                 where only one thread is active at a time. The MPRs
                 eliminate the need to flush in-flight instructions and
                 therefore significantly improve performance. In recent
                 years, novel memory technologies such as Resistive RAM
                 (RRAM) and Spin Torque Transfer Magnetoresistive RAM
                 (STT-MRAM), have been developed. All of these
                 technologies are nonvolatile, store data as resistance,
                 and can be described as ``memristors''. Memristors are
                 power efficient, dense, and fast as compared to
                 standard memory technologies such as SRAM, DRAM, and
                 Flash. Memristors therefore provide the opportunity to
                 place the MPRs physically within the pipeline stages. A
                 performance analysis of CFMT is compared to
                 conventional SoE MT processors, demonstrating up to a
                 2X performance improvement, while the operational
                 mechanism, due to the use of memristors, is low power
                 and low complexity as compared to conventional SoE MT
                 processors.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kvatinsky, S (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam;
                 Weiser, Uri C., Technion Israel Inst Technol, Dept
                 Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav,
                 Technion Israel Inst Technol, Dept Comp Sci, IL-32000
                 Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept
                 Elect \& Comp Engn, Rochester, NY 14627 USA.",
  author-email = "skva@tx.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Hasso Plattner Institute",
  funding-text = "This work was supported by the Hasso Plattner
                 Institute. The authors thank Ravi Patel for his
                 comments and area overhead estimation and to Nimrod
                 Wald and Guy Satat for their help in evaluating the
                 architecture.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.7 Integrated
                 Circuits; B.7.1 Types and Design Styles; B.7.1.e Memory
                 technologies; C Computer Systems Organization; C.0
                 General; C.0.a Emerging technologies; C.0.d Modeling of
                 computer architecture; CFMT; Computer architecture;
                 continuous flow multithreading; in-flight instructions;
                 Integrated circuits; Memory management; memristor;
                 memristor-based multithreading; memristors; MPR;
                 multi-threading; multistate pipeline register;
                 multithreaded processors; Multithreading; novel memory
                 technologies; phase change memory; random-access
                 storage; resistive RAM; RRAM; RRAM, STT-MRAM; SoE MT
                 processors; spin torque transfer magnetoresistive RAM;
                 STT- MRAM; STT-MRAM; switch on event multithreading
                 processors; Systems design and analysis",
  keywords-plus = "RESISTIVE SWITCHING MEMORIES",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "10",
  unique-id =    "Kvatinsky:2014:MBM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wingbermuehle:2014:OAS,
  author =       "Joseph G. Wingbermuehle and Ron K. Cytron and Roger D.
                 Chamberlain",
  title =        "Optimization of Application-Specific Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "45--48",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory access times are the primary bottleneck for
                 many applications today. This ``memory wall'' is due to
                 the performance disparity between processor cores and
                 main memory. To address the performance gap, we propose
                 the use of custom memory subsystems tailored to the
                 application rather than attempting to optimize the
                 application for a fixed memory subsystem. Custom
                 subsystems can take advantage of application-specific
                 properties as well as memory-specific properties to
                 improve access times or write-backs given constraints
                 on size or power.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wingbermuehle, JG (Reprint Author), Washington Univ,
                 Dept Comp Sci \& Engn, St Louis, MO 63130 USA.
                 Wingbermuehle, Joseph G.; Cytron, Ron K.; Chamberlain,
                 Roger D., Washington Univ, Dept Comp Sci \& Engn, St
                 Louis, MO 63130 USA.",
  author-email = "wingbej@wustl.edu cytron@wustl.edu roger@wustl.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CNS-09095368,
                 CNS-0931693]",
  funding-text = "This work is supported by the National Science
                 Foundation under grants CNS-09095368 and CNS-0931693.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access time improvement; application-specific memory
                 optimization; B Hardware; B.3 Memory Structures; B.3.2
                 Design Styles; B.3.3 Performance Analysis and Design
                 Aids; B.3.3.b Simulation; C Computer Systems
                 Organization; C.1 Processor Architectures; C.1.5
                 Micro-architecture implementation considerations;
                 C.1.5.e Memory hierarchy; cache; cache storage;
                 Computer architecture; custom memory subsystems; fixed
                 memory subsystem; Hardware; memory access times; Memory
                 management; memory wall; memory-specific properties;
                 Multiprocessing systems; performance disparity;
                 Performance evaluation; performance gap; processor
                 cores; write-backs given constraints",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Wingbermuehle:2014:OAS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Xu:2014:STM,
  author =       "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao
                 Li and Depei Qian",
  title =        "Software Transactional Memory for {GPU}
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "49--52",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To make applications with dynamic data sharing among
                 threads benefit from GPU acceleration, we propose a
                 novel software transactional memory system for GPU
                 architectures (GPU-STM). The major challenges include
                 ensuring good scalability with respect to the massively
                 multithreading of GPUs, and preventing livelocks caused
                 by the SIMT execution paradigm of GPUs. To this end, we
                 propose (1) a hierarchical validation technique and (2)
                 an encounter-time lock-sorting mechanism to deal with
                 the two challenges, respectively. Evaluation shows that
                 GPU-STM outperforms coarse-grain locks on GPUs by up to
                 20x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp
                 Sci, Beijing, Peoples R China. Goswami, Nilanjan; Li,
                 Tao, Univ Florida, ECE Dept, Gainesville, FL USA.",
  author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn
                 nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF of China [61133004, 61128004,
                 61073011]; 863 Program of China [2012AA010902]",
  funding-text = "This work is supported by NSF of China under grant
                 61133004, 61128004 and 61073011, and 863 Program of
                 China under grant 2012AA010902.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "dynamic data sharing; encounter-time lock-sorting
                 mechanism; GPU acceleration; GPU architectures;
                 GPU-STM; graphics processing units; hierarchical
                 validation technique; multi-threading; Multicore
                 processing; multicore processor; Multicore Processors;
                 multiprocessing systems; Multiprocessing systems;
                 multithreading; parallel architectures; Parallel
                  processing; Parallel Programming; parallel programming;
                  Run-time Environments; Runtime
                 environment; SIMD processor; SIMD Processors; SIMT
                 execution paradigm; software transactional memory
                 system; sorting",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Xu:2014:STM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Shim:2014:TMP,
  author =       "Keun Sup Shim and Mieszko Lis and Omer Khan and
                 Srinivas Devadas",
  title =        "Thread Migration Prediction for Distributed Shared
                 Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "53--56",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.30",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Chip-multiprocessors (CMPs) have become the mainstream
                 parallel architecture in recent years; for scalability
                 reasons, designs with high core counts tend towards
                 tiled CMPs with physically distributed shared caches.
                 This naturally leads to a Non-Uniform Cache Access
                 (NUCA) design, where on-chip access latencies depend on
                 the physical distances between requesting cores and
                 home cores where the data is cached. Improving data
                 locality is thus key to performance, and several
                 studies have addressed this problem using data
                 replication and data migration. In this paper, we
                 consider another mechanism, hardware-level thread
                 migration. This approach, we argue, can better exploit
                 shared data locality for NUCA designs by effectively
                 replacing multiple round-trip remote cache accesses
                 with a smaller number of migrations. High migration
                 costs, however, make it crucial to use thread
                 migrations judiciously; we therefore propose a novel,
                 on-line prediction scheme which decides whether to
                 perform a remote access (as in traditional NUCA
                 designs) or to perform a thread migration at the
                 instruction level. For a set of parallel benchmarks,
                 our thread migration predictor improves the performance
                 by 24\% on average over the shared-NUCA design that
                 only uses remote accesses.",
  acknowledgement = ack-nhfb,
  affiliation =  "Shim, KS (Reprint Author), MIT, 77 Massachusetts Ave,
                 Cambridge, MA 02139 USA. Shim, Keun Sup; Lis, Mieszko;
                 Devadas, Srinivas, MIT, Cambridge, MA 02139 USA. Khan,
                 Omer, Univ Connecticut, Storrs, CT USA.",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.g Shared memory; Benchmark testing; C
                 Computer Systems Organization; C.1 Processor
                 Architectures; C.1.4 Parallel Architectures; Cache
                 Coherence; cache storage; chip-multiprocessors; CMPs;
                 Coherence; Computer architecture; Context; core counts;
                 Data Locality; data locality improvement; data
                 migration; data replication; Distributed Caches;
                 hardware-level thread migration prediction; home cores;
                 Instruction sets; integrated circuit design; mainstream
                 parallel architecture; microprocessor chips;
                 multiprocessing systems; nonuniform cache access
                 design; on-chip access latencies; online prediction
                 scheme; Parallel Architecture; parallel architectures;
                 physical distributed shared caches; Protocols;
                 Registers; requesting cores; shared-NUCA design",
  number-of-cited-references = "13",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Shim:2014:TMP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2014:TCa,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C1--C4",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360655",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ITPa,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Transactions on Pattern Analysis and
                 Machine Intelligence}} Editorial Board}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C2--C2",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360656",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ITPb,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Transactions on Pattern Analysis and
                  Machine Intelligence}} Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C3--C3",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360657",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C4--C4",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360658",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lavasani:2014:FBL,
  author =       "Maysam Lavasani and Hari Angepat and Derek Chiou",
  title =        "An {FPGA}-based In-Line Accelerator for {Memcached}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "57--60",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present a method for accelerating server
                 applications using a hybrid CPU+FPGA architecture and
                 demonstrate its advantages by accelerating Memcached, a
                 distributed key-value system. The accelerator,
                 implemented on the FPGA fabric, processes request
                 packets directly from the network, avoiding the CPU in
                 most cases. The accelerator is created by profiling the
                 application to determine the most commonly executed
                 trace of basic blocks which are then extracted. Traces
                 are executed speculatively within the FPGA. If the
                 control flow exits the trace prematurely, the side
                 effects of the computation are rolled back and the
                 request packet is passed to the CPU. When compared to
                 the best reported software numbers, the Memcached
                 accelerator is 9.15x more energy efficient for common
                 case requests.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lavasani, M (Reprint Author), Univ Texas Austin, Dept
                 Elect \& Comp Engn, Austin, TX 78712 USA. Lavasani,
                 Maysam; Angepat, Hari; Chiou, Derek, Univ Texas Austin,
                 Dept Elect \& Comp Engn, Austin, TX 78712 USA.",
  author-email = "maysamlavasani@utexas.edu hangepat@utexas.edu
                 derek@utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerating server; C.1.3.f Heterogeneous (hybrid)
                 systems; C.2.4.a Client/server; cache storage;
                 Client-server systems; Computer architecture; control
                 flow; distributed key-value system; distributed
                 processing; field programmable gate arrays; Field
                 programmable gate arrays; FPGA-based in-line
                 accelerator; hybrid CPU+FPGA architecture; Hybrid
                 systems; Memcached accelerator; Program processors;
                 reconfigurable architectures; request packet; rolled
                 back; software numbers",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "24",
  unique-id =    "Lavasani:2014:FBL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Song:2014:AFB,
  author =       "Xiang Song and Jian Yang and Haibo Chen",
  title =        "Architecting Flash-based Solid-State Drive for
                 High-performance {I/O} Virtualization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "61--64",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Flash-based solid-state drive (SSD) is now being
                 widely deployed in cloud computing platforms due to the
                 potential advantages of better performance and less
                 energy consumption. However, current virtualization
                 architecture lacks support for high-performance I/O
                 virtualization over persistent storage, which results
                 in sub-optimal I/O performance for guest virtual
                 machines (VMs) on SSD. Further, current software-based
                 I/O virtualization violates the ``don't hide power''
                 principle due to inefficient support for some advanced
                 SSD commands (e.g., TRIM) and constrained parallelism,
                 leading to sub-optimal performance and life cycle. This
                 paper observes that the massive internal parallelism
                 and the block emulation in the flash translation layer
                 (FTL) make flash-based SSD an ideal candidate to
                 support high-performance I/O virtualization for
                 persistent storage. Based on this observation, we
                 propose VFlash, the first storage I/O virtualization
                 architecture that extends existing SSDs with trivial
                 hardware changes to directly expose multiple virtual
                 SSDs to guest VMs. Performance evaluation using a
                 modified FlashSim with two FTL schemes (i.e., DFTL and
                 FAST) shows that VFlash incurs only small performance
                 overhead over native SSDs and can efficiently exploit
                 parallelism.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, HB (Reprint Author), Shanghai Jiao Tong Univ,
                 Sch Software, Inst Parallel \& Distributed Syst,
                 Shanghai 200030, Peoples R China. Song, Xiang; Yang,
                 Jian; Chen, Haibo, Shanghai Jiao Tong Univ, Sch
                 Software, Inst Parallel \& Distributed Syst, Shanghai
                 200030, Peoples R China.",
  author-email = "haibochen@sjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "China National Natural Science Foundation
                 [61003002]; Intel",
  funding-text = "This work was supported by China National Natural
                 Science Foundation under grant numbered 61003002 and a
                 grant from Intel.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B.4.4 Performance Analysis and Design Aids; C.4.g
                 Measurement; cloud computing; Cloud computing; cloud
                 computing platforms; Computer architecture; energy
                 consumption; evaluation; flash memories; flash-based
                 solid-state drive; high performance I/O virtualization
                 architecture; I/O virtualization; modeling;
                 Multiprocessing systems; Parallel processing;
                 Performance evaluation; performance evaluation; Random
                 access memory; simulation of multiple-processor
                 systems; software-based I/O virtualization; Solid state
                 circuits; Solid State Drive; SSD commands; virtual
                 machines; virtualisation; VM",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Song:2014:AFB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wu:2014:ATE,
  author =       "Carole-Jean Wu",
  title =        "Architectural Thermal Energy Harvesting Opportunities
                 for Sustainable Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "65--68",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Increased power dissipation in computing devices has
                 led to a sharp rise in thermal hotspots, creating
                 thermal runaway. To reduce the additional power
                 requirement caused by increased temperature, current
                 approaches apply cooling mechanisms to remove heat or
                 apply management techniques to avoid thermal
                 emergencies by slowing down heat generation. This paper
                 proposes to tackle the heat management problem of
                 computing platforms with a fundamentally new approach -
                 instead of heat removal using cooling mechanisms and
                 heat avoidance using dynamic thermal/power management
                 techniques, this work investigates the mechanisms to
                 recover wasted heat into reusable energy for
                 sustainable computing. Through recent advancements in
                 thermoelectric materials, we allow wasted heat energy
                 generated by computing devices to be recovered,
                 transformed, and harvested as electricity that can be
                 directly used within the system. We demonstrate a
                 real-system setup where we recover 0.3 to 1 watt of
                 power with the CPU running at 70 to 105 degrees C,
                 using a COTS thermoelectric device on top of the CPU.
                 Through this research, we hope to motivate more
                 in-depth efforts to explore heat energy harvesting
                 opportunities on computing devices and inspire
                 plausible solutions to overcome the technical
                 challenges discussed in this paper.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wu, CJ (Reprint Author), Arizona State Univ, Sch Comp,
                 Dept Comp Sci Engn, Tempe, AZ 85281 USA. Arizona State
                 Univ, Sch Comp, Dept Comp Sci Engn, Tempe, AZ 85281
                 USA.",
  author-email = "carole-jean.wu@asu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural thermal energy harvesting; cooling;
                 Cooling; cooling mechanisms; dynamic thermal-power
                 management technique; Energy conservation; energy
                 harvesting; Energy-aware systems; heat generation; heat
                 management problem; power dissipation; Power
                 distribution; power engineering computing; Resistance
                 heating; sustainable computing; Temperature
                 measurement; Temperature-aware design; thermal energy
                 storage; thermal runaway; Waste heat",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Wu:2014:ATE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yavits:2014:CHO,
  author =       "Leonid Yavits and Amir Morad and Ran Ginosar",
  title =        "Cache Hierarchy Optimization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "69--72",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power consumption, off-chip memory bandwidth, chip
                 area and Network on Chip (NoC) capacity are among main
                 chip resources limiting the scalability of Chip
                 Multiprocessors (CMP). A closed form analytical
                 solution for optimizing the CMP cache hierarchy and
                 optimally allocating area among hierarchy levels under
                 such constrained resources is developed. The
                 optimization framework is extended by incorporating the
                 impact of data sharing on cache miss rate. An
                 analytical model for cache access time as a function of
                 cache size is proposed and verified using CACTI
                 simulation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yavits, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Yavits, Leonid; Morad, Amir; Ginosar, Ran, Technion
                 Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
                 Israel.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "ICRI-CI; Hasso-Plattner-Institut",
  funding-text = "We thank Prof. Uri Weiser and Yaniv Ben Itzhak for
                 their review and remarks. This research was partially
                 funded by the ICRI-CI and Hasso-Plattner-Institut.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; Analytical Performance Models;
                 Bandwidth; Cache Hierarchy; cache hierarchy
                 optimization; cache storage; CACTI simulation; chip
                 area; Chip Multiprocessor; chip multiprocessors; CMP;
                 Computational modeling; data sharing; Integrated
                 circuit modeling; Multiprocessing systems; network on
                 chip; network-on-chip; NoC; off-chip memory bandwidth;
                 optimisation; Optimization; power consumption; Resource
                 Allocation Optimization; Resource Allocation
                 Optimizations; Resource management",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Yavits:2014:CHO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yazdanshenas:2014:CLL,
  author =       "Sadegh Yazdanshenas and Marzieh Ranjbar Pirbasti and
                 Mahdi Fazeli and Ahmad Patooghy",
  title =        "Coding Last Level {STT-RAM} Cache For High Endurance
                 And Low Power",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "73--76",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "STT-RAM technology has recently emerged as one of the
                 most promising memory technologies. However, its major
                 problems, limited write endurance and high write
                 energy, are still preventing it from being used as a
                 drop-in replacement of SRAM cache. In this paper, we
                 propose a novel coding scheme for STT-RAM last level
                 cache based on the concept of value locality. We reduce
                 switching probability in cache by swapping common
                 patterns with limited weight codes (LWC) to make writes
                 less often as well as more uniform. We also define some
                 policies for swapping these patterns. Our evaluation
                 shows that bit write variance in memory cells can be
                 reduced by about 20\% on average resulting in a more
                 uniform wear-out directly enhancing lifetime and
                 improving cell reliability. In addition, writes in
                 cache lines can be reduced by about 12\% compared to
                 one of the most effective circuit level techniques
                 known as early write termination (EWT) [12]. Our method
                 increases memory hierarchy access time by about 0.08\%
                 on average, which is negligible. We have shown that our
                 method doesn't adversely affect last level cache
                 energy-delay$^2$. The non-uniformity caused by the
                 coding scheme can be used for another coding scheme at
                 main memory or L1 cache depending on their
                 technologies.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yazdanshenas, S (Reprint Author), Iran Univ Sci \&
                 Technol, Sch Comp Engn, Tehran, Iran. Yazdanshenas,
                 Sadegh; Pirbasti, Marzieh Ranjbar; Fazeli, Mahdi;
                 Patooghy, Ahmad, Iran Univ Sci \& Technol, Sch Comp
                 Engn, Tehran, Iran.",
  author-email = "sadegh\_yazdanshenas@comp.iust.ac.ir
                 m\_ranjbar@comp.iust.ac.ir m\_fazeli@iust.ac.ir
                 patooghy@iust.ac.ir",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; bit write variance;
                 C Computer Systems Organization; C.1 Processor
                 Architectures; cache; cache storage; cell reliability;
                 circuit level technique; coding scheme; Computer
                 architecture; early write termination; Encoding;
                 limited weight code; limited weight codes; memory
                 endurance; memory technology; nonvolatile memory;
                 Nonvolatile memory; probability; Random access memory;
                 random-access storage; STT-RAM; STT-RAM cache;
                 switching probability; Three-dimensional displays;
                 write energy; write hotspot",
  keywords-plus = "MEMORY; CIRCUIT; ENERGY; MRAM",
  number-of-cited-references = "13",
  ORCID-numbers = "Fazeli, Mahdi/0000-0002-2874-6256 Patooghy,
                 Ahmad/0000-0003-2647-2797",
  research-areas = "Computer Science",
  researcherid-numbers = "Fazeli/S-9574-2018",
  times-cited =  "14",
  unique-id =    "Yazdanshenas:2014:CLL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Martinsen:2014:HTL,
  author =       "Jan Kasper Martinsen and H{\aa}kan Grahn and Anders
                  Isberg",
  title =        "Heuristics for Thread-Level Speculation in {Web}
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "77--80",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib",
  abstract =     "JavaScript is a sequential programming language, and
                 Thread-Level Speculation has been proposed to
                 dynamically extract parallelism in order to take
                 advantage of parallel hardware. In previous work, we
                 have showed significant speed-ups with a simple on/off
                 speculation heuristic. In this paper, we propose and
                 evaluate three heuristics for dynamically adapt the
                 speculation: a 2-bit heuristic, an exponential
                 heuristic, and a combination of these two. Our results
                 show that the combined heuristic is able to both
                 increase the number of successful speculations and
                 decrease the execution time for 15 popular web
                 applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Martinsen, JK (Reprint Author), Blekinge Inst Technol,
                 Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan
                 Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp,
                 SE-37179 Karlskrona, Sweden. Isberg, Anders, Sony
                 Mobile Commun AB, SE-22188 Lund, Sweden.",
  author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se
                 Anders.Isberg@sonymobile.com",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Industrial Excellence Center EASE -
                 Embedded Applications Software Engineering; BESQ+
                 research project --- Knowledge Foundation in Sweden
                 [20100311]",
  funding-text = "This work was partly funded by the Industrial
                 Excellence Center EASE --- Embedded Applications
                 Software Engineering, (http://ease.cs.lth.se), and the
                 BESQ+ research project funded by the Knowledge
                 Foundation (grant number 20100311) in Sweden.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "2-bit heuristic; Automatic Parallelization; Benchmark
                 testing; C.1.4 Parallel Architectures; C.1.4.f
                 Speculative multi-threading; exponential heuristic;
                 Instruction sets; Internet; Java; JavaScript; Multicore
                 processors; Multithreading; Parallel Computing;
                 parallel hardware; Parallel processing; parallel
                 programming; sequential programming language; Social
                 network services; thread-level speculation; Web
                 applications",
  number-of-cited-references = "12",
  oa =           "Green Published",
  ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn,
                 Hakan/0000-0001-9947-1088",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Martinsen:2014:HTL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Nandakumar:2014:OKS,
  author =       "Vivek S. Nandakumar and Ma{\l}gorzata Marek-Sadowska",
  title =        "On Optimal Kernel Size for Integrated {CPU--GPUs} ---
                 a Case Study",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "81--84",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Integrated CPU--GPU architectures with a fully
                 addressable shared memory completely eliminate any
                 CPU--GPU data transfer overhead. Since such
                 architectures are relatively new, it is unclear what
                 level of interaction between the CPU and GPU attains
                 the best energy efficiency. Too coarse grained or
                 larger kernels with fairly low CPU--GPU interaction
                 could cause poor utilization of the shared resources
                 while too fine grained kernels could cause frequent
                 interrupts of GPU computation and performance
                 degradation. Also larger kernels require larger shared
                 resources causing increase in area and parasitics which
                 affect the latency sensitive CPU cores. In this paper,
                 we show the effect of granularity on the overall
                 system's energy efficiency using a synthetic workload.
                 We describe how our framework models a truly unified
                 shared memory in integrated architectures with frequent
                 CPU--GPU communication.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nandakumar, VS (Reprint Author), Univ Calif Santa
                 Barbara, Dept Elect \& Comp Engn, Santa Barbara, CA
                 93106 USA. Nandakumar, Vivek S.; Marek-Sadowska,
                 Malgorzata, Univ Calif Santa Barbara, Dept Elect \&
                 Comp Engn, Santa Barbara, CA 93106 USA.",
  author-email = "vivek@ece.ucsb.edu mms@ece.ucsb.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "SRC grant [2236]",
  funding-text = "This work was supported by SRC grant \#2236.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B.3.2.g Shared memory; B.4.4.b Simulation; B.9.2
                 Energy-aware systems; C.1.3.f Heterogeneous (hybrid)
                 systems; C.4.g Measurement; Central Processing Unit;
                 Computational modeling; CPU-GPU communication; CPU-GPU
                 data transfer overhead; CPU-GPU interaction; D.4.4
                 Communications Management; energy efficiency; Energy
                 efficiency; evaluation; fine grained kernels; fully
                 addressable shared memory; GPU computation; graphics
                 processing units; Graphics processing units; integrated
                 CPU-GPU architectures; latency sensitive CPU cores;
                 Memory management; modeling; optimal kernel size;
                 overall system energy efficiency; performance
                 degradation; performance evaluation; power aware
                 computing; shared memory systems; simulation of
                 multiple-processor systems",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Nandakumar:2014:OKS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Liu:2014:PTE,
  author =       "Qixiao Liu and Victor Jimenez and Miquel Moreto and
                 Jaume Abella and Francisco J. Cazorla and Mateo
                 Valero",
  title =        "Per-task Energy Accounting in Computing Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "85--88",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present for the first time the concept of per-task
                 energy accounting (PTEA) and relate it to per-task
                 energy metering (PTEM). We show the benefits of
                 supporting both in future computing systems. Using the
                 shared last-level cache (LLC) as an example: (1) We
                 illustrate the complexities in providing PTEM and PTEA;
                 (2) we present an idealized PTEM model and an accurate
                 and low-cost implementation of it; and (3) we introduce
                 a hardware mechanism to provide accurate PTEA in the
                 cache.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, QX (Reprint Author), Univ Politecn Cataluna,
                 E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
                 Moreto, Miquel; Valero, Mateo, Univ Politecn Cataluna,
                 E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
                 Moreto, Miquel; Abella, Jaume; Cazorla, Francisco J.;
                 Valero, Mateo, Barcelona Supercomp Ctr, Barcelona,
                 Spain. Cazorla, Francisco J., Spanish Natl Res Council
                 IIIA CSIC, Barcelona, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Science and Innovation
                 [TIN2012-34557]; HiPEAC Network of Excellence; Chinese
                 Scholarship Council [2010608015]",
  funding-text = "This work has been partially supported by the Spanish
                 Ministry of Science and Innovation under grant
                 TIN2012-34557 and the HiPEAC Network of Excellence.
                 Qixiao Liu has also been funded by the Chinese
                 Scholarship Council under grant 2010608015.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; cache storage; Computational
                 modeling; computing systems; Energy consumption; Energy
                 management; Monitoring; Multicore processing; per-task
                 energy accounting; per-task energy metering; power
                 aware computing; PTEA; PTEM model; Radiation detectors;
                 shared last-level cache",
  number-of-cited-references = "20",
  oa =           "Green Published",
  ORCID-numbers = "Cazorla, Francisco/0000-0002-3344-376X Moreto Planas,
                 Miquel/0000-0002-9848-8758 Valero,
                 Mateo/0000-0003-2917-2482 Abella,
                 Jaume/0000-0001-7951-4028 Liu,
                 Qixiao/0000-0002-8196-7584",
  research-areas = "Computer Science",
  researcherid-numbers = "Cazorla, Francisco/D-7261-2016 Moreto Planas,
                 Miquel/C-1823-2016 Valero, Mateo/L-5709-2014 Abella,
                 Jaume/B-7422-2016",
  times-cited =  "2",
  unique-id =    "Liu:2014:PTE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Mahmoodi:2014:RCC,
  author =       "Hamid Mahmoodi and Sridevi Srinivasan Lakshmipuram and
                 Manish Arora and Yashar Asgarieh and Houman Homayoun
                 and Bill Lin and Dean M. Tullsen",
  title =        "Resistive Computation: a Critique",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "89--92",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Resistive Computation was suggested by [6] as an idea
                 for tackling the power wall by replacing conventional
                 CMOS logic with Magnetic Tunnel Junction (MTJ) based
                 Look-Up Tables (LUTs). Spin Transfer Torque RAM
                 (STTRAM) is an emerging CMOS-compatible non-volatile
                 memory technology based on Magnetic Tunnel Junctions as
                 a memory bit [3]. The principal advantage of STTRAM is
                 that it is leakage-resistant, which is an important
                 characteristic beyond the 45nm technology node, where
                 leakage concerns are becoming a limiting factor in
                 microprocessor performance. Although STTRAM is a good
                 candidate for replacing SRAM for on-chip memory, we
                 argue in this article MTJ-based LUTs are unnecessarily
                 expensive in terms of area, power, and performance when
                 implementing fixed combinational logic that does not
                 require the reprogramming ability provided by MTJs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mahmoodi, H (Reprint Author), San Francisco State
                 Univ, San Francisco, CA 94132 USA. Arora, Manish;
                 Asgarieh, Yashar; Lin, Bill; Tullsen, Dean M., Univ
                 Calif San Diego, La Jolla, CA 92093 USA. Mahmoodi,
                 Hamid; Lakshmipuram, Sridevi Srinivasan, San Francisco
                 State Univ, San Francisco, CA 94132 USA. Homayoun,
                 Houman, George Mason Univ, Fairfax, VA 22030 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B.2.1 Design Styles; B.6.1.e Memory used as logic;
                 B.7.1.a Advanced technologies; B.9.1 Low-power design;
                 C.0.a Emerging technologies; CMOS integrated circuits;
                 CMOS-compatible nonvolatile memory technology; Delays;
                 dynamic current-mode logic; fixed combinational logic;
                 leakage power; leakage-resistance; Logic gates; look-up
                 tables; Low power electronics; magnetic tunnel
                 junction; Magnetic tunneling; magnetic tunnelling;
                 magnetic-tunnel junctions; memory bit; MRAM; MTJ-based
                 LUT; Power distribution; random-access storage;
                 Resistive computation; resistive computation; Resistive
                 computation; spin transfer torque RAM; STTRAM; Table
                 lookup; table lookup; Transistors",
  keywords-plus = "TECHNOLOGY; CIRCUIT",
  number-of-cited-references = "10",
  ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
  research-areas = "Computer Science",
  researcherid-numbers = "Lin, Binshan/A-9772-2009",
  times-cited =  "4",
  unique-id =    "Mahmoodi:2014:RCC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Eyerman:2014:RCW,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Restating the Case for Weighted-{IPC} Metrics to
                 Evaluate Multiprogram Workload Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "93--96",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Weighted speedup is nowadays the most commonly used
                 multiprogram workload performance metric. Weighted
                 speedup is a weighted-IPC metric, i.e., the
                 multiprogram IPC of each program is first weighted with
                 its isolated IPC. Recently, Michaud questions the
                 validity of weighted-IPC metrics by arguing that they
                 are inconsistent and that weighted speedup favors
                 unfairness [4]. Instead, he advocates using the
                 arithmetic or harmonic mean of the raw IPC values of
                 the programs in the multiprogram workload. We show that
                 weighted-IPC metrics are not inconsistent, and that
                 weighted speedup is fair in giving equal importance to
                 each program. We argue that, in contrast to raw-IPC
                 metrics, weighted-IPC metrics have a system-level
                 meaning, and that raw-IPC metrics are affected by the
                 inherent behavior of the programs. We also show that
                 the choice of a metric may adversely affect the
                 conclusions from an experiment. We suggest to use two
                 weighted-IPC metrics --- system throughput (STP) and
                 average normalized turnaround time (ANTT) --- for
                 evaluating multiprogram workload performance, and to
                 avoid raw-IPC metrics.",
  acknowledgement = ack-nhfb,
  affiliation =  "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent,
                 Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent,
                 B-9000 Ghent, Belgium.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Research Foundation --- Flanders (FWO);
                 European Research Council under the European Community
                 [259295]",
  funding-text = "Stijn Eyerman is supported through a postdoctoral
                 fellowship by the Research Foundation --- Flanders
                 (FWO). Additional support is provided by the European
                 Research Council under the European Community's Seventh
                 Framework Programme (FP7/2007-2013) / ERC Grant
                 agreement no. 259295.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ANTT; average normalized turnaround time; Benchmark
                 testing; C Computer Systems Organization; C.1 Processor
                 Architectures; C.1.3 Other Architecture Styles; C.1.3.h
                 Multithreaded processors; C.1.4 Parallel Architectures;
                 C.1.4.e Multi-core/single-chip multiprocessors; C.4
                 Performance of Systems; C.4.c Measurement techniques;
                 Degradation; Harmonic analysis; harmonic mean;
                 Multicore processing; multiprocessing systems;
                 multiprogram IPC; multiprogram workload performance
                 metric; multiprogramming; raw-IPC metrics; STP; system
                 throughput; system-level meaning; Throughput; Weight
                 measurement; weighted speedup; weighted-IPC metric",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Eyerman:2014:RCW",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wolff:2014:RUR,
  author =       "Sonya R. Wolff and Ronald D. Barnes",
  title =        "Revisiting Using the Results of Pre-Executed
                 Instructions in Runahead Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "97--100",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Long-latency cache accesses cause significant
                 performance-impacting delays for both in-order and
                 out-of-order processor systems. To address these
                 delays, runahead pre-execution has been shown to
                 produce speedups by warming-up cache structures during
                 stalls caused by long-latency memory accesses. While
                 improving cache related performance, basic runahead
                 approaches do not otherwise utilize results from
                 accurately pre-executed instructions during normal
                 operation. This simple model of execution is
                 potentially inefficient and performance constraining.
                 However, a previous study showed that exploiting the
                 results of accurately pre-executed runahead
                 instructions for out-of-order processors provide little
                 performance improvement over simple re-execution. This
                 work will show that, unlike out-of-order runahead
                 architectures, the performance improvement from
                 runahead result use for an in-order pipeline is more
                 significant, on average, and in some situations
                 provides dramatic performance improvements. For a set
                 of SPEC CPU2006 benchmarks which experience performance
                 improvement from basic runahead, the addition of result
                 use to the pipeline provided an additional speedup of
                 1.14X (high --- 1.48X) for an in-order processor model
                 compared to only 1.05X (high --- 1.16X) for an
                 out-of-order one. When considering benchmarks with poor
                 data cache locality, the average speedup increased to
                 1.21X for in-order compared to only 1.10X for
                 out-of-order.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; C.1.5.c Superscalar
                 dynamically-scheduled and statically-scheduled
                 implementation; C.1.5.e Memory hierarchy; cache
                 storage; data cache locality; Hidden Markov models;
                 in-order processor systems; long-latency cache
                 accesses; long-latency memory accesses; Memory Wall;
                 multiprocessing systems; Out of order; out-of-order
                 processor systems; out-of-order runahead architectures;
                 Pipeline processing; Pre-Execution; preexecuted
                 runahead instructions; Registers; Runahead; runahead
                 processors; SPEC CPU2006 benchmarks",
  keywords-plus = "PIPELINES",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Wolff:2014:RUR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2014:SGA,
  author =       "Youngsok Kim and Jaewon Lee and Donggyu Kim and
                 Jangwoo Kim",
  title =        "{ScaleGPU}: {GPU} Architecture for Memory-Unaware
                 {GPU} Programming",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Programmer-managed GPU memory is a major challenge in
                 writing GPU applications. Programmers must rewrite and
                 optimize an existing code for a different GPU memory
                 size for both portability and performance.
                 Alternatively, they can achieve only portability by
                 disabling GPU memory at the cost of significant
                 performance degradation. In this paper, we propose
                 ScaleGPU, a novel GPU architecture to enable
                 high-performance memory-unaware GPU programming.
                 ScaleGPU uses GPU memory as a cache of CPU memory to
                 provide programmers a view of CPU memory-sized
                 programming space. ScaleGPU also achieves high
                 performance by minimizing the amount of CPU-GPU data
                 transfers and by utilizing the GPU memory's high
                 bandwidth. Our experiments show that ScaleGPU can run a
                 GPU application on any GPU memory size and also
                 improves performance significantly. For example,
                 ScaleGPU improves the performance of the hotspot
                 application by $\sim$48\% using the same size of
                 GPU memory and reduces its memory size requirement by
                 $\sim$75\% maintaining the target performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, Y (Reprint Author), POSTECH, Dept Comp Sci \&
                 Engn, Pohang, South Korea. Kim, Youngsok; Lee, Jaewon;
                 Kim, Donggyu; Kim, Jangwoo, POSTECH, Dept Comp Sci \&
                 Engn, Pohang, South Korea.",
  author-email = "elixir@postech.ac.kr spiegel0@postech.ac.kr
                 vteori@postech.ac.kr jangwoo@postech.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                 National Research Foundation of Korea (NRF) ---
                 Ministry of Education, Science and Technology
                 [2011-0014817]; NRF Grant --- Korean Government
                 (NRF-Global Ph.D. Fellowship Program)",
  funding-text = "This research was supported by Basic Science Research
                 Program through the National Research Foundation of
                 Korea (NRF) funded by the Ministry of Education,
                 Science and Technology (2011-0014817) and NRF Grant
                 funded by the Korean Government (NRF-2012-Global Ph.D.
                 Fellowship Program).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "C.1.2.j SIMD processors; C.1.4.e
                 Multi-core/single-chip multiprocessors; C.1.5.e Memory
                 hierarchy; cache; cache storage; code rewrite; CPU
                 memory-sized programming space; CPU-GPU data transfers;
                 Data transfer; GPU applications; GPU architecture; GPU
                 memory high bandwidth; GPU memory size; graphics
                 processing units; Graphics processing units; graphics
                 processing units; high-performance memory-unaware GPU
                 programming; I.3.1.a Graphics processors; Instruction
                 sets; memory architecture; Memory management; memory
                 size requirement; programmer-managed GPU memory;
                 Programming; Random access memory; ScaleGPU",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Kim:2014:SGA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Sankar:2014:SFL,
  author =       "Sriram Sankar and Sudhanva Gurumurthi",
  title =        "Soft Failures in Large Datacenters",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A major problem in managing large-scale datacenters is
                 diagnosing and fixing machine failures. Most large
                 datacenter deployments have a management infrastructure
                 that can help diagnose failure causes, and manage
                 assets that were fixed as part of the repair process.
                 Previous studies identify only actual hardware
                 replacements to calculate Annualized Failure Rate (AFR)
                 and component reliability. In this paper, we show that
                 service availability is significantly affected by soft
                 failures and that this class of failures is becoming an
                 important issue at large datacenters with minimum human
                 intervention. Soft failures in the datacenter do not
                 require actual hardware replacements, but still result
                 in service downtime, and are equally important because
                 they disrupt normal service operation. We show failure
                 trends observed in a large datacenter deployment of
                 commodity servers and motivate the need to modify
                 conventional datacenter designs to help reduce soft
                 failures and increase service availability.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sankar, S (Reprint Author), Microsoft Corp, Redmond,
                 WA 98052 USA. Sankar, Sriram, Microsoft Corp, Redmond,
                 WA 98052 USA. Sankar, Sriram; Gurumurthi, Sudhanva,
                 Univ Virginia, Charlottesville, VA 22903 USA.
                 Gurumurthi, Sudhanva, Adv Micro Devices Inc, AMD Res,
                 Sunnyvale, CA 94088 USA.",
  author-email = "sriram.sankar@microsoft.com
                 Sudhanva.Gurumurthi@amd.com",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AFR; annualized failure rate; asset management; C.4
                 Performance of Systems; C.5.5 Servers;
                 Characterization; Client-server systems; commodity
                 servers; component reliability; computer centres; Data
                 centers; Datacenter; datacenter deployments; datacenter
                 designs; datacenter management; failure cause
                 diagnosis; fault diagnosis; Hard disks; hardware
                 replacements; Large-scale systems; machine failure
                 diagnosis; machine failure fixing; Maintenance
                 engineering; Management; management infrastructure;
                 Market research; Reliability; repair process; service
                 availability; soft failures; Transient analysis",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Sankar:2014:SFL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2014:VPT,
  author =       "Daehoon Kim and Hwanju Kim and Jaehyuk Huh",
  title =        "{vCache}: Providing a Transparent View of the {LLC} in
                 Virtualized Environments",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "109--112",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Since most of the current multi-core processors use a
                 large last-level cache (LLC), efficient use of an LLC
                 is critical for the overall performance of multi-cores.
                 To improve the caching efficiency, page coloring is a
                 representative software-based approach to allow the OS
                 to control placement of pages on an LLC to improve
                 their cache utility and to avoid conflicts among cores.
                 However, system virtualization, with additional address
                 translation by the hypervisor, can make page coloring
                 techniques used by the guest OS ineffective, as guest
                 physical addresses used by the guest OS for coloring
                 differ from real addresses used for cache indexing in
                 the LLCs. In this paper, we propose a novel LLC
                 architecture to provide the guest OS with a flexible
                 control over LLC placement in virtualized systems. The
                 proposed vCache architecture can preserve coloring
                 information set by the guest OS. In addition to color
                 preservation, vCache can potentially eliminate the
                 traditional limitation of page coloring, the cost of
                 dynamic color changes for memory pages. By using the
                 pollute buffer mechanism, one of the color-based cache
                 optimization techniques, vCache shows performance
                 improvement of benchmark applications up to 33\%
                 without degrading the performance of another co-running
                 application in the VM.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, D (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon, South Korea. Kim,
                 Daehoon; Kim, Hwanju; Huh, Jaehyuk, Korea Adv Inst Sci
                 \& Technol, Dept Comp Sci, Taejon, South Korea.",
  author-email = "daehoon@calab.kaist.ac.kr hjukim@calab.kaist.ac.kr
                 jhuh@calab.kaist.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "SW Computing R\&D Program of
                 KEIT(UX-oriented Mobile SW Platform) --- Ministry of
                 Trade, Industry, and Energy [2011-10041313]",
  funding-text = "This research was supported by the SW Computing R\&D
                 Program of KEIT(2011-10041313, UX-oriented Mobile SW
                 Platform) funded by the Ministry of Trade, Industry,
                 and Energy.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address translation; B.3.2.b Cache memories; benchmark
                 applications; buffer mechanism; C.1.4.e
                 Multi-core/single-chip multiprocessors; C.1.5.e Memory
                 hierarchy; cache indexing; Cache partitioning; cache
                 storage; Cache storage; cache utility improvement;
                 caching efficiency improvement; co-running application;
                 color-based cache optimization techniques; coloring
                 information preservation; core conflict avoidance;
                 dynamic color cost; guest OS; guest physical address;
                 hypervisor; last-level cache; LLC architecture; LLC
                 placement; Memory management; memory pages; Multicore
                 processing; multicore processor performance;
                 multiprocessing systems; operating systems (computers);
                 Page coloring; page coloring; page placement control;
                 paged storage; software-based approach; system
                 virtualization; transparent LLC view; vCache
                 architecture; Virtual machine monitors; virtual
                 machines; virtualisation; Virtualization; virtualized
                 environments; VM",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  researcherid-numbers = "Huh, Jaehyuk/C-1716-2011",
  times-cited =  "2",
  unique-id =    "Kim:2014:VPT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%% Front-matter entry: printed table of contents (cover page C1) for
%% IEEE Computer Architecture Letters, volume 13, number 2
%% (July\slash December 2014).
@Article{Anonymous:2014:TCb,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C1--C1",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368891",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%% Front-matter entry: editorial-board page (cover page C2) for volume
%% 13, number 2.  The whole title, including ``Editorial Board'', is
%% inside one brace group so sentence-casing styles cannot downcase it.
@Article{Anonymous:2014:ICAa,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Editorial Board}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C2--C2",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368892",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%% Front-matter entry: information-for-authors page (cover page C3) for
%% volume 13, number 2.  The title brace group is extended to cover
%% ``Information for Authors'' so that sentence-casing styles do not
%% downcase it; this matches the sibling entry Anonymous:2014:ICAa.
@Article{Anonymous:2014:ICAb,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C3--C3",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368893",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%% Front-matter entry: IEEE Computer Society advertisement (cover page
%% C4) for volume 13, number 2.
@Article{Anonymous:2014:ICSb,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C4--C4",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368894",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%% Letter on adaptive wear-leveling for flash-based memory.  The first
%% author's e-mail domain is corrected from the harvested typo
%% ``il.is.s.u-okyo.ac.jp'' to the University of Tokyo domain
%% ``il.is.s.u-tokyo.ac.jp''.
@Article{Liao:2015:AWL,
  author =       "Jianwei Liao and Fengxiang Zhang and Li Li and
                 Guoqiang Xiao",
  title =        "Adaptive Wear-Leveling in Flash-Based Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329871",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The paper presents an adaptive wear-leveling scheme
                 based on several wear-thresholds in different periods.
                 The basic idea behind this scheme is that blocks can
                 have different wear-out speeds and the wear-leveling
                 mechanism does not conduct data migration until the
                 erasure counts of some hot blocks hit a threshold.
                 Through a series of emulation experiments based on
                 several realistic disk traces, we show that the
                 proposed wear-leveling mechanism can reduce total
                 erasure counts and yield uniform erasure counts among
                 all blocks at the late lifetime of the storage devices.
                 As a result, not only can the performance of storage
                 systems be advanced, the lifespan of the flash-based
                 memory can also be extended to a certain degree.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liao, JW (Reprint Author), Southwest Univ, Coll Comp
                 \& Informat Sci, Chongqing, Peoples R China. Liao,
                 Jianwei; Zhang, Fengxiang; Li, Li; Xiao, Guoqiang,
                 Southwest Univ, Coll Comp \& Informat Sci, Chongqing,
                 Peoples R China.",
  author-email = "liaojianwei@il.is.s.u-tokyo.ac.jp zhangfx@swu.edu.cn
                 lily@swu.edu.cn gqxiao@swu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adaptive systems; adaptive wear-leveling; Ash;
                 Benchmark testing; data migration; delayed migration;
                 disk traces; emulation experiments; Equations; erasure
                 evenness; extending lifetime; flash memories;
                 flash-based memory; Flash-based storage devices; Market
                 research; Servers; Standards; total erasure count
                 reduction; wear; wear-leveling; wear-leveling
                 mechanism; wear-out speeds; wear-thresholds",
  number-of-cited-references = "11",
  ORCID-numbers = "Liao, Jianwei/0000-0001-6149-6650",
  research-areas = "Computer Science",
  researcherid-numbers = "Liao, Jianwei/C-5339-2016",
  times-cited =  "4",
  unique-id =    "Liao:2015:AWL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%% Front-matter entry: annual subject/author index for volume 13
%% (2014), published in volume 14, number 1.
@Article{Anonymous:2015:IIC,
  author =       "Anonymous",
  title =        "2014 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 13",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "1--5",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2387774",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

%% Letter on hardware--software cooperative energy profiling; entry
%% merges IEEE Xplore data with Web of Science fields (da,
%% doc-delivery-number, keywords-plus, times-cited, etc.).
@Article{Chen:2015:HSC,
  author =       "Jie Chen and Guru Venkataramani",
  title =        "A Hardware-Software Cooperative Approach for
                 Application Energy Profiling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2323711",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Energy consumption by software applications is a
                 critical issue that determines the future of multicore
                 software development. In this article, we propose a
                 hardware-software cooperative approach that uses
                 hardware support to efficiently gather the
                 energy-related hardware counters during program
                 execution, and utilizes parameter estimation models in
                 software to compute the energy consumption by
                 instructions at a finer grain level (say basic block).
                 We design mechanisms to minimize collinearity in
                 profiler data, and present results to validate our
                 energy estimation methodology.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, J (Reprint Author), George Washington Univ, Dept
                 Elect \& Comp Engn, Washington, DC 20052 USA. Chen,
                 Jie; Venkataramani, Guru, George Washington Univ, Dept
                 Elect \& Comp Engn, Washington, DC 20052 USA.",
  author-email = "jiec@gwu.edu guruv@gwu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application energy profiling; Benchmark testing;
                 Energy consumption; energy consumption; energy
                 debugging; energy estimation; energy estimation
                 methodology; Energy profiling; energy-related hardware
                 counters; Estimation; Hardware; hardware-software
                 codesign; hardware-software cooperative approach;
                 Mathematical model; multicore software development;
                 multiprocessing systems; Parameter estimation;
                 parameter estimation models; power aware computing;
                 profiler data collinearity; program execution;
                 Software; software applications",
  keywords-plus = "POWER",
  number-of-cited-references = "12",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Chen:2015:HSC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%% Letter on row-hammer mitigation in DRAM.  The bare ``<'' in the
%% abstract is wrapped in math mode (``$<$''): outside math mode it
%% prints as an inverted punctuation glyph under OT1 font encoding.
@Article{Kim:2015:ASM,
  author =       "Dae-Hyun Kim and Prashant J. Nair and Moinuddin K.
                 Qureshi",
  title =        "Architectural Support for Mitigating Row Hammering in
                 {DRAM} Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2332177",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "DRAM scaling has been the prime driver of increasing
                 capacity of main memory systems. Unfortunately, lower
                 technology nodes worsen the cell reliability as it
                 increases the coupling between adjacent DRAM cells,
                 thereby exacerbating different failure modes. This
                 paper investigates the reliability problem due to Row
                 Hammering, whereby frequent activations of a given row
                 can cause data loss for its neighboring rows. As DRAM
                 scales to lower technology nodes, the threshold for the
                 number of row activations that causes data loss for the
                 neighboring rows reduces, making Row Hammering a
                 challenging problem for future DRAM chips. To overcome
                 Row Hammering, we propose two architectural solutions:
                 First, Counter-Based Row Activation (CRA), which uses a
                 counter with each row to count the number of row
                 activations. If the count exceeds the row hammering
                 threshold, a dummy activation is sent to neighboring
                 rows proactively to refresh the data. Second,
                 Probabilistic Row Activation (PRA), which obviates
                 storage overhead of tracking and simply allows the
                 memory controller to proactively issue dummy
                 activations to neighboring rows with a small
                 probability for all memory access. Our evaluations show
                 that these solutions are effective at mitigating Row
                 hammering while causing negligible performance loss
                 ($<$ 1 percent).",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, DH (Reprint Author), Georgia Inst Technol, Dept
                 ECE, Atlanta, GA 30363 USA. Kim, Dae-Hyun; Nair,
                 Prashant J.; Qureshi, Moinuddin K., Georgia Inst
                 Technol, Dept ECE, Atlanta, GA 30363 USA.",
  author-email = "dhkim@ece.gatech.edu pnair6@ece.gatech.edu
                 moin@ece.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural support; cell reliability; Computer
                 architecture; counter-based row activation; data
                 errors; data retention; DRAM chips; DRAM memories; DRAM
                 scaling; Dynamic random access memory; Dynamic random
                 access memory, row hammering, data retention, data
                 errors; Leakage currents; Logic gates; Microprocessors;
                 probabilistic row activation; probability; Radiation
                 detectors; Random access memory; reliability;
                 reliability problem; row hammering; Transistors",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "23",
  unique-id =    "Kim:2015:ASM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%% Letter on Argus-G, an error-detection scheme for GPGPU cores;
%% protected words (Argus-G, GPGPU) are braced in the title so
%% sentence-casing styles keep their capitalization.
@Article{Nathan:2015:AGC,
  author =       "Ralph Nathan and Daniel J. Sorin",
  title =        "{Argus-G}: Comprehensive, Low-Cost Error Detection for
                 {GPGPU} Cores",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2298391",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We have developed and evaluated Argus-G, an error
                 detection scheme for general purpose GPU (GPGPU) cores.
                 Argus-G is a natural extension of the Argus error
                 detection scheme for CPU cores, and we demonstrate how
                 to modify Argus such that it is compatible with GPGPU
                 cores. Using an RTL prototype, we experimentally show
                 that Argus-G can detect the vast majority of injected
                 errors at relatively low performance, area, and power
                 costs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nathan, R (Reprint Author), Duke Univ, Durham, NC
                 27708 USA. Nathan, Ralph; Sorin, Daniel J., Duke Univ,
                 Durham, NC 27708 USA.",
  author-email = "ralph.nathan@duke.edu sorin@ee.duke.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Argus-G; Benchmark testing; Conferences; CPU cores;
                 error detection; fault tolerance; general purpose GPU
                 cores; GPGPU cores; Graphics processing units; graphics
                 processing units; Graphics processors; Hardware;
                 Hardware design languages; Instruction sets; low-cost
                 error detection; Registers",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Nathan:2015:AGC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%% Letter on CIDR, a cache-inspired DRAM resilience architecture.  An
%% xxnote field (ignored by BibTeX styles) flags the apparent duplicate
%% entry derived from Web of Science data.
@Article{O:2015:CCI,
  author =       "Seongil O and Sanghyuk Kwon and Young Hoon Son and
                 Yujin Park and Jung Ho Ahn",
  title =        "{CIDR}: a Cache Inspired Area-Efficient {DRAM}
                 Resilience Architecture against Permanent Faults",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2324894",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "area overhead; area-efficient DRAM resilience
                 architecture; Arrays; augmented cache; bit errors;
                 Bloom filter; cache data array; cache storage; cache
                 tags; cache-inspired DRAM resilience architecture;
                 CIDR; Circuit faults; cost-sensitive main-memory DRAM
                 devices; data structures; Decoding; device failure
                 rates; DRAM arrays; DRAM chips; DRAM, error resilience,
                 permanent faults, row and column sparing, Bloom filter,
                 DRAM-side caching; energy overhead minimization; error
                 statistics; fault diagnosis; faulty cells; I/O pads;
                 memory architecture; permanent faults; processor-memory
                 interfaces; Random access memory; Resilience;
                 single-bit error rates; Testing; testing phase",
  xxnote =       "Check: this entry appears to duplicate entry
                 Seongil:2015:CCI (identical DOI, volume, number, and
                 pages); that entry carries the Web of Science data,
                 with the first author name parsed as ``O. Seongil''
                 instead of ``Seongil O''.",
}

%% Web-of-Science-derived entry for the CIDR letter.  The abstract's
%% flattened mathematics (``23.6 x'', ``10(-5)'', ``10(-3)'') is
%% restored to TeX math (``$23.6\times$'', ``$10^{-5}$'',
%% ``$10^{-3}$''), and an xxnote flags the apparent duplication of
%% entry O:2015:CCI.
@Article{Seongil:2015:CCI,
  author =       "O. Seongil and Sanghyuk Kwon and Young Hoon Son and
                 Yujin Park and Jung Ho Ahn",
  title =        "{CIDR}: a Cache Inspired Area-Efficient {DRAM}
                 Resilience Architecture against Permanent Faults",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2324894",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Faulty cells have become major problems in
                 cost-sensitive main-memory DRAM devices. Conventional
                 solutions to reduce device failure rates due to cells
                 with permanent faults, such as populating spare rows
                 and relying on error-correcting codes, have had limited
                 success due to high area overheads. In this paper, we
                 propose CIDR, a novel cache-inspired DRAM resilience
                 architecture, which substantially reduces the area
                 overhead of handling bit errors from these faulty
                 cells. A DRAM device adopting CIDR has a small cache
                 next to its I/O pads to replace accesses to the
                 addresses that include the faulty cells with ones that
                 correspond to the cache data array. We minimize the
                 energy overhead of accessing the cache tags for every
                 read or write by adding a Bloom filter in front of the
                 cache. The augmented cache is programmed once during
                 the testing phase and is out of the critical path on
                 normal accesses because both cache and DRAM arrays are
                 accessed in parallel, making CIDR transparent to
                 existing processor-memory interfaces. Compared to the
                 conventional architecture relying on spare rows, CIDR
                 lowers the area overhead of achieving equal failure
                 rates over a wide range of single-bit error rates, such
                 as $23.6\times$ lower area overhead for a bit-error
                 rate of $10^{-5}$ and a device failure rate of
                 $10^{-3}$.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seongil, O (Reprint Author), Seoul Natl Univ, Dept
                 Transdisciplinary Studies, Seoul, South Korea. Seongil,
                 O.; Kwon, Sanghyuk; Son, Young Hoon; Park, Yujin; Ahn,
                 Jung Ho, Seoul Natl Univ, Dept Transdisciplinary
                 Studies, Seoul, South Korea.",
  author-email = "swdfish@snu.ac.kr kkwon114@snu.ac.kr yhson96@snu.ac.kr
                 comesay@snu.ac.kr gajh@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bloom filter; DRAM; DRAM-side caching; error
                 resilience; permanent faults; row and column sparing",
  number-of-cited-references = "13",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Seongil:2015:CCI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
  xxnote =       "Check: this entry appears to duplicate entry
                 O:2015:CCI (identical DOI, volume, number, and pages);
                 here the Web of Science data parse the first author
                 name as ``O. Seongil'' rather than ``Seongil O''.",
}

%% Letter on constrained energy optimization for heterogeneous mobile
%% platforms.  The keyword fragment ``battery life determine'' is as
%% harvested from the publisher data --- verify against the paper
%% before editing.
@Article{Gupta:2015:CEO,
  author =       "Ujjwal Gupta and Umit Y. Ogras",
  title =        "Constrained Energy Optimization in Heterogeneous
                 Platforms Using Generalized Scaling Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "21--25",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2326603",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Platform energy consumption and responsiveness are two
                 major considerations for mobile systems since they
                 determine the battery life and user satisfaction,
                 respectively. We first present models for power
                 consumption, response time and energy consumption of
                 heterogeneous mobile platforms. Then, we use these
                 models to optimize the energy consumption of baseline
                 platforms under response time and temperature
                 constraints with and without introducing new resources.
                 We show that the optimal design choices depend on
                 dynamic power management algorithm, and adding new
                 resources is more energy efficient than scaling
                 existing resources alone.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gupta, U (Reprint Author), Arizona State Univ, Sch
                 Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta,
                 Ujjwal; Ogras, Umit Y., Arizona State Univ, Sch Elect
                 Comp \& Energy Engn, Tempe, AZ 85281 USA.",
  author-email = "ujjwal@asu.edu umit@asu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "battery life determine; Computers; constrained energy
                 optimization; dynamic power management algorithm;
                 Energy consumption; Energy optimization; generalized
                 scaling models; heterogeneous architectures;
                 heterogeneous mobile platforms; Mobile communication;
                 mobile computing; mobile platforms; mobile systems;
                 MpSoC; Multicore processing; Optimization; performance;
                 platform energy consumption; power aware computing;
                 power consumption; Power demand; response time;
                 temperature constraints; Time factors; user
                 satisfaction",
  keywords-plus = "AMDAHLS LAW; MULTIAMDAHL; ACCELERATOR; MANAGEMENT;
                 CPU; ERA",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Gupta:2015:CEO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%% Letter on DRAMA, 3D-stacked CGRA accelerators atop DRAM.  Abstract
%% typography is repaired: the range ``66-95 percent'' gets an en-dash
%% (``66--95''), and the flattened ``18 x'' speedup is restored to TeX
%% math (``$18\times$'').
@Article{Farmahini-Farahani:2015:DAA,
  author =       "Amin Farmahini-Farahani and Jung Ho Ahn and Katherine
                 Morrow and Nam Sung Kim",
  title =        "{DRAMA}: an Architecture for Accelerated Processing
                 Near Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "26--29",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2333735",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Improving energy efficiency is crucial for both mobile
                 and high-performance computing systems while a large
                 fraction of total energy is consumed to transfer data
                 between storage and processing units. Thus, reducing
                 data transfers across the memory hierarchy of a
                 processor (i.e., off-chip memory, on-chip caches, and
                 register file) can greatly improve the energy
                 efficiency. To this end, we propose an architecture,
                 DRAMA, that 3D-stacks coarse-grain reconfigurable
                 accelerators (CGRAs) atop off-chip DRAM devices. DRAMA
                 does not require changes to the DRAM device
                 architecture, apart from through-silicon vias (TSVs)
                 that connect the DRAM device's internal I/O bus to the
                 CGRA layer. We demonstrate that DRAMA can reduce the
                 energy consumption to transfer data across the memory
                 hierarchy by 66--95 percent while achieving speedups of
                 up to $18\times$ over a commodity processor.",
  acknowledgement = ack-nhfb,
  affiliation =  "Farmahini-Farahani, A (Reprint Author), Univ
                 Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr,
                 Madison, WI 53706 USA. Farmahini-Farahani, Amin;
                 Morrow, Katherine; Kim, Nam Sung, Univ Wisconsin, Dept
                 Elect \& Comp Engn, Madison, WI 53706 USA. Ahn, Jung
                 Ho, Seoul Natl Univ, Dept Transdisciplinary Studies,
                 Seoul 151742, South Korea.",
  author-email = "farmahinifar@wisc.edu gajh@snu.ac.kr
                 kati@engr.wisc.edu nskim3@wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D-stacking; 3D-stacks coarse-grain reconfigurable
                 accelerators; accelerated near memory processing;
                 Acceleration; accelerator; Arrays; data transfers;
                 DRAM; DRAM chips; DRAM devices; DRAMA architecture;
                 dynamic random access memory; energy conservation;
                 energy consumption reduction; energy efficiency;
                 energy-efficient computing; high-performance computing
                 systems; Kernel; memory hierarchy; Memory management;
                 mobile computing systems; Near memory processing; Near
                 memory processing, DRAM, 3D-stacking, energy-efficient
                 computing, accelerator; processing units; Random access
                 memory; Registers; storage management; storage units;
                 through-silicon vias; total energy fraction; TSV",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Farmahini-Farahani:2015:DAA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%% Letter on epoch profiles for microarchitecture-based application
%% analysis; entry merges IEEE Xplore data with Web of Science fields
%% (ORCID-numbers, researcherid-numbers, times-cited, etc.).
@Article{Carlson:2015:EPM,
  author =       "Trevor E. Carlson and Siddharth Nilakantan and Mark
                 Hempstead and Wim Heirman",
  title =        "Epoch Profiles: Microarchitecture-Based Application
                 Analysis and Optimization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "30--33",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329873",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The performance of data-intensive applications, when
                 running on modern multi-and many-core processors, is
                 largely determined by their memory access behavior. Its
                 most important contributors are the frequency and
                 latency of off-chip accesses and the extent to which
                 long-latency memory accesses can be overlapped with
                 useful computation or with each other. In this paper we
                 present two methods to better understand application
                 and microarchitectural interactions. An epoch profile
                 is an intuitive way to understand the relationships
                 between three important characteristics: the on-chip
                 cache size, the size of the reorder window of an
                 out-of-order processor, and the frequency of processor
                 stalls caused by long-latency, off-chip requests
                 (epochs). By relating these three quantities one can
                 more easily understand an application's memory
                 reference behavior and thus significantly reduce the
                 design space. While epoch profiles help to provide
                 insight into the behavior of a single application,
                 developing an understanding of a number of applications
                 in the presence of area and core count constraints
                 presents additional challenges. Epoch-based
                 microarchitectural analysis is presented as a better
                 way to understand the trade-offs for memory-bound
                 applications in the presence of these physical
                 constraints. Through epoch profiling and optimization,
                 one can significantly reduce the multidimensional
                 design space for hardware/software optimization through
                 the use of high-level model-driven techniques.",
  acknowledgement = ack-nhfb,
  affiliation =  "Carlson, TE (Reprint Author), Univ Ghent, Sint
                 Pietersnieuwstr 41, B-9000 Ghent, East Flanders,
                 Belgium. Carlson, Trevor E., Univ Ghent, B-9000 Ghent,
                 East Flanders, Belgium. Nilakantan, Siddharth;
                 Hempstead, Mark, Drexel Univ, Dept Elect \& Comp Engn,
                 Bossone Res Ctr, Philadelphia, PA 19104 USA. Heirman,
                 Wim, Intel Corp, Leuven, Flemish Brabant, Belgium.",
  author-email = "trevor.carlson@elis.ugent.be sn446@drexel.edu
                 mhempstead@drexel.edu wim.heirman@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computational modeling; Frequency
                 measurement; memory-level parallelism;
                 Microarchitecture; Microarchitecture analysis; Out of
                 order; System-on-chip; visualization",
  number-of-cited-references = "6",
  oa =           "Green Published",
  ORCID-numbers = "Carlson, Trevor/0000-0001-8742-134X Nilakantan,
                 Siddharth/0000-0003-1067-700X Heirman,
                 Wim/0000-0003-2286-1525",
  research-areas = "Computer Science",
  researcherid-numbers = "Carlson, Trevor/M-4945-2016",
  times-cited =  "0",
  unique-id =    "Carlson:2015:EPM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% gem5-gpu: heterogeneous CPU--GPU simulation letter.  Braces in the
%%% title protect "gem5-gpu" and "CPU--GPU" from bibliography-style
%%% recasing; the entry is cross-listed in pvm.bib (see bibsource).
@Article{Power:2015:GGH,
  author =       "Jason Power and Joel Hestness and Marc S. Orr and Mark
                 D. Hill and David A. Wood",
  title =        "{gem5-gpu}: a Heterogeneous {CPU--GPU} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "34--36",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2299539",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "gem5-gpu is a new simulator that models tightly
                 integrated CPU-GPU systems. It builds on gem5, a
                 modular full-system CPU simulator, and GPGPU-Sim, a
                 detailed GPGPU simulator. gem5-gpu routes most memory
                 accesses through Ruby, which is a highly configurable
                 memory system in gem5. By doing this, it is able to
                 simulate many system configurations, ranging from a
                 system with coherent caches and a single virtual
                 address space across the CPU and GPU to a system that
                 maintains separate GPU and CPU physical address spaces.
                 gem5-gpu can run most unmodified CUDA 3.2 source code.
                 Applications can launch non-blocking kernels, allowing
                 the CPU and GPU to execute simultaneously. We present
                 gem5-gpu's software architecture and a brief
                 performance validation. We also discuss possible
                 extensions to the simulator. gem5-gpu is open source
                 and available at gem5-gpu.cs.wisc.edu.",
  acknowledgement = ack-nhfb,
  affiliation =  "Power, J (Reprint Author), Univ Wisconsin, Dept Comp
                 Sci, 1210 W Dayton St, Madison, WI 53706 USA. Power,
                 Jason; Hestness, Joel; Orr, Marc S.; Hill, Mark D.;
                 Wood, David A., Univ Wisconsin, Dept Comp Sci, Madison,
                 WI 53706 USA.",
  author-email = "powerjg@cs.wisc.edu hestness@cs.wisc.edu
                 morr@cs.wisc.edu markhill@cs.wisc.edu
                 david@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Coherence; Computational modeling; Computer
                 architecture; computer architecture; gem5-gpu
                 simulator; general-purpose graphics processors;
                 GPGPUSim; Graphics processing units; graphics
                 processing units; heterogeneous (hybrid) systems;
                 heterogeneous CPU-GPU simulator; Kernel; Modeling
                 techniques; modular full-system CPU simulator;
                 nonblocking kernels; Object oriented modeling;
                 Protocols; simulators; software architecture",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "62",
  unique-id =    "Power:2015:GGH",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Hardware support for Google Native Client (NaCl) letter.
%%% NOTE(review): the abstract's closing figures originally read
%%% "predict 76.9 on average compared to the 39.5 of", with the
%%% "percent" unit dropped during extraction; "percent" has been
%%% restored, matching the unit style used in this file's other
%%% abstracts --- confirm against the published abstract
%%% (DOI 10.1109/LCA.2014.2309601).
@Article{Manatunga:2015:HSS,
  author =       "Dilan Manatunga and Joo Hwan Lee and Hyesoon Kim",
  title =        "Hardware Support for Safe Execution of Native Client
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "37--40",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2309601",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Over the past few years, there has been vast growth in
                 the area of the web browser as an applications
                 platform. One example of this trend is Google's Native
                 Client (NaCl) platform, which is a software-fault
                 isolation mechanism that allows the running of native
                 x86 or ARM code on the browser. One of the security
                 mechanisms employed by NaCl is that all branches must
                 jump to the start of a valid instruction. In order to
                 achieve this criteria though, all return instructions
                 are replaced by a specific branch instruction sequence,
                 which we call NaCl returns, that are guaranteed to
                 return to a valid instruction. However, these NaCl
                 returns lose the advantage of the highly accurate
                 return-address stack (RAS) in exchange for the less
                 accurate indirect branch predictor. In this paper, we
                 propose a NaCl-RAS mechanism that can identify and
                 accurately predict 76.9 percent on average compared to
                 the 39.5 percent of a traditional BTB predictor.",
  acknowledgement = ack-nhfb,
  affiliation =  "Manatunga, D (Reprint Author), Georgia Inst Technol,
                 Sch Comp Sci, Atlanta, GA 30332 USA. Manatunga, Dilan;
                 Lee, Joo Hwan; Kim, Hyesoon, Georgia Inst Technol, Sch
                 Comp Sci, Atlanta, GA 30332 USA.",
  author-email = "dmanatunga@gatech.edu joohwan.lee@gatech.edu
                 hyesoon@cc.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; ARM code; Benchmark testing; branch
                 instruction sequence; branch prediction accuracy; BTB
                 predictor; Detectors; fault diagnosis; Google;
                 Hardware; hardware support; NaCl-RAS mechanism; Native
                 client; native client applications; native x86; online
                 front-ends; return address prediction; return-address
                 stack; safe execution; Security; security mechanism;
                 security of data; Software; software fault isolation;
                 software-fault isolation mechanism; Web browser",
  keywords-plus = "SANDBOX; CODE",
  number-of-cited-references = "5",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Manatunga:2015:HSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% hPower heterogeneous datacenter power letter.  The keywords field
%%% retains the indexing service's case-variant duplicates (e.g.,
%%% "computer system implementation" / "Computer System
%%% Implementation") exactly as supplied.
@Article{Liu:2015:LHP,
  author =       "Longjun Liu and Chao Li and Hongbin Sun and Yang Hu
                 and Jingmin Xin and Nanning Zheng and Tao Li",
  title =        "Leveraging Heterogeneous Power for Improving
                 Datacenter Efficiency and Resiliency",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "41--45",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2363084",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power mismatching between supply and demand has
                 emerged as a top issue in modern datacenters that are
                 under-provisioned or powered by intermittent power
                 supplies. Recent proposals are primarily limited to
                 leveraging uninterruptible power supplies (UPS) to
                 handle power mismatching, and therefore lack the
                 capability of efficiently handling the irregular peak
                 power mismatches. In this paper we propose hPower, the
                 first heterogeneous energy buffering strategy that
                 incorporates supercapacitors into existing datacenters
                 to handle power mismatch. Our technique exploits power
                 supply diversity and smart load assignment to provide
                 efficiency-aware and emergency-aware power mismatch
                 management. We show that hPower could improve energy
                 efficiency by 30 percent, extend UPS lifetime by 4.3 x,
                 and reduce system downtime by 36 percent. It allows
                 datacenters to adapt themselves to various power supply
                 anomalies, thereby improving operational efficiency and
                 resiliency.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, LJ (Reprint Author), Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Liu, Longjun; Sun, Hongbin; Xin, Jingmin; Zheng,
                 Nanning, Xi An Jiao Tong Univ, Sch Elect \& Informat
                 Engn, Xian 710049, Peoples R China. Li, Chao, Shanghai
                 Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200030,
                 Peoples R China. Hu, Yang; Li, Tao, Univ Florida, Dept
                 Elect \& Comp Engn, Gainesville, FL USA.",
  author-email = "longjun.liu@stu.xjtu.edu.cn lichao@cs.sjtu.edu.cn
                 hsun@mail.xjtu.edu.cn huyang.ece@ufl.edu
                 jxin@mail.xjtu.edu.cn nnzheng@mail.xjtu.edu.cn
                 taoli@ece.ufl.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Batteries; computer centres; computer system
                 implementation; Computer System Implementation;
                 computer system implementation; data center efficiency;
                 data center resiliency; efficiency-aware power mismatch
                 management; emergency-aware power mismatch management;
                 energy conservation; Energy efficiency; Energy-aware
                 systems; Energy-Aware Systems; heterogeneous energy
                 buffering strategy; heterogeneous power; hPower;
                 performance of systems; Performance of Systems; power
                 aware computing; Power demand; power mismatching; power
                 supply anomalies; power supply diversity; Servers;
                 smart load assignment; Supercapacitors;
                 supercapacitors; system downtime reduction;
                 uninterruptible power supplies; Uninterruptible power
                 systems; UPS",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Liu:2015:LHP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% NV2-SRAM versatile cache letter.
%%% NOTE(review): the abstract originally read "3: 1 x reliability
%%% improvement", an extraction garble of a single numeric figure;
%%% emended to "3.1 x" (matching the "4.3 x" style used elsewhere in
%%% this file) --- verify against the published abstract
%%% (DOI 10.1109/LCA.2014.2298412).
@Article{Wang:2015:LNV,
  author =       "Rui Wang and Wangyuan Zhang and Tao Li and Depei
                 Qian",
  title =        "Leveraging Non-Volatile Storage to Achieve Versatile
                 Cache Optimizations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "46--49",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2298412",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The efficiency of caches plays a vital role in
                 microprocessor. In this paper, we introduce a novel and
                 flexible cache substrate that employs non-volatile yet
                 versatile SRAM (NV2-SRAM) cell design, which
                 synergistically integrates new memory devices into the
                 standard SRAM cells. Our experiments show that it can
                 achieve a 67 percent energy saving and 3.1 x
                 reliability improvement over the SRAM based cache,
                 outperforming the drowsy cache design in terms of both
                 power efficiency and reliability. Moreover, the
                 proposed cache architecture can be used to improve the
                 performance of prefetching schemes by 10 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, R (Reprint Author), Beihang Univ, Sch Comp Sci
                 \& Engn, State Key Lab Software Dev Environm, Beijing
                 100191, Peoples R China. Wang, Rui; Qian, Depei,
                 Beihang Univ, Sch Comp Sci \& Engn, State Key Lab
                 Software Dev Environm, Beijing 100191, Peoples R China.
                 Zhang, Wangyuan; Li, Tao, Univ Florida, ECE Dept,
                 Gainesville, FL 32611 USA.",
  author-email = "rui.wang@jsi.buaa.edu.cn zhangwangyuan@gmail.com
                 taoli@ece.ufl.edu depeiq@buaa.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache architecture; Cache memories; cache storage;
                 Computer architecture; energy saving; flexible cache
                 substrate; low-power design; Magnetic tunneling; memory
                 structures; microprocessor; Microprocessors;
                 Nonvolatile memory; nonvolatile storage; nonvolatile
                 yet versatile SRAM cell design; NV2-SRAM cell design;
                 Prefetching; prefetching schemes; reliability
                 improvement; SRAM; SRAM based cache; SRAM cells; SRAM
                 chips; storage management; versatile cache
                 optimizations",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Wang:2015:LNV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% On-demand branch prediction (ODBP) letter; carries Web of Science
%%% citation metadata (keywords-plus, times-cited, research-areas) as
%%% supplied by the indexing service.
@Article{Mohammadi:2015:DDB,
  author =       "Milad Mohammadi and Song Han and Tor M. Aamodt and
                 William J. Dally",
  title =        "On-Demand Dynamic Branch Prediction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "50--53",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2330820",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In out-of-order (OoO) processors, speculative
                 execution with high branch prediction accuracy is
                 employed to achieve good single thread performance. In
                 these processors the branch prediction unit tables
                 (BPU) are accessed in parallel with the instruction
                 cache before it is known whether a fetch group contains
                 branch instructions. For integer applications, we find
                 85 percent of BPU lookups are done for non-branch
                 operations and of the remaining lookups, 42 percent are
                 done for highly biased branches that can be predicted
                 statically with high accuracy. We evaluate on-demand
                 branch prediction (ODBP), a novel technique that uses
                 compiler generated hints to identify those instructions
                 that can be more accurately predicted statically to
                 eliminate unnecessary BPU lookups. We evaluate an
                 implementation of ODBP that combines static and dynamic
                 branch prediction. For a four wide superscalar
                 processor, ODBP delivers as much as 9 percent
                 improvement in average energy-delay (ED) product, 7
                 percent core average energy saving, and 3 percent
                 speedup. ODBP also enables the use of large BPU's for a
                 given power budget.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mohammadi, M (Reprint Author), Stanford Univ, Dept
                 Elect Engn, Stanford, CA 94305 USA. Mohammadi, Milad;
                 Han, Song; Dally, William J., Stanford Univ, Dept Elect
                 Engn, Stanford, CA 94305 USA. Aamodt, Tor M., Univ
                 British Columbia, Dept Elect \& Comp Engn, Vancouver,
                 BC V6T 1Z4, Canada.",
  author-email = "milad@stanford.edu songhan@stanford.edu
                 aamodt@ece.ubc.ca dally@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; ahead prediction; BPU lookup; branch
                 instruction; branch prediction accuracy; branch
                 prediction unit table; cache storage; compiler
                 generated hints; Computer architecture; core average
                 energy saving; ED product; Energy efficiency;
                 energy-delay product; energy-delay product
                 optimization; Equations; instruction cache; instruction
                 sets; Mathematical model; nonbranch operation; ODBP;
                 on-demand branch prediction; on-demand dynamic branch
                 prediction; OoO processor; out-of-order processor;
                 parallel processing; Pipelines; power budget; program
                 compilers; Program processors; single thread
                 performance; speculative execution; static and dynamic
                 branch prediction hybrid; static branch prediction;
                 superscalar processor; table lookup; Tin",
  keywords-plus = "MICROPROCESSOR; DESIGN",
  number-of-cited-references = "27",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Mohammadi:2015:DDB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Peripheral Memory letter (software-controlled memory in the I/O
%%% domain for offloading I/O-exclusive traffic); abstract and
%%% keywords retained exactly as supplied by the indexing service.
@Article{Azriel:2015:PMT,
  author =       "Leonid Azriel and Avi Mendelson and Uri Weiser",
  title =        "Peripheral Memory: a Technique for Fighting Memory
                 Bandwidth Bottleneck",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "54--57",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2319077",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory bottleneck has always been a major cause for
                 limiting the performance of computer systems. While in
                 the past latency was the major concern, today, lack of
                 bandwidth becomes a limiting factor as well, as a
                 result of exploiting more parallelism with the growing
                 number of cores per die, which intensifies the pressure
                 on the memory bus. In such an environment, any
                 additional traffic to memory, such as the I/O traffic
                 may lead to degradation of the overall performance of
                 the system. This work introduces the concept of
                 Peripheral Memory, a software controlled memory that
                 resides in the I/O domain and can be used for
                 offloading I/O traffic from CPU memory. The Peripheral
                 Memory handles `I/O exclusive data', data originated
                 and terminated at I/O domain, and which does not need
                 any processing by the CPU. The paper analyses the
                 impact of I/O traffic on the overall performance of the
                 current systems and demonstrates that in numerous
                 applications, I/O exclusive data occupies major part of
                 memory bandwidth, as a result, degrading CPU processing
                 performance and increasing power. The paper considers
                 four different implementations of the Peripheral
                 Memory: pageable, pinned, non-coherent split-traffic
                 and copy-on-access. Our full-system simulator indicates
                 that non-coherent split traffic configuration is the
                 most efficient implementation, which can provide up to
                 four times speedup in the I/O processing rate for
                 typical I/O intensive applications. In addition, based
                 on Power model and measurements tools, the paper
                 demonstrates that the Peripheral Memory in a server
                 system can lead to reduction of tens of Watts in the
                 overall system power consumption or 10-20 percent of
                 the system power budget.",
  acknowledgement = ack-nhfb,
  affiliation =  "Azriel, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Azriel, Leonid; Mendelson, Avi; Weiser, Uri, Technion
                 Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
                 Israel.",
  author-email = "leonida@tx.technion.ac.il
                 avi.mendelson@tce.technion.ac.il
                 uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; bandwidth allocation; Benchmark testing;
                 computer system performance; CPU memory; full-system
                 simulator; I/O domain; I/O traffic offloading;
                 input/output devices; Instruction sets; interconnection
                 architectures; main memory; memory bandwidth
                 bottleneck; memory bus; Memory management; parallelism;
                 performance evaluation; Performance evaluation;
                 peripheral memory; Power demand; Power measurement;
                 server system; software controlled memory; storage
                 management; system buses",
  keywords-plus = "NETWORK; I/O",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Azriel:2015:PMT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Persistent transactional memory (PTM) letter.  The affiliation
%%% field lists the same five authors twice, once per Shanghai Jiao
%%% Tong University lab, exactly as supplied by the indexing service.
@Article{Wang:2015:PTM,
  author =       "Zhaoguo Wang and Han Yi and Ran Liu and Mingkai Dong
                 and Haibo Chen",
  title =        "Persistent Transactional Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "58--61",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329832",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes persistent transactional memory
                 (PTM), a new design that adds durability to
                 transactional memory (TM) by incorporating with the
                 emerging non-volatile memory (NVM). PTM dynamically
                 tracks transactional updates to cache lines to ensure
                 the ACI (atomicity, consistency and isolation)
                 properties during cache flushes and leverages an undo
                 log in NVM to ensure PTM can always consistently
                 recover transactional data structures from a machine
                 crash. This paper describes the PTM design based on
                 Intel's restricted transactional memory. A preliminary
                 evaluation using a concurrent key/value store and a
                 database with a cache-based simulator shows that the
                 additional cache line flushes are small.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, ZG (Reprint Author), Shanghai Jiao Tong Univ,
                 Shanghai Key Lab Scalable Comp \& Syst, Shanghai
                 200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
                 Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
                 Univ, Shanghai Key Lab Scalable Comp \& Syst, Shanghai
                 200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
                 Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
                 Univ, Inst Parallel \& Distributed Syst, Shanghai
                 200030, Peoples R China.",
  author-email = "tigerwang1986@gmail.com ken.yihan1990@gmail.com
                 naruilone@gmail.com mingkaidong@gmail.com
                 haibochen@sjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ACI properties; Batteries; cache line flushes; cache
                 storage; cache-based simulator; Computer crashes; Data
                 structures; Databases; Hardware; Hardware transactional
                 memory; non-volatile random access memory; Nonvolatile
                 memory; nonvolatile memory; NVM; persistent
                 transactional memory; PTM design; Registers",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Wang:2015:PTM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Next-generation performance monitoring unit (PMU) letter.  The
%%% accented author surname uses the braced accent form
%%% "Mart{\'\i}nez" (dotless i) required for correct sorting and
%%% labeling under classic BibTeX.
@Article{Gibert:2015:PSR,
  author =       "Enric Gibert and Raul Mart{\'\i}nez and Carlos
                 Madriles and Josep M. Codina",
  title =        "Profiling Support for Runtime Managed Code: Next
                 Generation Performance Monitoring Units",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "62--65",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2321398",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Given the increase of runtime managed code
                 environments in desktop, server, and mobile segments,
                 agile, flexible, and accurate performance monitoring
                 capabilities are required in order to perform wise code
                 transformations and optimizations. Common profiling
                 strategies, mainly based on instrumentation and current
                 performance monitoring units (PMUs), are not adequate
                 and new innovative designs are necessary. In this
                 paper, we present the desired characteristics of what
                 we call next generation PMUs and advocate for
                 hardware/software collaborative approaches where
                 hardware implements the profiling hooks and mechanisms
                 and software implements the complex heuristics. We then
                 propose a first design in which the hardware uses a
                 small, yet flexible table to profile specific code
                 regions and the software decides what/when/how to
                 profile. This first design meets all required features
                 and we aim it as the seed for future PMUs extensions to
                 enable novel dynamic code transformations and
                 optimizations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gibert, E (Reprint Author), Intel Corp, Intel Labs,
                 Intel Barcelona Res Ctr IBRC, Edifici Nexus 2, Planta
                 0-D, Jordi Girona 29, Barcelona, Spain. Gibert, Enric;
                 Martinez, Raul; Madriles, Carlos; Codina, Josep M.,
                 Intel Corp, Intel Labs, Intel Barcelona Res Ctr IBRC,
                 Barcelona, Spain.",
  author-email = "enric.gibert.codina@intel.com raul.martinez@intel.com
                 carlos.madriles.gimeno@intel.com
                 josep.m.codina@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "dynamic code optimizations; dynamic code
                 transformations; groupware; Hardware; hardware-software
                 collaborative approaches; instrumentation; Instruments;
                 just in time (JIT) compiler; Monitoring; next
                 generation performance monitoring units; optimising
                 compilers; Optimization; Performance monitoring unit
                 (PMU); Phasor measurement units; PMUs; profiling;
                 profiling hooks; profiling support; Runtime; runtime
                 managed code; runtime managed code environments;
                 Software; software performance evaluation; system
                 monitoring",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Gibert:2015:PSR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{You:2015:QSA,
  author =       "Daecheol You and Ki-Seok Chung",
  title =        "Quality of Service-Aware Dynamic Voltage and Frequency
                 Scaling for Embedded {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "66--69",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2319079",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Dynamic voltage and frequency scaling (DVFS) is a key
                 technique for reducing processor power consumption in
                 mobile devices. In recent years, mobile system-on-chips
                 (SoCs) has supported DVFS for embedded graphics
                 processing units (GPUs) as the processing power of
                 embedded GPUs has been increasing steadily. The major
                 challenge of applying DVFS to a processing unit is to
                 meet the quality of service (QoS) requirement while
                 achieving a reasonable power reduction. In the case of
                 GPUs, the QoS requirement can be specified as the
                 frame-per-second (FPS) which the target GPU should
                 achieve. The proposed DVFS technique ensures a
                 consistent GPU performance by scaling the operating
                 clock frequency in a way that it maintains a uniform
                 FPS.",
  acknowledgement = ack-nhfb,
  affiliation =  "You, D (Reprint Author), Hanyang Univ, Dept Elect Comp
                 \& Commun Engn, Embedded Syst Chip Lab, Seoul 133791,
                 South Korea. You, Daecheol; Chung, Ki-Seok, Hanyang
                 Univ, Dept Elect Comp \& Commun Engn, Embedded Syst
                 Chip Lab, Seoul 133791, South Korea.",
  author-email = "khsrdc@hanyang.ac.kr kchung@hanyang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Clocks; Correlation; DVFS; dynamic
                 voltage scaling; embedded GPU; Energy consumption;
                 energy-aware systems; frequency scaling; graphics
                 processing unit; Graphics processing units; graphics
                 processing units; Graphics processors;
                 hardware/software interfaces; low-power design; mobile
                 device; mobile system-on-chips; operating clock
                 frequency; power aware computing; processor power
                 consumption; Quality of service; quality of service;
                 SoC; System-on-chip; system-on-chip",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "You:2015:QSA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lee:2015:RDA,
  author =       "Sungjin Lee and Jihong Kim and Arvind",
  title =        "Refactored Design of {I/O} Architecture for Flash
                 Storage",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "70--74",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329423",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Flash storage devices behave quite differently from
                 hard disk drives (HDDs); a page on flash has to be
                 erased before it can be rewritten, and the erasure has
                 to be performed on a block which consists of a large
                 number of contiguous pages. It is also important to
                 distribute writes evenly among flash blocks to avoid
                 premature wearing. To achieve interoperability with
                 existing block I/O subsystems for HDDs, NAND flash
                 devices employ an intermediate software layer, called
                 the flash translation layer (FTL), which hides these
                 differences. Unfortunately, FTL implementations require
                 powerful processors with a large amount of DRAM in
                 flash controllers and also incur many unnecessary I/O
                 operations which degrade flash storage performance and
                 lifetime. In this paper, we present a refactored design
                 of I/O architecture for flash storage which
                 dramatically increases storage performance and lifetime
                 while decreasing the cost of the flash controller. In
                 comparison with page-level FTL, our preliminary
                 experiments show a reduction of 19 percent in I/O
                 operations, improvement of I/O performance by 9 percent
                 and storage lifetime by 36 percent. In addition, our
                 scheme uses only 1/128 DRAM memory in the flash
                 controller.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lee, S (Reprint Author), MIT, 77 Massachusetts Ave,
                 Cambridge, MA 02139 USA. Lee, Sungjin; Arvind, MIT,
                 Cambridge, MA 02139 USA. Kim, Jihong, Seoul Natl Univ,
                 Sch Comp Sci \& Engn, Seoul, South Korea.",
  author-email = "chamdoo@gmail.com jihong@davinci.snu.ac.kr
                 arvind@csail.mit.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; block I/O subsystems; Computer
                 architecture; DRAM chips; DRAM memory; file systems;
                 flash blocks; flash memories; flash storage; flash
                 translation layer; hard disk drives; HDDs; I/O
                 architecture; I/O architectures; input-output programs;
                 intermediate software layer; interoperability; NAND
                 circuits; NAND flash devices; NAND flash memory;
                 page-level FTL; Performance evaluation; premature
                 wearing; Random access memory; Runtime; Storage
                 management; Storage systems",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Lee:2015:RDA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yuan:2015:SGR,
  author =       "Fengkai Yuan and Zhenzhou Ji and Suxia Zhu",
  title =        "Set-Granular Regional Distributed Cooperative
                 Caching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "75--78",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2319258",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The last level cache (LLC) in private configurations
                 offer lower latency and isolation but extinguishes the
                 possibility of sharing underutilized cache resources.
                 Cooperative Caching (CC) provides capacity sharing by
                 spilling a line evicted from one cache to another.
                 Current studies focus on efficient capacity sharing,
                 while the adaptability of CC to manycore environment
                 deserves more attentions. In this paper, we present
                 Set-granular Regional Distributed Cooperative Caching
                 to optimize CC in manycore CMPs with private LLCs. We
                 achieve efficient capacity sharing by a low-traffic
                 global receiver tracking mechanism and provide a method
                 to manage set-grain cache state transitions for
                 exclusive LLCs. Experiment results show that SRDCC
                 performs better than baseline system, running different
                 workloads varying in receiver-spiller number and
                 distribution, in execution time up to 15.55 percent and
                 memory access up to 40.25 percent, at a negligible cost
                 of network traffics (6.21 percent more than baseline
                 system at worst).",
  acknowledgement = ack-nhfb,
  affiliation =  "Yuan, FK (Reprint Author), Harbin Inst Technol, Sch
                 Comp Sci \& Technol, Harbin 150006, Heilongjiang,
                 Peoples R China. Yuan, Fengkai; Ji, Zhenzhou; Zhu,
                 Suxia, Harbin Inst Technol, Sch Comp Sci \& Technol,
                 Harbin 150006, Heilongjiang, Peoples R China.",
  author-email = "yuan.fengkai@gmail.com jizhenzhou@hit.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache coherence protocol; cache resource sharing;
                 Cache storage; cache storage; capacity sharing; CC;
                 chip multiprocessors; cooperative caching; Cooperative
                 caching; last level cache; LLC; manycore CMP;
                 multiprocessing systems; on-chip networks; private
                 cache configuration; Protocols; Radiation detectors;
                 receiver-spiller distribution; receiver-spiller number;
                 Receivers; set-grain cache state transition;
                 set-granular regional distributed cooperative caching;
                 Telecommunication traffic; Tiled CMP",
  keywords-plus = "CHIP MULTIPROCESSORS",
  number-of-cited-references = "9",
  ORCID-numbers = "Yuan, Fengkai/0000-0003-2615-8642",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Yuan:2015:SGR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lee:2015:SSI,
  author =       "Junghee Lee and Youngjae Kim and Jongman Kim and Galen
                 M. Shipman",
  title =        "Synchronous {I/O} Scheduling of Independent Write
                 Caches for an Array of {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "79--82",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2298394",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Solid-state drives (SSD) offer a significant
                 performance improvement over the hard disk drives
                 (HDD), however, it can exhibit a significant variance
                 in latency and throughput due to internal garbage
                 collection (GC) process on the SSD. When the SSDs are
                 configured in a RAID, the performance variance of
                 individual SSDs could significantly degrade the overall
                 performance of the RAID of SSDs. The internal cache on
                 the RAID controller can help mitigate the performance
                 variability issues of SSDs in the array; however, the
                 state-of-the-art cache algorithm of the RAID controller
                 does not consider the characteristics of SSDs. In this
                 paper, we examine the most recent write cache algorithm
                 for the array of disks, and propose a synchronous
                 independent write cache (SIW) algorithm. We also
                 present a pre-parity-computation technique for the RAID
                 of SSDs with parity computations, which calculates
                 parities of blocks in advance before they are stored in
                 the write cache. With this new technique, we propose a
                 complete paradigm shift in the design of write cache.
                 In our evaluation study, large write requests dominant
                 workloads show up to about 50 and 20 percent
                 improvements in average response times on RAID-0 and
                 RAID-5 respectively as compared to the state-of-the-art
                 write cache algorithm.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lee, J (Reprint Author), Univ Texas San Antonio, San
                 Antonio, TX 78229 USA. Lee, Junghee, Univ Texas San
                 Antonio, San Antonio, TX 78229 USA. Kim, Youngjae, Ajou
                 Univ, Suwon 441749, South Korea. Kim, Jongman, Georgia
                 Inst Technol, Atlanta, GA 30332 USA. Shipman, Galen M.,
                 Oak Ridge Natl Lab, Oak Ridge, TN USA.",
  author-email = "junghee.lee@utsa.edu youkim@gmail.com
                 jkim@ece.gatech.edu gshipman@ornl.gov",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; Arrays; cache storage;
                 Delays; disks array; flash memory; GC process; hard
                 disk drives; HDD; I/O scheduling; independent write
                 caches; input-output programs; internal cache; internal
                 garbage collection process; memory architecture;
                 pre-parity-computation technique; RAID; RAID
                 controller; Redundant array of independent disks
                 (RAID); Redundant Array of Independent Disks (RAID);
                 Redundant array of independent disks (RAID);
                 scheduling; SIW algorithm; solid-state drive (SSD);
                 Solid-State Drive (SSD); solid-state drive (SSD);
                 solid-state drives; SSD; Strips; Synchronization;
                 synchronous I/O scheduling; synchronous independent
                 write cache algorithm; Time factors; write cache; Write
                 cache; write cache; write cache design; write
                 requests",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Lee:2015:SSI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2015:RSW,
  author =       "Anonymous",
  title =        "Rock Stars of Wearables",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "83--83",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2447192",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:RSC,
  author =       "Anonymous",
  title =        "Rock Stars of Cybersecurity 2015 Conference",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "84--84",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2447191",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:TCa,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C1--C1",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446391",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:ICAa,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Editorial Board}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C2--C2",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446392",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:ICAb,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                  Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C3--C3",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446393",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C4--C4",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446394",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Shi:2015:CLM,
  author =       "Qingchuan Shi and Henry Hoffmann and Omer Khan",
  title =        "A Cross-Layer Multicore Architecture to Tradeoff
                 Program Accuracy and Resilience Overheads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "85--89",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2365204",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To protect multicores from soft-error perturbations,
                 resiliency schemes have been developed with high
                 coverage but high power/performance overheads (similar
                 to 2x). We observe that not all soft-errors affect
                 program correctness, some soft-errors only affect
                 program accuracy, i.e., the program completes with
                 certain acceptable deviations from soft-error free
                 outcome. Thus, it is practical to improve processor
                 efficiency by trading off resilience overheads with
                 program accuracy. We propose the idea of declarative
                 resilience that selectively applies resilience schemes
                 to both crucial and non-crucial code, while ensuring
                 program correctness. At the application level, crucial
                 and non-crucial code is identified based on its impact
                 on the program outcome. The hardware collaborates with
                 software support to enable efficient resilience with
                 100 percent soft-error coverage. Only program accuracy
                 is compromised in the worst-case scenario of a
                 soft-error strike during non-crucial code execution.
                 For a set of multithreaded benchmarks, declarative
                 resilience improves completion time by an average of 21
                 percent over state-of-the-art hardware resilience
                 scheme that protects all executed code. Its performance
                 overhead is similar to 1.38x over a multicore that does
                 not support resilience.",
  acknowledgement = ack-nhfb,
  affiliation =  "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect
                 \& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan;
                 Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn,
                 Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago,
                 Dept Comp Sci, Chicago, IL 60637 USA.",
  author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu
                 khan@uconn.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Benchmark testing; code execution;
                 Instruction sets; multi-threading; multicore
                 architecture; Multicore processing; multicores;
                 multithreaded benchmark; program accuracy; Resilience;
                 resilience overhead; Soft errors; soft-error
                 perturbation; soft-errors; software architecture;
                 software fault tolerance",
  number-of-cited-references = "23",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Shi:2015:CLM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zheng:2015:ACC,
  author =       "Zhong Zheng and Zhiying Wang and Mikko Lipasti",
  title =        "Adaptive Cache and Concurrency Allocation on
                 {GPGPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "90--93",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2359882",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory bandwidth is critical to GPGPU performance.
                 Exploiting locality in caches can better utilize memory
                 bandwidth. However, memory requests issued by excessive
                 threads cause cache thrashing and saturate memory
                 bandwidth, degrading performance. In this paper, we
                 propose adaptive cache and concurrency allocation (CCA)
                 to prevent cache thrashing and improve the utilization
                 of bandwidth and computational resources, hence
                 improving performance. According to locality and reuse
                 distance of access patterns in GPGPU program, warps on
                 a stream multiprocessor are dynamically divided into
                 three groups: cached, bypassed, and waiting. The data
                 cache accommodates the footprint of cached warps.
                 Bypassed warps cannot allocate cache lines in the data
                 cache to prevent cache thrashing, but are able to take
                 advantage of available memory bandwidth and
                 computational resource. Waiting warps are de-scheduled.
                 Experimental results show that adaptive CCA can
                 significant improve benchmark performance, with 80
                 percent harmonic mean IPC improvement over the
                 baseline.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zheng, Z (Reprint Author), Natl Univ Def Technol,
                 State Key Lab High Performance Comp, Changsha, Hunan,
                 Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ
                 Def Technol, State Key Lab High Performance Comp,
                 Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang,
                 Zhiying, Natl Univ Def Technol, Sch Comp, Changsha,
                 Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin,
                 Dept Elect \& Comp Engn, Madison, WI 54706 USA.",
  author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn
                 mikko@engr.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC
                 [61070037, 61272143, 61272144, 61103016, 61202121];
                 NUDT [B120607]; RFDP [20114307120013]; NSF
                 [CCF-1318298]",
  funding-text = "This work was partially supported by CSC, 863 Program
                 (2012AA010905), NSFC (61070037, 61272143, 61272144,
                 61103016, 61202121), NUDT(B120607), RFDP
                 (20114307120013), and NSF (CCF-1318298).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access patterns; adaptive cache-and-concurrency
                 allocation; Bandwidth; bandwidth utilization
                 improvement; benchmark performance improvement;
                 Benchmark testing; bypassed warps; cache; cache lines;
                 cache locality; Cache memory; cache storage; cache
                 thrashing prevention; cached warps; CCA; computational
                 resource utilization improvement; concurrency;
                 concurrency control; Concurrent computing; GPGPU; GPGPU
                 performance improvement; graphics processing units;
                 harmonic mean IPC improvement; Instruction sets; memory
                 bandwidth saturation; multi-threading; multiprocessing
                 systems; performance evaluation; Resource management;
                 reuse distance; stream multiprocessor; waiting warp
                 descheduling",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Zheng:2015:ACC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Nowatzki:2015:GBP,
  author =       "Tony Nowatzki and Venkatraman Govindaraju and
                 Karthikeyan Sankaralingam",
  title =        "A Graph-Based Program Representation for Analyzing
                 Hardware Specialization Approaches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "94--98",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2476801",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware specialization has emerged as a promising
                 paradigm for future microprocessors. Unfortunately, it
                 is natural to develop and evaluate such architectures
                 within end-to-end vertical silos spanning application,
                 language/compiler, hardware design and evaluation
                 tools, leaving little opportunity for
                 cross-architecture analysis and innovation. This paper
                 develops a novel program representation suitable for
                 modeling heterogeneous architectures with specialized
                 hardware, called the transformable dependence graph
                 (TDG), which combines semantic information about
                 program properties and low-level hardware events in a
                 single representation. We demonstrate, using four
                 example architectures from the literature, that the TDG
                 is a feasible, simple, and accurate modeling technique
                 for transparent specialization architectures, enabling
                 cross-domain comparison and design-space exploration.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nowatzki, T (Reprint Author), Univ Wisconsin, Dept
                 Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA.
                 Nowatzki, Tony; Govindaraju, Venkatraman;
                 Sankaralingam, Karthikeyan, Univ Wisconsin, Dept Comp
                 Sci, Madison, WI 53706 USA.",
  author-email = "tjn@cs.wisc.edu venkatra@cs.wisc.edu
                 karu@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; accelerators; computer architecture;
                 Computer architecture; dependence graphs; graph theory;
                 graph-based program representation; Hardware
                 specialization; hardware specialization approach;
                 heterogeneous architecture modeling; Load modeling;
                 Microarchitecture; microprocessors; Microprocessors;
                 modelling; program representation; Specialization;
                 Specialization, accelerators, modelling, program
                 representation, dependence graphs; TDG; transformable
                 dependence graph; Transforms",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Nowatzki:2015:GBP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2015:PEM,
  author =       "Seung Hun Kim and Dohoon Kim and Changmin Lee and Won
                 Seob Jeong and Won Woo Ro and Jean-Luc Gaudiot",
  title =        "A Performance-Energy Model to Evaluate Single Thread
                 Execution Acceleration",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "99--102",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368144",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "It is well known that the cost of executing the
                 sequential portion of a program will limit and
                 sometimes even eclipse the gains brought by processing
                 in parallel the rest of the program. This means that
                 serious consideration should be brought to bear on
                 accelerating the execution of this unavoidable
                 sequential part. Such acceleration can be done by
                 boosting the operating frequency in a symmetric
                 multicore processor. In this paper, we derive a
                 performance and power model to describe the
                 implications of this approach. From our model, we show
                 that the ratio of performance over energy during the
                 sequential part improves with an increase in the number
                 of cores. In addition, we demonstrate how to determine
                 with the proposed model the optimal frequency boosting
                 ratio which maximizes energy efficiency.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, SH (Reprint Author), Yonsei Univ, Sch Elect \&
                 Elect Engn, Seoul 120749, South Korea. Kim, Seung Hun;
                 Kim, Dohoon; Lee, Changmin; Jeong, Won Seob; Ro, Won
                 Woo, Yonsei Univ, Sch Elect \& Elect Engn, Seoul
                 120749, South Korea. Gaudiot, Jean-Luc, Univ Calif
                 Irvine, Dept Elect Engn \& Comp Sci, Irvine, CA USA.",
  author-email = "kseunghun@gmail.com dohoon.kim@yonsei.ac.kr
                 exahz@yonsei.ac.kr ws.jeong@yonsei.ac.kr
                 wro@yonsei.ac.kr gaudiot@uci.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                 National Research Foundation of Korea (NRF) ---
                 Ministry of Education [2010-0013202]; National Science
                 Foundation [CCF-1439165]",
  funding-text = "This work was supported in part by the Basic Science
                 Research Program through the National Research
                 Foundation of Korea (NRF) funded by the Ministry of
                 Education (2010-0013202) and by the National Science
                 Foundation, under award CCF-1439165. Any opinions,
                 findings, and conclusions expressed in this material
                 are those of the authors and do not necessarily reflect
                 the views of the sponsors. W. W. Ro is the
                 corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "energy efficiency; Energy management; energy-aware
                 systems; Mathematical model; Microprocessors; Multicore
                 processing; multiprocessing systems; multiprocessor
                 systems; optimal frequency boosting ratio; parallel
                 processing; performance evaluation; Performance
                 evaluation; Performance modeling; performance-energy
                 model; power aware computing; Power demand; single
                 thread execution acceleration; symmetric multicore
                 processor",
  keywords-plus = "AMDAHLS LAW; ERA",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2015:PEM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Song:2015:ARL,
  author =       "William Song and Saibal Mukhopadhyay and Sudhakar
                 Yalamanchili",
  title =        "Architectural Reliability: Lifetime Reliability
                 Characterization and Management of Many-Core
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "103--106",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2340873",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper presents a lifetime reliability
                 characterization of many-core processors based on a
                 full-system simulation of integrated microarchitecture,
                 power, thermal, and reliability models. Under normal
                 operating conditions, our model and analysis reveal
                 that the mean-time-to-failure of cores on the die show
                 normal distribution. From the processor-level
                 perspective, the key insight is that reducing the
                 variance of the distribution can improve lifetime
                 reliability by avoiding early failures. Based on this
                 understanding, we present two variance reduction
                  techniques for proactive reliability management: (i)
                 proportional dynamic voltage-frequency scaling (DVFS)
                 and (ii) coordinated thread swapping. A major advantage
                 of using variance reduction techniques is that the
                 improvement of system lifetime reliability can be
                 achieved without adding design margins or spare
                 components.",
  acknowledgement = ack-nhfb,
  affiliation =  "Song, W (Reprint Author), Georgia Inst Technol, Sch
                 Elect \& Comp Engn, Atlanta, GA 30332 USA. Song,
                 William; Mukhopadhyay, Saibal; Yalamanchili, Sudhakar,
                 Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta,
                 GA 30332 USA.",
  author-email = "wjhsong@gatech.edu saibal.mukhopadhyay@ece.gatech.edu
                 sudha.yalamanchili@ece.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Semiconductor Research Corporation
                 [2084.001]; IBM/SRC Graduate Fellowship; Sandia
                 National Laboratories",
  funding-text = "This research was supported by the Semiconductor
                 Research Corporation under task \#2084.001, IBM/SRC
                 Graduate Fellowship, and Sandia National
                 Laboratories.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural reliability; Benchmark testing; Computer
                 architecture; Computer architecture, lifetime
                 estimation, modeling, semiconductor device reliability,
                 simulation; coordinated thread swapping; core
                 mean-time-to-failure; Degradation; design margins;
                 DVFS; full-system simulation; Gaussian distribution;
                 integrated circuit design; Integrated circuit
                 reliability; integrated microarchitecture; lifetime
                 estimation; lifetime reliability characterization;
                 many-core processors; Microarchitecture; microprocessor
                 chips; modeling; multiprocessing systems; normal
                 operating conditions; power aware computing; power
                 models; Program processors; proportional dynamic
                 voltage-frequency scaling; reliability models;
                 semiconductor device reliability; simulation; spare
                 components; thermal models; variance reduction
                 techniques",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Song:2015:ARL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Poluri:2015:SET,
  author =       "Pavan Poluri and Ahmed Louri",
  title =        "A Soft Error Tolerant Network-on-Chip Router Pipeline
                 for Multi-Core Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "107--110",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360686",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Network-on-Chip (NoC) paradigm is rapidly evolving
                 into an efficient interconnection network to handle the
                 strict communication requirements between the
                 increasing number of cores on a single chip.
                 Diminishing transistor size is making the NoC
                 increasingly vulnerable to both hard faults and soft
                 errors. This paper concentrates on soft errors in NoCs.
                 A soft error in an NoC router results in significant
                 consequences such as data corruption, packet
                 retransmission and deadlock among others. To this end,
                 we propose Soft Error Tolerant NoC Router (STNR)
                 architecture, that is capable of detecting and
                 recovering from soft errors occurring in different
                 control stages of the routing pipeline. STNR exploits
                 the use of idle cycles inherent in NoC packet routing
                 pipeline to perform time redundant executions necessary
                 for soft error tolerance. In doing so, STNR is able to
                 detect and correct all single transient faults in the
                 control stages of the pipeline. Simulation results
                 using PARSEC and SPLASH-2 benchmarks show that STNR is
                 able to accomplish such high level of soft error
                 protection with a minimal impact on latency (an
                 increase of 1.7 and 1.6 percent respectively).
                 Additionally, STNR incurs an area overhead of 7 percent
                 and power overhead of 13 percent as compared to the
                 baseline unprotected router.",
  acknowledgement = ack-nhfb,
  affiliation =  "Poluri, P (Reprint Author), Univ Arizona, Dept Elect
                 \& Comp Engn, Tucson, AZ 85721 USA. Poluri, Pavan;
                 Louri, Ahmed, Univ Arizona, Dept Elect \& Comp Engn,
                 Tucson, AZ 85721 USA.",
  author-email = "pavanp@email.arizona.edu louri@email.arizona.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation (NSF)
                 [CNS-1318997, ECCS-0725765, ECCS-1342702,
                 CCF-1420681]",
  funding-text = "This research was supported by US National Science
                 Foundation (NSF) awards CNS-1318997, ECCS-0725765,
                 ECCS-1342702 and CCF-1420681.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; data corruption; deadlock;
                 fault tolerance; hard faults; idle cycles; integrated
                 circuit reliability; interconnection network; Multicore
                 processing; multicore systems; multiprocessing systems;
                 network routing; Network-on-chip; network-on-chip;
                 Network-on-chip; NoC packet routing pipeline; packet
                 retransmission; PARSEC; performance; Pipelines; Ports
                 (Computers); radiation hardening (electronics);
                 reliability; Resource management; single chip; single
                 transient faults; soft error; soft error protection;
                 soft error tolerance; soft error tolerant
                 network-on-chip router pipeline; soft error tolerant
                 NoC router architecture; SPLASH-2 benchmarks; STNR
                 architecture; Switches; time redundant executions;
                 Transient analysis; transistor size",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Poluri:2015:SET",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Xiao:2015:SCD,
  author =       "Canwen Xiao and Yue Yang and Jianwen Zhu",
  title =        "A Sufficient Condition for Deadlock-Free Adaptive
                 Routing in Mesh Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "111--114",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2363829",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Deadlock remains a central problem in interconnection
                 network. In this paper, we establish a new theory of
                 deadlock-free flow control for k-ary, n-cube mesh
                 network, which enables the use of any minimal-path
                 adaptive routing algorithms while avoiding deadlock. We
                 prove that the proposed flow control algorithm is a
                 sufficient condition for deadlock freedom in any
                 minimal path, adaptive routing algorithms on k-ary,
                 n-cube mesh network.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xiao, CW (Reprint Author), Natl Univ Def Technol,
                 Changsha, Hunan, Peoples R China. Xiao, Canwen, Natl
                 Univ Def Technol, Changsha, Hunan, Peoples R China.
                 Yang, Yue; Zhu, Jianwen, Univ Toronto, Dept Elect \&
                 Comp Engn, Toronto, ON, Canada.",
  author-email = "cwxiao@nudt.edu.cn yyang@eecg.toronto.edu
                 jzhu@eecg.toronto.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "``863'' program of China [2012AA01A301,
                 2013AA014301]",
  funding-text = "This work is supported by ``863'' program of China
                 (2012AA01A301, 2013AA014301).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adaptive systems; Aerospace electronics; concurrency
                 control; deadlock avoidance; Deadlock-Free;
                 deadlock-free adaptive routing; deadlock-free flow
                 control; flow control; interconnection network; k-ary;
                 k-ary mesh network; mesh networks; Mesh networks;
                 minimal path routing algorithm; minimal-path adaptive
                 routing algorithms; Multiprocessor interconnection;
                 multiprocessor interconnection networks; n-cube mesh
                 network; Routing; sufficient condition; System
                 recovery; Wireless mesh networks",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  researcherid-numbers = "Yang, Yue/N-8370-2019",
  times-cited =  "1",
  unique-id =    "Xiao:2015:SCD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Mittal:2015:ATE,
  author =       "Sparsh Mittal and Jeffrey S. Vetter",
  title =        "{AYUSH}: a Technique for Extending Lifetime of
                 {SRAM--NVM} Hybrid Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "115--118",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2355193",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Recently, researchers have explored way-based hybrid
                 SRAM-NVM (non-volatile memory) last level caches (LLCs)
                 to bring the best of SRAM and NVM together. However,
                 the limited write endurance of NVMs restricts the
                 lifetime of these hybrid caches. We present AYUSH, a
                 technique to enhance the lifetime of hybrid caches,
                 which works by using data-migration to preferentially
                 use SRAM for storing frequently-reused data.
                 Microarchitectural simulations confirm that AYUSH
                 achieves larger improvement in lifetime than a previous
                 technique and also maintains performance and energy
                 efficiency. For single, dual and quad-core workloads,
                 the average increase in cache lifetime with AYUSH is
                 6.90, 24.06 and 47.62x, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mittal, S (Reprint Author), Oak Ridge Natl Lab, Div
                 Math \& Comp Sci, Oak Ridge, TN 37831 USA. Mittal,
                 Sparsh; Vetter, Jeffrey S., Oak Ridge Natl Lab, Div
                 Math \& Comp Sci, Oak Ridge, TN 37831 USA.",
  author-email = "mittals@ornl.gov vetter@ornl.gov",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AYUSH; Benchmark testing; Cache memory; cache storage;
                 data-migration; device lifetime; energy efficiency;
                 Energy loss; hybrid cache; last level caches;
                 microarchitectural simulation; Non-volatile memory
                 (NVM); nonvolatile memory; Nonvolatile memory;
                 Radiation detectors; Random access memory; SRAM; SRAM
                 chips; SRAM-NVM cache; SRAM-NVM hybrid caches; write
                 endurance",
  keywords-plus = "ENERGY; MODEL",
  number-of-cited-references = "17",
  ORCID-numbers = "Vetter, Jeffrey/0000-0002-2449-6720 Mittal,
                 Sparsh/0000-0002-2908-993X",
  research-areas = "Computer Science",
  times-cited =  "11",
  unique-id =    "Mittal:2015:ATE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Manohar:2015:CSD,
  author =       "Rajit Manohar",
  title =        "Comparing Stochastic and Deterministic Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "119--122",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2412553",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Technology scaling has raised the specter of myriads
                 of cheap, but unreliable and/or stochastic devices that
                 must be creatively combined to create a reliable
                 computing system. This has renewed the interest in
                  computing that exploits stochasticity --- embracing, not
                  combating, the device physics. If a stochastic
                 representation is used to implement a programmable
                 general-purpose architecture akin to CPUs, GPUs, or
                 FPGAs, the preponderance of evidence indicates that
                 most of the system energy will be expended in
                 communication and storage as opposed to computation.
                 This paper presents an analytical treatment of the
                 benefits and drawbacks of adopting a stochastic
                 approach by examining the cost of representing a value.
                 We show both scaling laws and costs for low precision
                 representations. We also analyze the cost of
                 multiplication implemented using stochastic versus
                 deterministic approaches, since multiplication is the
                 prototypical inexpensive stochastic operation. We show
                 that the deterministic approach compares favorably to
                 the stochastic approach when holding precision and
                 reliability constant.",
  acknowledgement = ack-nhfb,
  affiliation =  "Manohar, R (Reprint Author), Cornell Univ, Cornell
                 Tech, New York, NY 10011 USA. Manohar, Rajit, Cornell
                 Univ, Cornell Tech, New York, NY 10011 USA.",
  author-email = "rajit@csl.cornell.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Complexity theory; Computer architecture;
                 deterministic computing; Encoding; field programmable
                 gate arrays; FPGAs; general-purpose architecture; GPUs;
                 graphics processing units; Logic gates; Receivers;
                 reliable computing system; stochastic computing;
                 Stochastic processes; stochastic processes; stochastic
                 representation",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Manohar:2015:CSD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Seo:2015:DDF,
  author =       "Bon-Keun Seo and Seungryoul Maeng and Joonwon Lee and
                 Euiseong Seo",
  title =        "{DRACO}: a Deduplicating {FTL} for Tangible Extra
                 Capacity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "123--126",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2350984",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The rapid random access of SSDs enables efficient
                 searching of redundant data and their deduplication.
                 However, the space earned from deduplication cannot be
                 used as permanent storage because it must be reclaimed
                 when deduplication is cancelled as a result of an
                 update to the deduplicated data. To overcome this
                 limitation, we propose a novel FTL scheme that enables
                 the gained capacity to be used as permanent storage
                 space for the file system layer. The proposed approach
                 determines the safe amount of gained capacity that can
                 be provided to the upper layer based on the compression
                 rate prediction scheme. It then secures the required
                 space by compressing cold data when capacity overflow
                 occurs from cancelled deduplication. Our evaluation
                 with a kernel source repository showed that the file
                 system obtained approximately 79 percent additional
                 capacity by the proposed scheme.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seo, BK (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon 305701, South Korea.
                 Seo, Bon-Keun; Maeng, Seungryoul, Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon 305701, South Korea.
                 Lee, Joonwon; Seo, Euiseong, Sungkyunkwan Univ, Coll
                 Informat \& Commun Engn, Suwon 440746, South Korea.",
  author-email = "joonwon@skku.edu euiseong@skku.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                 National Research Foundation of Korea
                 [2012R1A1A2A10038823]",
  funding-text = "This research was supported by Basic Science Research
                 Program through the National Research Foundation of
                 Korea (2012R1A1A2A10038823).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "capacity overflow; cold data compression; compression;
                 compression rate prediction scheme; data compression;
                 data deduplication; Data structures; deduplicating FTL;
                 deduplication; disc drives; DRACO; Entropy; file system
                 layer; file systems; File systems; file systems; flash
                 memories; flash memory; Flash memory; flash memory;
                 flash translation layer; FTL; kernel source repository;
                 Linux; over-provisioning; permanent storage space;
                 rapid random access; redundant data searching; SDRAM;
                 SSD; storage management; storage reclamation; tangible
                 extra capacity",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
  times-cited =  "2",
  unique-id =    "Seo:2015:DDF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Seshadri:2015:FBB,
  author =       "Vivek Seshadri and Kevin Hsieh and Amirali Boroumand and
                 Donghyuk Lee and Michael A. Kozuch and Onur Mutlu and
                 Phillip B. Gibbons and Todd C. Mowry",
  title =        "Fast Bulk Bitwise {AND} and {OR} in {DRAM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "127--131",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2434872",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Bitwise operations are an important component of
                 modern day programming, and are used in a variety of
                 applications such as databases. In this work, we
                 propose a new and simple mechanism to implement bulk
                 bitwise AND and OR operations in DRAM, which is faster
                 and more efficient than existing mechanisms. Our
                 mechanism exploits existing DRAM operation to perform a
                 bitwise AND/OR of two DRAM rows completely within DRAM.
                 The key idea is to simultaneously connect three cells
                 to a bitline before the sense-amplification. By
                 controlling the value of one of the cells, the sense
                 amplifier forces the bitline to the bitwise AND or
                 bitwise OR of the values of the other two cells. Our
                 approach can improve the throughput of bulk bitwise
                 AND/OR operations by 9.7X and reduce their energy
                  consumption by 50.5X. Since our approach exploits
                 existing DRAM operation as much as possible, it
                 requires negligible changes to DRAM logic. We evaluate
                 our approach using a real-world implementation of a
                 bit-vector based index for databases. Our mechanism
                 improves the performance of commonly-used range queries
                 by 30 percent on average.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seshadri, V (Reprint Author), Carnegie Mellon Univ,
                 Pittsburgh, PA 15213 USA. Seshadri, Vivek; Hsieh,
                  Kevin; Boroumand, Amirali; Lee, Donghyuk; Mutlu, Onur;
                 Mowry, Todd C., Carnegie Mellon Univ, Pittsburgh, PA
                 15213 USA. Kozuch, Michael A.; Gibbons, Phillip B.,
                 Intel Pittsburgh, Pittsburgh, PA USA.",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [0953246, 1212962, 1320531]; Intel
                 Science and Tech. Center; Samsung; Google; Facebook;
                 SRC",
  funding-text = "This work was supported by NSF (awards 0953246,
                 1212962, and 1320531), and Intel Science and Tech.
                 Center, Samsung, Google, Facebook, and SRC.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bit-vector based index; bitwise AND/OR; bulk-bitwise
                 AND operation; bulk-bitwise OR operation; Capacitors;
                 cell value control; Computer architecture; database
                 indexing; Decoding; DRAM; DRAM chips; DRAM memory; DRAM
                 memory, bitwise AND/OR, performance; DRAM operation;
                 energy consumption reduction; logic gates; performance;
                 performance improvement; Program processors; Random
                 access memory; range queries; sense amplifier;
                 sense-amplification; Throughput; throughput
                 improvement",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "21",
  unique-id =    "Seshadri:2015:FBB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Altaf:2015:LPM,
  author =       "Muhammad Shoaib Bin Altaf and David A. Wood",
  title =        "{LogCA}: a Performance Model for Hardware
                 Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "132--135",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360182",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "To address the Dark Silicon problem, architects have
                 increasingly turned to special-purpose hardware
                 accelerators to improve the performance and energy
                 efficiency of common computational kernels, such as
                 encryption and compression. Unfortunately, the latency
                 and overhead required to off-load a computation to an
                 accelerator sometimes outweighs the potential benefits,
                 resulting in a net decrease in performance or energy
                 efficiency. To help architects and programmers reason
                 about these trade-offs, we have developed the LogCA
                 model, a simple performance model for hardware
                 accelerators. LogCA provides a simplified abstraction
                 of a hardware accelerator characterized by five key
                 parameters. We have validated the model against a
                 variety of accelerators, ranging from on-chip
                 cryptographic accelerators in Sun's UltraSparc T2 and
                 Intel's Sandy Bridge to both discrete and integrated
                 GPUs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Bin Altaf, MS (Reprint Author), Univ Wisconsin,
                 Madison, WI 53706 USA. Bin Altaf, Muhammad Shoaib;
                 Wood, David A., Univ Wisconsin, Madison, WI 53706
                 USA.",
  author-email = "shoaibbinalt@wisc.edu david@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CNS-1117280, CCF-1218323,
                 CNS-1302260]",
  funding-text = "We thank Mark Hill, Michael Swift, Rathijit Sen, and
                 the members of the Wisconsin Multifacet group for their
                 comments on the paper. This work is supported in part
                 with NSF grants CNS-1117280, CCF-1218323, and
                 CNS-1302260. The views expressed herein are not
                 necessarily those of the NSF. Professor Wood has
                 significant financial interests in AMD, Google and
                 Panasas.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accelerators; compression; computational kernel;
                 Computational modeling; cryptography; dark silicon
                 problem; encryption; energy conservation; energy
                 efficiency; GPU; graphics processing units; Hardware
                 accelerators; heterogeneous systems; Intel Sandy
                 Bridge; LogCA model; Modeling; modeling techniques;
                 modeling techniques,; on-chip cryptographic
                 accelerator; Performance evaluation; performance model;
                 performance of systems; special-purpose hardware
                 accelerator; UltraSparc T2",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Altaf:2015:LPM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Diamantopoulos:2015:MMI,
  author =       "Dionysios Diamantopoulos and Sotirios Xydis and Kostas
                 Siozios and Dimitrios Soudris",
  title =        "Mitigating Memory-Induced Dark Silicon in
                 Many-Accelerator Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "136--139",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2410791",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Many-Accelerator (MA) systems have been introduced as
                 a promising architectural paradigm that can boost
                 performance and improve power of general-purpose
                 computing platforms. In this paper, we focus on the
                 problem of resource under-utilization, i.e., Dark
                 Silicon, in FPGA-based MA platforms. We show that
                 except the typically expected peak power budget,
                 on-chip memory resources form a severe
                 under-utilization factor in MA platforms, leading up to
                 75 percent of dark silicon. Recognizing that static
                 memory allocation-the de-facto mechanism supported by
                 modern design techniques and synthesis tools-forms the
                 main source of memory-induced Dark Silicon, we
                 introduce a novel framework that extends conventional
                 high level synthesis (HLS) with dynamic memory
                 management (DMM) features, enabling accelerators to
                 dynamically adapt their allocated memory to the runtime
                 memory requirements, thus maximizing the overall
                 accelerator count through effective sharing of FPGA's
                 memories resources. We show that our technique delivers
                 significant gains in FPGA's accelerators density, i.e.
                 3.8x, and application throughput up to 3.1x and 21.4x
                 for shared and private memory accelerators.",
  acknowledgement = ack-nhfb,
  affiliation =  "Diamantopoulos, D (Reprint Author), Natl Tech Univ
                 Athens, Sch Elect \& Comp Engn, Athens, Greece.
                 Diamantopoulos, Dionysios; Xydis, Sotirios; Siozios,
                 Kostas; Soudris, Dimitrios, Natl Tech Univ Athens, Sch
                 Elect \& Comp Engn, Athens, Greece.",
  author-email = "diamantd@microlab.ntua.gr sxydis@microlab.ntua.gr
                 ksiop@microlab.ntua.gr dsoudris@microlab.ntua.gr",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "E.C. [644906]",
  funding-text = "This research is partially supported by the E.C.
                 funded program AEGLE under H2020 Grant Agreement No:
                 644906.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "de-facto mechanism; DMM feature; dynamic memory
                 management; dynamic memory management feature; Dynamic
                 scheduling; Field programmable gate arrays; field
                 programmable gate arrays; FPGA-based MA platform;
                 high-level synthesis; high-level synthesis tool; HLS
                 tool; MA system; Many-accelerator architectures;
                 many-accelerator architectures; Many-accelerator
                 architectures; Memory management; memory-induced dark
                 silicon source; modern design technique; Network
                 architecture; on-chip memory resource; peak power
                 budget; power aware computing; Resource management;
                 severe under-utilization factor; silicon; static memory
                 allocation; storage management; System-on-chip;
                 Throughput",
  number-of-cited-references = "14",
  ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847 Siozios,
                 Kostas/0000-0002-0285-2202",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/O-8843-2019 Siozios,
                 Kostas/F-9726-2011",
  times-cited =  "1",
  unique-id =    "Diamantopoulos:2015:MMI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Poremba:2015:NUF,
  author =       "Matthew Poremba and Tao Zhang and Yuan Xie",
  title =        "{NVMain 2.0}: a User-Friendly Memory Simulator to
                 Model (Non-) Volatile Memory Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "140--143",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2402435",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this letter, a flexible memory simulator --- NVMain
                 2.0, is introduced to help the community for modeling
                 not only commodity DRAMs but also emerging memory
                 technologies, such as die-stacked DRAM caches,
                 non-volatile memories (e.g., STT-RAM, PCRAM, and ReRAM)
                 including multi-level cells (MLC), and hybrid
                 non-volatile plus DRAM memory systems. Compared to
                 existing memory simulators, NVMain 2.0 features a
                 flexible user interface with compelling simulation
                 speed and the capability of providing sub-array-level
                 parallelism, fine-grained refresh, MLC and data encoder
                 modeling, and distributed energy profiling.",
  acknowledgement = ack-nhfb,
  affiliation =  "Poremba, M (Reprint Author), Penn State Univ, Dept
                 Comp Sci \& Engn, University Pk, PA 16802 USA. Poremba,
                 Matthew; Zhang, Tao; Xie, Yuan, Penn State Univ, Dept
                 Comp Sci \& Engn, University Pk, PA 16802 USA.",
  author-email = "poremba@cse.psu.edu zhangtao@cse.psu.edu
                 yuanxie@cse.psu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [1218867, 1213052, 1409798]; Department
                 of Energy [DE-SC0005026]",
  funding-text = "Poremba, Zhang, and Xie were supported in part by NSF
                 1218867, 1213052, 1409798. This material was based on
                 work supported by the Department of Energy under Award
                 Number DE-SC0005026. Matthew Poremba is the
                 corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; commodity DRAM; Computational modeling;
                 Computer architecture; die-stacked DRAM cache; DRAM
                 chips; DRAM memory systems; flexible memory simulator;
                 flexible user interface; Memory architecture; memory
                 architecture; Memory architecture, random access
                 memory, nonvolatile memory, phase change memory, SDRAM;
                 Memory management; memory technology; multilevel cells;
                 nonvolatile memory; Nonvolatile memory; nonvolatile
                 memory system; NVMain 2.0; PCRAM; phase change
                 memories; phase change memory; Phase change random
                 access memory; random access memory; ReRAM; SDRAM;
                 STT-RAM; user interfaces; user-friendly memory
                 simulator",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "36",
  unique-id =    "Poremba:2015:NUF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Vandierendonck:2015:EEB,
  author =       "Hans Vandierendonck and Ahmad Hassan and Dimitrios S.
                 Nikolopoulos",
  title =        "On the Energy-Efficiency of Byte-Addressable
                 Non-Volatile Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "144--147",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2355195",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Non-volatile memory (NVM) technology holds promise to
                 replace SRAM and DRAM at various levels of the memory
                 hierarchy. The interest in NVM is motivated by the
                 difficulty faced in scaling DRAM beyond 22 nm and,
                 long-term, lower cost per bit. While offering higher
                 density and negligible static power (leakage and
                 refresh), NVM suffers increased latency and energy per
                 memory access. This paper develops energy and
                 performance models of memory systems and applies them
                 to understand the energy-efficiency of replacing or
                 complementing DRAM with NVM. Our analysis focusses on
                 the application of NVM in main memory. We demonstrate
                 that NVM such as STT-RAM and RRAM is energy-efficient
                 for memory sizes commonly employed in servers and
                 high-end workstations, but PCM is not. Furthermore, the
                 model is well suited to quickly evaluate the impact of
                 changes to the model parameters, which may be achieved
                 through optimization of the memory architecture, and to
                 determine the key parameters that impact system-level
                 energy and performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Vandierendonck, H (Reprint Author), Queens Univ
                 Belfast, Belfast BT7 1NN, Antrim, North Ireland.
                 Vandierendonck, Hans; Nikolopoulos, Dimitrios S.,
                 Queens Univ Belfast, Belfast BT7 1NN, Antrim, North
                 Ireland. Hassan, Ahmad, SAP Belfast, Belfast, Antrim,
                 North Ireland.",
  author-email = "h.vandierendonck@qub.ac.uk ahmad.hassan@sap.com
                 d.nikolopoulos@qub.ac.uk",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "People Programme (Marie Curie Actions) of
                 the European Union's Seventh Framework Programme
                 [327744]",
  funding-text = "This work was supported by the People Programme (Marie
                 Curie Actions) of the European Union's Seventh
                 Framework Programme (FP7/2007-2013), grant agreement
                 no. 327744.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "byte-addressable nonvolatile memory technology;
                 Computational modeling; DRAM; DRAM chips; energy;
                 energy conservation; energy efficiency; Energy
                 efficiency; impact system-level energy; Main memory
                 systems; Main memory systems, non-volatile memory,
                 energy, modeling; Mathematical model; memory
                 architecture; memory hierarchy; Memory management;
                 memory systems; modeling; non-volatile memory;
                 Nonvolatile memory; NVM technology; PCM; Phase change
                 materials; Random access memory; RRAM; SRAM; SRAM
                 chips; static power; STT-RAM",
  number-of-cited-references = "15",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Vandierendonck:2015:EEB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yavits:2015:RAP,
  author =       "Leonid Yavits and Shahar Kvatinsky and Amir Morad and
                 Ran Ginosar",
  title =        "Resistive Associative Processor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "148--151",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2374597",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Associative Processor (AP) combines data storage and
                 data processing, and functions simultaneously as a
                 massively parallel array SIMD processor and memory.
                 Traditionally, AP is based on CMOS technology, similar
                 to other classes of massively parallel SIMD processors.
                 The main component of AP is a Content Addressable
                 Memory (CAM) array. As CMOS feature scaling slows down,
                 CAM experiences scalability problems. In this work, we
                 propose and investigate an AP based on resistive
                 CAM-the Resistive AP (ReAP). We show that resistive
                 memory technology potentially allows scaling the AP
                 from a few millions to a few hundred millions of
                 processing units on a single silicon die. We compare
                 the performance and power consumption of a ReAP to a
                 CMOS AP and a conventional SIMD accelerator (GPU) and
                 show that ReAP, although exhibiting higher power
                 density, allows better scalability and higher
                 performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yavits, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
                 Yavits, Leonid; Kvatinsky, Shahar; Morad, Amir;
                 Ginosar, Ran, Technion Israel Inst Technol, Dept Elect
                 Engn, IL-3200000 Haifa, Israel.",
  author-email = "yavits@txtechnion.ac.il skva@txtechnion.ac.il
                 amirm@txtechnion.ac.il ran@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Intel Collaborative Research Institute for
                 Computational Intelligence; Hasso-Plattner-Institut",
  funding-text = "The authors would like to thank Uri Weiser for
                 inspiring this research. This work was partially funded
                 by the Intel Collaborative Research Institute for
                 Computational Intelligence and by
                 Hasso-Plattner-Institut.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Associative processing; associative processor;
                 Associative Processor; associative processor; CAM
                 array; CMOS feature scaling; CMOS integrated circuits;
                 CMOS technology; complementary metal oxide
                 semiconductor; Computer aided manufacturing; content
                 addressable memory array; content-addressable storage;
                 data processing; data storage; GPU; graphics processing
                 unit; in-memory computing; In-Memory Computing;
                 in-memory computing; massively parallel array SIMD
                 processor; memory function; memristor; Memristor;
                 memristor; Memristors; parallel processing; Random
                 access memory; ReAP; resistive associative processor;
                 resistive RAM; Resistive RAM; resistive RAM; SIMD; SIMD
                 accelerator",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "22",
  unique-id =    "Yavits:2015:RAP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kang:2015:SRT,
  author =       "Suk Chan Kang and Chrysostomos Nicopoulos and Ada
                 Gavrilovska and Jongman Kim",
  title =        "Subtleties of Run-Time Virtual Address Stacks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "152--155",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2337299",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The run-time virtual address (VA) stack has some
                 unique properties, which have garnered the attention of
                 researchers. The stack one-dimensionally grows and
                 shrinks at its top, and contains data that is seemingly
                 local/private to one thread, or process. Most prior
                 related research has focused on these properties.
                 However, this article aims to demonstrate how
                 conventional wisdom pertaining to the run-time VA stack
                 fails to capture some critical subtleties and
                 complexities. We first explore two widely established
                 assumptions surrounding the VA stack area: (1) Data
                 accesses can be classified as falling either under
                 VA-stack-area accesses, or non-stack-area accesses,
                 with no aliasing; (2) The VA stack data is completely
                 private and invisible to other threads/processes.
                 Subsequently, we summarize a representative selection
                 of related work that pursued the micro-architectural
                 concept of using run-time VA stacks to extend the
                 general-purpose register file. We then demonstrate why
                 these assumptions are invalid, by using examples from
                 prior work to highlight the potential hazards regarding
                 data consistency, shared memory consistency, and cache
                 coherence. Finally, we suggest safeguards against these
                 hazards. Overall, we explore the function-critical
                 issues that future operating systems and compilers
                 should address to effectively reap all the benefits of
                 using run-time VA stacks.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kang, SC (Reprint Author), Georgia Inst Technol,
                 Atlanta, GA 30332 USA. Kang, Suk Chan; Gavrilovska,
                 Ada; Kim, Jongman, Georgia Inst Technol, Atlanta, GA
                 30332 USA. Nicopoulos, Chrysostomos, Univ Cyprus,
                 CY-1678 Nicosia, Cyprus.",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache coherence; cache storage; data consistency; data
                 decoupling; data integrity; data privacy;
                 function-critical issue; general-purpose register file;
                 Instruction sets; memory consistency;
                 microarchitectural concept; nonstack-area access;
                 register file; Run time; Run-time stack; run-time VA
                 stack data access; run-time virtual address stack;
                 shared memory; shared memory consistency; shared memory
                 systems; synonym page; VA-stack-area accesses;
                 Virtualization",
  number-of-cited-references = "12",
  ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kang:2015:SRT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Rodopoulos:2015:TPV,
  author =       "Dimitrios Rodopoulos and Francky Catthoor and
                 Dimitrios Soudris",
  title =        "Tackling Performance Variability Due to {RAS}
                 Mechanisms with {PID}-Controlled {DVFS}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "156--159",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2385713",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As technology nodes approach deca-nanometer
                 dimensions, many phenomena threaten the binary
                 correctness of processor operation. Computer architects
                 typically enhance their designs with reliability,
                 availability and serviceability (RAS) schemes to
                 correct such errors, in many cases at the cost of extra
                 clock cycles, which, in turn, leads to processor
                 performance variability. The goal of the current paper
                 is to absorb this variability using Dynamic Voltage and
                 Frequency Scaling (DVFS). A closed-loop implementation
                 is proposed, which configures the clock frequency based
                 on observed metrics that encapsulate performance
                 variability due to RAS mechanisms. That way,
                 performance dependability and predictability is
                 achieved. We simulate the transient and steady state
                 behavior of our approach, reporting responsiveness
                 within less than 1 ms. We also assess our idea using
                 the power model of real processor and report a maximum
                 energy overhead of roughly 10 percent for dependable
                 performance in the presence of RAS temporal
                 overheads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rodopoulos, D (Reprint Author), Natl Tech Univ Athens,
                 MicroLab, Sch Elect \& Comp Engn, Athens 15780, Greece.
                 Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
                 Univ Athens, MicroLab, Sch Elect \& Comp Engn, Athens
                 15780, Greece. Catthoor, Francky, ESAT KU Leuven,
                 Leuven, Belgium. Catthoor, Francky, SSET IMEC, Leuven,
                 Belgium.",
  author-email = "drodo@microlab.ntua.gr catthoor@imec.be
                 dsoudris@microlab.ntua.gr",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "HARPA EC project [FP7-612069]",
  funding-text = "The authors thank Prof. Y. Sazeides and Prof. C.
                 Nicopoulos of UCY, Cyprus for the insightful
                 discussions. They also acknowledge the constructive
                 feedback of the reviewers. This work was partially
                 supported by the FP7-612069-HARPA EC project. Dimitrios
                 Rodopoulos is the corresponding author. Finally, the
                 authors acknowledge conversations with Dr. Antonis
                 Papanikolaou.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "availability; availability and serviceability;
                 Availability and Serviceability; availability and
                 serviceability; binary correctness; closed loop
                 systems; closed-loop implementation; computer
                 architects; computer architecture; deca-nanometer
                 dimensions; Dynamic voltage and frequency scaling;
                 dynamic voltage and frequency scaling; Dynamic voltage
                 and frequency scaling; Dynamic Voltage and Frequency
                 Scaling; Mathematical model; microcomputers;
                 Performance evaluation; performance variability;
                 performance vulnerability factor; Performance
                 Vulnerability Factor; PID-controlled DVFS; Process
                 control; processor operation; RAS mechanisms;
                 reliability; Reliability; reliability; Reliability;
                 serviceability; three-term control; Voltage control",
  number-of-cited-references = "21",
  ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
  times-cited =  "4",
  unique-id =    "Rodopoulos:2015:TPV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Markovic:2015:TLS,
  author =       "Nikola Markovic and Daniel Nemirovsky and Osman Unsal
                 and Mateo Valero and Adrian Cristal",
  title =        "Thread Lock Section-Aware Scheduling on Asymmetric
                 Single-{ISA} Multi-Core",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "160--163",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2357805",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "As thread level parallelism in applications has
                 continued to expand, so has research in chip multi-core
                 processors. As more and more applications become
                 multi-threaded we expect to find a growing number of
                 threads executing on a machine. As a consequence, the
                 operating system will require increasingly larger
                 amounts of CPU time to schedule these threads
                 efficiently. Instead of perpetuating the trend of
                 performing more complex thread scheduling in the
                 operating system, we propose a scheduling mechanism
                 that can be efficiently implemented in hardware as
                 well. Our approach of identifying multi-threaded
                 application bottlenecks such as thread synchronization
                 sections complements the Fairness-aware Scheduler
                 method. It achieves an average speed up of 11.5 percent
                 (geometric mean) compared to the state-of-the-art
                 Fairness-aware Scheduler.",
  acknowledgement = ack-nhfb,
  affiliation =  "Markovic, N (Reprint Author), Barcelona Supercomputing
                 Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky,
                 Daniel; Unsal, Osman; Valero, Mateo, Barcelona
                 Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola;
                 Nemirovsky, Daniel; Valero, Mateo, Univ Politecn
                 Cataluna, Barcelona, Spain. Cristal, Adrian, Univ
                 Politecn Cataluna, Barcelona Supercomputing Ctr,
                 E-08028 Barcelona, Spain. Cristal, Adrian, Artificial
                 Intelligence Res Inst Spanish Natl Res, Barcelona,
                 Spain.",
  author-email = "nikola.markovic@bsc.es daniel.nemirovsky@bsc.es
                 osman.unsal@bsc.es mateo.valero@bsc.es
                 adrian.cristal@bsc.es",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Asymmetric chip multiprocessor (ACMP); asymmetric
                 single-ISA multicore processor; chip multicore
                 processors; Context modeling; fairness-aware scheduler
                 method; HW/SW thread scheduling; Instruction sets;
                 microprocessor chips; multi-threaded applications;
                 multi-threading; Multicore processing; multiprocessing
                 systems; multithreaded application; operating system;
                 Operating systems; operating systems (computers);
                 scheduling; Scheduling; Synchronization; thread lock
                 section-aware scheduling mechanism; thread
                 synchronization",
  number-of-cited-references = "17",
  ORCID-numbers = "UNSAL, OSMAN/0000-0002-0544-9697 Valero,
                 Mateo/0000-0003-2917-2482",
  research-areas = "Computer Science",
  researcherid-numbers = "UNSAL, OSMAN/B-9161-2016 Valero,
                 Mateo/L-5709-2014",
  times-cited =  "7",
  unique-id =    "Markovic:2015:TLS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Pekhimenko:2015:TAC,
  author =       "Gennady Pekhimenko and Evgeny Bolotin and Mike
                 O'Connor and Onur Mutlu and Todd C. Mowry and Stephen
                 W. Keckler",
  title =        "Toggle-Aware Compression for {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "164--168",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2430853",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory bandwidth compression can be an effective way
                 to achieve higher system performance and energy
                 efficiency in modern data-intensive applications by
                 exploiting redundancy in data. Prior works studied
                 various data compression techniques to improve both
                 capacity (e.g., of caches and main memory) and
                 bandwidth utilization (e.g., of the on-chip and
                 off-chip interconnects). These works addressed two
                 common shortcomings of compression: (i)
                 compression/decompression overhead in terms of latency,
                 energy, and area, and (ii) hardware complexity to
                 support variable data size. In this paper, we make the
                 new observation that there is another important problem
                 related to data compression in the context of the
                 communication energy efficiency: transferring
                 compressed data leads to a substantial increase in the
                 number of bit toggles (communication channel switchings
                 from 0 to 1 or from 1 to 0). This, in turn, increases
                 the dynamic energy consumed by on-chip and off-chip
                 buses due to more frequent charging and discharging of
                 the wires. Our results, for example, show that the bit
                 toggle count increases by an average of 2.2x with some
                 compression algorithms across 54 mobile GPU
                 applications. We characterize and demonstrate this new
                 problem across a wide variety of 221 GPU applications
                 and six different compression algorithms. To mitigate
                 the problem, we propose two new toggle-aware
                 compression techniques: energy control and Metadata
                 Consolidation. These techniques greatly reduce the bit
                 toggle count impact of the six data compression
                 algorithms we examine, while keeping most of their
                 bandwidth reduction benefits.",
  acknowledgement = ack-nhfb,
  affiliation =  "Pekhimenko, G (Reprint Author), Carnegie Mellon Univ,
                 Dept Comp Sci, Pittsburgh, PA 15206 USA. Pekhimenko,
                 Gennady; Mutlu, Onur; Mowry, Todd C., Carnegie Mellon
                 Univ, Dept Comp Sci, Pittsburgh, PA 15206 USA. Bolotin,
                 Evgeny; O'Connor, Mike; Keckler, Stephen W., NVIDA,
                 Santa Clara, CA USA. O'Connor, Mike; Keckler, Stephen
                 W., Univ Texas Austin, Austin, TX 78712 USA.",
  author-email = "gpekhimento@gmail.com ebolotin@nvidia.com
                 moconnor@nvidia.com omutlu@gmail.com tcm@cs.cmu.edu
                 skeckler@nvidia.com",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Intel Science and Technology Center for
                 Cloud Computing; US National Science Foundation
                 [1212962, 1409723, 1423172]; US Department of Energy",
  funding-text = "The authors acknowledge the support of Intel Science
                 and Technology Center for Cloud Computing; US National
                 Science Foundation grants 1212962, 1409723, and
                 1423172; and the US Department of Energy.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bandwidth utilization; bit toggle count impact; bit
                 toggles; Communication channels; communication energy
                 efficiency; Compression algorithms;
                 compression/decompression overhead; Data compression;
                 data compression; data compression algorithms; data
                 compression techniques; Data compression,
                 interconnected systems, memory; data redundancy;
                 dynamic energy; energy control; graphics processing
                 units; Graphics processing units; hardware complexity;
                 interconnected systems; memory; memory bandwidth
                 compression; metadata consolidation; Mobile
                 communication; mobile GPU applications; modern
                 data-intensive applications; off-chip buses; on-chip
                 buses; power aware computing; System-on-chip;
                 toggle-aware compression; variable data size",
  number-of-cited-references = "29",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Pekhimenko:2015:TAC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2015:TCb,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "C1--C1",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2510172",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:ICAc,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Editorial Board}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "C2--C2",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2510173",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:ICAd,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                  Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "C3--C3",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2510174",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2015:ICSb,
  author =       "Anonymous",
  title =        "{IEEE Computer Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "C4--C4",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2510176",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Wu:2016:MCN,
  author =       "Wo-Tak Wu and Ahmed Louri",
  title =        "A Methodology for Cognitive {NoC} Design",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2447535",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The number of cores in a multicore chip design has
                 been increasing in the past two decades. The rate of
                 increase will continue for the foreseeable future. With
                 a large number of cores, the on-chip communication has
                 become a very important design consideration. The
                 increasing number of cores will push the communication
                 complexity level to a point where managing such highly
                 complex systems requires much more than what designers
                 can anticipate for. We propose a new design methodology
                 for implementing a cognitive network-on-chip that has
                 the ability to recognize changes in the environment and
                 to learn new ways to adapt to the changes. This
                 learning capability provides a way for the network to
                 manage itself. Individual network nodes work
                 autonomously to achieve global system goals, e.g., low
                 network latency, higher reliability, power efficiency,
                 adaptability, etc. We use fault-tolerant routing as a
                 case study. Simulation results show that the cognitive
                 design has the potential to outperform the conventional
                 design for large applications. With the great inherent
                 flexibility to adopt different algorithms, the
                 cognitive design can be applied to many applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wu, WT (Reprint Author), Univ Arizona, Dept Elect \&
                 Comp Engn, Tucson, AZ 85721 USA. Wu, Wo-Tak; Louri,
                 Ahmed, Univ Arizona, Dept Elect \& Comp Engn, Tucson,
                 AZ 85721 USA.",
  author-email = "wotakwu@email.arizona.edu louri@ece.arizona.edu",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adaptive; Algorithm design and analysis; cognitive
                 network-on-chip; cognitive NoC design; cognitive
                 process; communication complexity; communication
                 complexity level; Fault tolerance; fault tolerant
                 computing; Fault tolerant systems; fault-tolerant;
                 fault-tolerant routing; individual network nodes;
                 integrated circuit design; intelligent agent; learning
                 (artificial intelligence); learning capability; machine
                 learning; multicore; multicore chip design; Multicore
                 processing; multiprocessing systems; network routing;
                 network-on-chip; NoC; on-chip communication; Routing;
                 Software",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Wu:2016:MCN",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2016:IICa,
  author =       "Anonymous",
  title =        "2015 Index {{\booktitle{IEEE Computer Architecture
                  Letters}} Vol. 14}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "1--6",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2513858",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

@Article{Anonymous:2016:IICb,
  author =       "Anonymous",
  title =        "2015 Index {{\booktitle{IEEE Computer Architecture
                  Letters}} Vol. 14}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "1--6",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2513858",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 08:36:31 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the 2015 author/subject index for this
                 publication.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

@Article{Rezaei:2016:DRS,
  author =       "Seyyed Hossein Seyyedaghaei Rezaei and Abbas Mazloumi
                 and Mehdi Modarressi and Pejman Lotfi-Kamran",
  title =        "Dynamic Resource Sharing for High-Performance {$3$-D}
                 Networks-on-Chip",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2448532",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "3D logic-on-logic technology is a promising approach
                 for extending the validity of Moore's law when
                 technology scaling stops. 3D technology can also lead
                 to a paradigm shift in on-chip communication design by
                 providing orders of magnitude higher bandwidth and
                 lower latency for inter-layer communication. To turn
                 the 3D technology bandwidth and latency benefits into
                 network latency reductions and performance improvement,
                 we need networks-on-chip (NoCs) that are specially
                 designed to take advantage of what 3D technology has to
                 offer. While in parallel workloads many packets
                 experience blocking in the network due to losing
                 arbitration for crossbars' input/output ports, we
                 observe that in a considerable fraction of these cases
                 in a 3D NoC, the corresponding input and output ports
                 of the crossbar in the above or below router are idle.
                 Given this observation, we propose FRESH, a router
                 microarchitecture with Fine-grained 3D REsource SHaring
                 capability that leverages the ultra-low latency
                 vertical links of a 3D chip to share crossbars and
                 links at a fine granularity between vertically stacked
                 routers. It enables packets that lose arbitration for
                 crossbars' input/output ports to use idle resources of
                 the above or below routers, and effectively eliminates
                 the unnecessary packet blocking time. We will show that
                 our proposal lowers network latency by up to 21 percent
                 over the state-of-the-art 3D NoC.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rezaei, SHS (Reprint Author), Univ Tehran, Coll Engn,
                 Dept Elect \& Comp Engn, Tehran, Iran. Rezaei, Seyyed
                 Hossein Seyyedaghaei; Mazloumi, Abbas; Modarressi,
                 Mehdi, Univ Tehran, Coll Engn, Dept Elect \& Comp Engn,
                 Tehran, Iran. Lotfi-Kamran, Pejman, Inst Res
                 Fundamental Sci IPM, Sch Comp Sci, Tehran, Iran.",
  author-email = "s.hseyyedaghaei@ut.ac.ir y.mazloomi@gmail.com
                 modarressi@ut.ac.ir plotfi@ipm.ir",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3-D integration; 3D integration; 3D networks-on-chip;
                 3D NoC; Bandwidth; crossbars input-output ports;
                 fine-grained 3D resource sharing capability; FRESH;
                 network latency; network routing; network-on-chip;
                 Ports (Computers); Resource management; Resource
                 sharing; router microarchitecture; Routing; Switches;
                 Three-dimensional displays; Through-silicon vias",
  keywords-plus = "3D; ROUTER",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Rezaei:2016:DRS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Gorgues:2016:EPC,
  author =       "Miguel Gorgues and Jose Flich",
  title =        "End-Point Congestion Filter for Adaptive Routing with
                 Congestion-Insensitive Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2429130",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Interconnection networks are a critical component in
                 most modern systems nowadays. Both off-chip networks,
                 in HPC systems, data centers, and cloud servers, and
                 on-chip networks, in chip multiprocessors (CMPs) and
                 multiprocessors system-on-chip (MPSoCs), play an
                 increasing role as their performance is vital for the
                 performance of the whole system. One of the key
                 components of any interconnect is the routing
                 algorithm, which steers packets through the network.
                 Adaptive routing algorithms have demonstrated their
                 superior performance by maximizing network resources
                 utilization. However, as systems increase in size (both
                 in off-chip and on-chip), new problems emerge. One of
                 them is congestion where traffic jams inside the
                 network lead to low throughput and high packet latency,
                 significantly impacting overall system performance. We
                 propose a mechanism to eradicate this phenomena and to
                 allow adaptive routing algorithms to achieve the
                 expected performance even in the presence of congestion
                 situations. End-Point Congestion Filter, EPC, detects
                 congestion formed at the end-points of the network, and
                 prevents the congestion from spreading through the
                 network. Basically, EPC disables adaptivity in
                 congested packets. Preliminary results for mid and high
                 congestion situations show EPC is able to totally
                 decouple congestion from routing.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gorgues, M (Reprint Author), Univ Politecn Valencia,
                 Dept Comp Architecture, E-46022 Valencia, Spain.
                 Gorgues, Miguel; Flich, Jose, Univ Politecn Valencia,
                 Dept Comp Architecture, E-46022 Valencia, Spain.",
  author-email = "migoral@disca.upv.es jflich@disca.upv.es",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adaptive filters; Adaptive routing algorithms;
                 adaptive routing algorithms; Adaptive routing
                 algorithms; adaptive routing algorithms; Adaptive
                 routing algorithms; Adaptive systems; chip
                 multiprocessors; cloud servers; CMP; Congestion;
                 congestion; Congestion; congestion;
                 congestion-insensitive performance; data centers;
                 digital filters; end-point congestion filter; EPC; HPC
                 systems; Information filters; interconnection networks;
                 interconnects; MPSoC; multiprocessor interconnection
                 networks; multiprocessors system-on-chip; network
                 resources utilization; network routing; on-chip
                 networks; packet latency; performance evaluation; Ports
                 (Computers); Routing; system-on-chip; Throughput;
                 traffic jams",
  keywords-plus = "NETWORKS",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Gorgues:2016:EPC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Panda:2016:EPP,
  author =       "Biswabandan Panda and Shankar Balachandran",
  title =        "Expert Prefetch Prediction: {An} Expert Predicting the
                  Usefulness of Hardware Prefetchers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2428703",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware prefetching improves system performance by
                 hiding and tolerating the latencies of lower levels of
                 cache and off-chip DRAM. An accurate prefetcher
                 improves system performance whereas an inaccurate
                 prefetcher can cause cache pollution and consume
                 additional bandwidth. Prefetch address filtering
                 techniques improve prefetch accuracy by predicting the
                 usefulness of a prefetch address and based on the
                 outcome of the prediction, the prefetcher decides
                 whether or not to issue a prefetch request. Existing
                 techniques use only one signature to predict the
                 usefulness of a prefetcher but no single predictor
                 works well across all the applications. In this work,
                 we propose weighted-majority filter, an expert way of
                 predicting the usefulness of prefetch addresses. The
                 proposed filter is adaptive in nature and uses the
                 prediction of the best predictor(s) from a pool of
                 predictors. Our filter is orthogonal to the underlying
                 prefetching algorithm. We evaluate the effectiveness of
                 our technique on 22 SPEC-2000/2006 applications. On an
                 average, when employed with three state-of-the-art
                 prefetchers such as AMPM, SMS, and GHB-PC/DC, our
                 filter provides performance improvement of 8.1, 9.3,
                 and 11 percent respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Panda, B (Reprint Author), Indian Inst Technol, Dept
                 Comp Sci \& Engn, Madras, Tamil Nadu, India. Panda,
                 Biswabandan; Balachandran, Shankar, Indian Inst
                 Technol, Dept Comp Sci \& Engn, Madras, Tamil Nadu,
                 India.",
  author-email = "biswa.uce@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; AMPM; cache; Cache; cache; Cache; cache;
                 cache storage; filtering theory; GHB-PC/DC; Hardware;
                 hardware prefetchers; Hardware prefetching; Hardware
                 Prefetching; Hardware prefetching; Hardware
                 Prefetching; Memory systems; memory systems; Memory
                 systems; memory systems; Pollution; Prediction
                 algorithms; prefetch addresses; Prefetching;
                 prefetching algorithm; Radiation detectors; Random
                 access memory; SMS; weighted-majority filter",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Panda:2016:EPP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Eker:2016:EEC,
  author =       "Abdulaziz Eker and O{\u{g}}uz Ergin",
  title =        "Exploiting Existing Copies in Register File for Soft
                 Error Correction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2435705",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Soft errors are an increasingly important problem in
                 contemporary digital systems. Being the major data
                 holding component in contemporary microprocessors, the
                 register file has been an important part of the
                 processor on which researchers offered many different
                 schemes to protect against soft errors. In this paper
                 we build on the previously proposed schemes and start
                 with the observation that many register values already
                 have a replica inside the storage space. We use this
                 already available redundancy inside the register file
                 in combination with a previously proposed value
                 replication scheme for soft error detection and
                 correction. We show that, by employing schemes that
                 make use of the already available copies of the values
                 inside the register file, it is possible to detect and
                 correct 39.0 percent of the errors with an additional
                 power consumption of 18.9 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Eker, A (Reprint Author), TOBB Univ Econ \& Technol,
                 Dept Comp Engn, Ankara, Turkey. Eker, Abdulaziz; Ergin,
                 O{\u{g}}uz, TOBB Univ Econ \& Technol, Dept Comp Engn,
                 Ankara, Turkey.",
  author-email = "aeker@etu.edu.tr oergin@etu.edu.tr",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "TUBITAK [112E004]",
  funding-text = "This work was supported in part by TUBITAK under Grant
                 112E004. The work is in the framework of COST Action
                 1103.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; contemporary digital systems;
                 contemporary microprocessors; data holding component;
                 Error correction; Error correction codes;
                 microcomputers; microprocessor architecture;
                 Microprocessors; Parity check codes; redundancy;
                 register file; Registers; Reliability; soft error; soft
                 error correction; soft error detection; storage space",
  number-of-cited-references = "16",
  ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787",
  research-areas = "Computer Science",
  researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010",
  times-cited =  "1",
  unique-id =    "Eker:2016:EEC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Maycock:2016:HES,
  author =       "Matthew Maycock and Simha Sethumadhavan",
  title =        "Hardware Enforced Statistical Privacy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2403359",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The Internet of Things will result in users generating
                 vast quantities of data, some of it sensitive. Results
                 from the statistical analysis of sensitive data across
                 wide ranges of demographics will become ever more
                 useful to data analysts and their clients. The
                  competing needs of the two groups --- data generators
                  with their desire for privacy and analysts with their
                  desire for inferred statistics --- will be met through
                  the use of
                 statistical privacy techniques. The question, then, is
                 how can we ensure that the statistical methods are
                 applied in a trustable manner? In this paper we discuss
                 some of the complications and consequences of ensuring
                 both trust and privacy through the immutability of
                 hardware, providing a desiderata for a hardware privacy
                 platform.",
  acknowledgement = ack-nhfb,
  affiliation =  "Maycock, M (Reprint Author), Columbia Univ, Dept Comp
                 Sci, CASTL, New York, NY 10027 USA. Maycock, Matthew;
                 Sethumadhavan, Simha, Columbia Univ, Dept Comp Sci,
                 CASTL, New York, NY 10027 USA.",
  author-email = "mhm2159@columbia.edu simha@columbia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Alfred P. Sloan Foundation;
                 [FA8750-10-2-0253]",
  funding-text = "This work was supported through grant FA8750-10-2-0253
                 and the Alfred P. Sloan Foundation. Opinions, findings,
                 conclusions and recommendations expressed in this
                 material are those of the authors and may not reflect
                 the views of the funding entities. Simha Sethumadhavan
                 is the corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "data analysis; Data privacy; data privacy; Data
                 privacy; Engines; Hardware; hardware enforced
                 statistical privacy; hardware immutability; hardware
                 support; Hardware Support; hardware support; Hardware
                 Support; hardware support; Internet of things; Internet
                 of Things; Internet of things; Internet of Things;
                 Internet of things; Internet of Things; Noise; Privacy;
                 privacy; Privacy; privacy; Privacy; privacy; Privacy;
                 privacy protection unit; Privacy Protection Unit;
                 privacy protection unit; Privacy Protection Unit;
                 privacy protection unit; Security; sensitive data;
                 Software; statistical analysis",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Maycock:2016:HES",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Li:2016:ICL,
  author =       "Dongdong Li and Tor M. Aamodt",
  title =        "Inter-Core Locality Aware Memory Scheduling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2435709",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Graphics Processing Units (GPUs) run thousands of
                 parallel threads and achieve high Memory Level
                 Parallelism (MLP). To support high Memory Level
                 Parallelism, a structure called a Miss-Status Holding
                 Register (MSHR) handles multiple in-flight miss
                 requests. When multiple cores send requests to the same
                 cache line, the requests are merged into one last level
                 cache MSHR entry and only one memory request is sent to
                 the Dynamic Random-Access Memory (DRAM). We call this
                 inter-core locality. The main reason for inter-core
                 locality is that multiple cores access shared read-only
                 data within the same cache line. By prioritizing memory
                 requests that have high inter-core locality, more
                 threads resume execution. In this paper, we analyze the
                 reason for inter-core locality and show that requests
                 with inter-core locality are more critical to
                 performance. We propose a GPU DRAM scheduler that
                 exploits information about inter-core locality detected
                 at the last level cache MSHRs. For high inter-core
                 locality benchmarks this leads to an average 28 percent
                 reduction in memory request latency and 11 percent
                 improvement in performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Li, DD (Reprint Author), Univ British Columbia, Dept
                 Elect \& Comp Engn, Vancouver, BC V6T 1Z4, Canada. Li,
                 Dongdong; Aamodt, Tor M., Univ British Columbia, Dept
                 Elect \& Comp Engn, Vancouver, BC V6T 1Z4, Canada.",
  author-email = "dongdong@ece.ubc.ca aamodt@ece.ubc.ca",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Benchmark testing; cache line; cache
                 storage; Computational fluid dynamics; DRAM chips;
                 dynamic random-access memory; GPGPU; GPU DRAM
                 scheduler; graphics processing units; Graphics
                 processing units; graphics processing units; Graphics
                 processing units; graphics processing units;
                 Instruction sets; intercore locality aware memory
                 scheduling; last level cache MSHR entry; locality;
                 Locality; locality; Locality; locality; memory access
                 scheduling; Memory Access Scheduling; memory access
                 scheduling; Memory Access Scheduling; memory level
                 parallelism; memory request; memory request latency;
                 miss-status holding register; MLP; multiple cores;
                 multiple in-flight miss requests; multiprocessing
                 systems; parallel processing; parallel threads;
                 Processor scheduling; processor scheduling; Processor
                 scheduling; processor scheduling; Random access memory;
                 read-only data",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Li:2016:ICL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Pu:2016:NIP,
  author =       "Libei Pu and Kshitij Doshi and Ellis Giles and Peter
                 Varman",
  title =        "Non-Intrusive Persistence with a Backend {NVM}
                 Controller",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2443105",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "By providing instruction-grained access to vast
                 amounts of persistent data with ordinary loads and
                 stores, byte-addressable storage class memory (SCM) has
                 the potential to revolutionize system architecture. We
                 describe a non-intrusive SCM controller for achieving
                 light-weight failure atomicity through back-end
                 operations. Our solution avoids costly software
                 intervention by decoupling isolation and
                 concurrency-driven atomicity from failure atomicity and
                 durability, and does not require changes to the
                 front-end cache hierarchy. Two implementation
                 alternatives --- one using a hardware structure, and
                 the other extending the memory controller with a
                 firmware managed volatile space, are described.",
  acknowledgement = ack-nhfb,
  affiliation =  "Pu, LB (Reprint Author), Rice Univ, ECE, Houston, TX
                 77005 USA. Pu, Libei; Giles, Ellis; Varman, Peter, Rice
                 Univ, ECE, Houston, TX 77005 USA. Doshi, Kshitij,
                 Intel, SSG, Phoenix, AZ 85226 USA.",
  author-email = "pulibei@gmail.com kshitij.a.doshi@intel.com
                 erg@rice.edu pjv@rice.edu",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation (NSF) [CCF
                 1439075]; Intel Software and Services Group",
  funding-text = "This paper is supported by the US National Science
                 Foundation (NSF) Grant CCF 1439075 and by Intel
                 Software and Services Group.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "atomicity; backend NVM controller; byte-addressable
                 storage class memory; cache storage; concurrency-driven
                 atomicity; consistency; durability; failure analysis;
                 firmware; firmware managed volatile space; front-end
                 cache hierarchy; Hardware; hardware structure;
                 instruction-grained access; isolation decoupling;
                 light-weight failure atomicity; memory architecture;
                 Memory management; Non-volatile memory; nonintrusive
                 persistence; nonintrusive SCM controller; Nonvolatile
                 memory; persistent memory; Process control; Random
                 access memory; random-access storage; Retirement;
                 Software; software intervention; system architecture",
  keywords-plus = "SYSTEM",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Pu:2016:NIP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Garcia:2016:CMP,
  author =       "P. Garcia and T. Gomes and J. Monteiro and A. Tavares
                 and M. Ekpanyapong",
  title =        "On-Chip Message Passing Sub-System for Embedded
                 Inter-Domain Communication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2419260",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "This letter describes the architecture of an
                 inter-domain message passing hardware sub-system
                 targeting the embedded virtualization field. Embedded
                 virtualization is characterized by application-specific
                 solutions, where functionality is partitioned into a
                 small, fixed number of Virtual Machines, typically
                 under real-time constraints, which must communicate for
                 synchronization and status signaling. In light of the
                 growing use of custom hardware, especially supported by
                 (re)configurable platforms, we show how our hardware
                 sub-system can provide virtualization-safe data
                 transfers, without the need for Hypervisor (software)
                 mediation, through the use of translate-once and
                 virtual-interface hardware mechanisms, allowing direct
                 memory-to-memory copies between different partitions'
                 input/output buffers, in both direct-transfer and
                 publish-subscribe modes. Our experiments show our
                 architecture is especially suited for the real time
                 domain, outperforming an equivalent software solution
                 in latencies, throughput and jitter, and outperforming
                 state of the art hardware solutions for small message
                 sizes ($ < 512 $ B).",
  acknowledgement = ack-nhfb,
  affiliation =  "Garcia, P (Reprint Author), Univ Minho, Dept Ctr
                 Algoritmi, P-4800 Braga, Portugal. Garcia, P.; Gomes,
                 T.; Monteiro, J.; Tavares, A., Univ Minho, Dept Ctr
                 Algoritmi, P-4800 Braga, Portugal. Ekpanyapong, M.,
                 Asian Inst Technol, Dept Microelect \& Embedded Syst,
                 Khlong Luang, Thailand.",
  author-email = "pgarcia@dei.uminho.pt tgomes@dei.uminho.pt
                 jmonteiro@dei.uminho.pt atavares@dei.uminho.pt
                 mongkol@ait.ac.th",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "FCT [SFRH/BD/77813/2011]",
  funding-text = "This work was supported in part by a grant from FCT,
                 reference SFRH/BD/77813/2011. P. Garcia is the
                 corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application program interfaces; application-specific
                 solutions; configurable platforms; direct
                 memory-to-memory copies; direct-transfer modes;
                 embedded interdomain communication; embedded systems;
                 embedded virtualization field; Hardware; interdomain
                 message passing hardware subsystem; message passing;
                 Message passing; on-chip message passing subsystem;
                 partition input/output buffers; publish subscribe
                 modes; Publish-subscribe; real time domain; real-time
                 constraints; Software; status signaling;
                 synchronisation; synchronization; Throughput;
                 translate-once mechanism; Virtual machine monitors;
                 virtual machines; virtual-interface hardware
                 mechanisms; virtualisation; Virtualization;
                 virtualization-safe data transfers",
  number-of-cited-references = "15",
  ORCID-numbers = "Monteiro, Joao L/0000-0002-3287-3995 Monteiro,
                 Joao/0000-0002-3287-3995 Tavares,
                 Adriano/0000-0001-8316-6927 Gomes,
                 Tiago/0000-0002-8496-8179 Garcia,
                 Paulo/0000-0002-1041-5205",
  research-areas = "Computer Science",
  researcherid-numbers = "Monteiro, Joao L/H-7751-2012 Monteiro,
                 Joao/Q-6857-2019 Tavares, Adriano/M-5257-2013",
  times-cited =  "1",
  unique-id =    "Garcia:2016:CMP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Li:2016:PHP,
  author =       "Minghua Li and Guancheng Chen and Qijun Wang and
                  Yonghua Lin and Peter Hofstee and Per Stenstr{\"o}m and
                 Dian Zhou",
  title =        "{PATer}: a Hardware Prefetching Automatic Tuner on
                 {IBM} {POWER8} Processor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "37--40",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2442972",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware prefetching on IBM's latest POWER8 processor
                 is able to improve performance of many applications
                 significantly, but it can also cause performance loss
                 for others. The IBM POWER8 processor provides one of
                 the most sophisticated hardware prefetching designs
                  which supports $ 2^{25} $ different configurations. Obviously,
                 it is a big challenge to find the optimal or
                 near-optimal hardware prefetching configuration for a
                 specific application. We present a dynamic prefetching
                 tuning scheme in this paper, named prefetch automatic
                 tuner (PATer). PATer uses a prediction model based on
                 machine learning to dynamically tune the prefetch
                 configuration based on the values of hardware
                 performance monitoring counters (PMCs). By developing a
                 two-phase prefetching selection algorithm and a
                 prediction accuracy optimization algorithm in this
                 tool, we identify a set of selected key hardware
                 prefetch configurations that matter mostly to
                 performance as well as a set of PMCs that maximize the
                 machine learning prediction accuracy. We show that
                 PATer is able to accelerate the execution of diverse
                 workloads up to $ 1.4 \times $.",
  acknowledgement = ack-nhfb,
  affiliation =  "Li, MH (Reprint Author), Univ Texas Dallas, Dept
                  Elect Engn, Richardson, TX 75080 USA. Li, MH (Reprint
                 Author), IBM Res China, Beijing, Peoples R China. Li,
                  Minghua; Zhou, Dian, Univ Texas Dallas, Dept Elect
                 Engn, Richardson, TX 75080 USA. Li, Minghua; Chen,
                 Guancheng; Wang, Qijun; Lin, Yonghua, IBM Res China,
                 Beijing, Peoples R China. Hofstee, Peter, IBM Corp,
                 ARL, Austin, TX USA. Stenstrom, Per, Chalmers, Dept Sci
                 \& Comp Engn, Gothenburg, Sweden.",
  author-email = "mxl095420@utdallas.edu chengc@cn.ibm.com
                 wqijun@cn.ibm.com linyh@cn.ibm.com hofstee@us.ibm.com
                 pers@chalmers.se zhoud.utdallas@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "IBM Research global summer intern program",
  funding-text = "The authors would like to thank the anonymous
                 reviewers for their valuable suggestions and comments
                 to improve the paper. The authors also want to thank
                 Ling Shao, Xiaowei Shen, Qi Guo, Kun Wang, Tao Liu, Yan
                  Li from IBM Research, and Sally A. McKee from Chalmers
                 for their insightful suggestions. Minghua Li was
                 supported by IBM Research global summer intern
                 program.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Benchmark testing; Classifier design and
                 evaluation; Classifier design and evaluation, machine
                 learning, optimization, performance measures; Hardware;
                 hardware PMC; hardware prefetching automatic tuner; IBM
                 POWER8 processor; learning (artificial intelligence);
                 machine learning; multiprocessing systems;
                 Optimization; optimization; Optimization; PATer;
                 performance evaluation; performance measures;
                 performance monitoring counters; prediction accuracy
                 optimization algorithm; prefetch automatic tuner;
                 Prefetching; Runtime; storage management; Training;
                 two-phase prefetching selection algorithm",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Li:2016:PHP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Alian:2016:PGS,
  author =       "Mohammad Alian and Daehoon Kim and Nam Sung Kim",
  title =        "{pd-gem5}: Simulation Infrastructure for
                 Parallel\slash Distributed Computer Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "41--44",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2438295",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Improving the performance and power efficiency of a
                 single processor has been fraught with various
                 challenges stemming from the end of the classical
                 technology scaling. Thus, the importance of efficiently
                 running applications on a parallel/distributed computer
                 system has continued to increase. In developing and
                 optimizing such a parallel/distributed computer system,
                 it is critical to study the impact of the complex
                 interplay amongst processor, node, and network
                 architectures on performance and power efficiency in
                 detail. This necessitates a flexible, detailed and
                 open-source full-system simulation infrastructure.
                 However, our community lacks such an infrastructure. In
                 this paper, we present pd-gem5, a gem5-based
                  infrastructure that can model and simulate a
                  parallel/distributed computer system using multiple
                  simulation
                 hosts. Our experiment shows that pd-gem5 running on six
                 simulation hosts speeds up the simulation of a 24-node
                 computer system up to $ 3.2 \times $ compared with
                 running on a single simulation host.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, NS (Reprint Author), Univ Illinois, ECE Dept,
                 Urbana, IL 61801 USA. Alian, Mohammad; Kim, Daehoon;
                 Kim, Nam Sung, Univ Illinois, ECE Dept, Urbana, IL
                 61801 USA.",
  author-email = "nskim@illinois.edu",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CNS-1217102, CNS-1512981]; DARPA
                 [HR0011-12-2-0019]",
  funding-text = "This work was supported in part by NSF (CNS-1217102
                 and CNS-1512981) and DARPA (HR0011-12-2-0019) grants.
                 Nam Sung Kim has a financial interest in Samsung
                 Electronics and AMD. Daehoon Kim is the corresponding
                 author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computational modeling; digital
                 simulation; gem5; Handheld computers; Load modeling;
                 multiple simulation hosts; network; open-source
                 full-system simulation infrastructure; parallel
                 processing; parallel/distributed computer systems;
                 parallel/distributed simulation; pd-gem5; power aware
                 computing; public domain software; single processor
                 performance; single processor power efficiency; single
                 simulation host; Switches; Synchronization; technology
                 scaling",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Alian:2016:PGS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2016:RFE,
  author =       "Yoongu Kim and Weikun Yang and Onur Mutlu",
  title =        "{Ramulator}: a Fast and Extensible {DRAM} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "45--49",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2414456",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Recently, both industry and academia have proposed
                 many different roadmaps for the future of DRAM.
                 Consequently, there is a growing need for an extensible
                 DRAM simulator, which can be easily modified to judge
                 the merits of today's DRAM standards as well as those
                 of tomorrow. In this paper, we present Ramulator, a
                 fast and cycle-accurate DRAM simulator that is built
                 from the ground up for extensibility. Unlike existing
                 simulators, Ramulator is based on a generalized
                 template for modeling a DRAM system, which is only
                 later infused with the specific details of a DRAM
                 standard. Thanks to such a decoupled and modular
                 design, Ramulator is able to provide out-of-the-box
                 support for a wide array of DRAM standards: DDR3/4,
                 LPDDR3/4, GDDR5, WIO1/2, HBM, as well as some academic
                 proposals (SALP, AL-DRAM, TL-DRAM, RowClone, and SARP).
                 Importantly, Ramulator does not sacrifice simulation
                 speed to gain extensibility: according to our
                 evaluations, Ramulator is $ 2.5 \times $ faster than
                 the next fastest simulator. Ramulator is released under
                 the permissive BSD license.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, Y (Reprint Author), Carnegie Mellon Univ, Dept
                 Elect \& Comp Engn, Pittsburgh, PA 15213 USA. Kim,
                 Yoongu; Mutlu, Onur, Carnegie Mellon Univ, Dept Elect
                 \& Comp Engn, Pittsburgh, PA 15213 USA. Yang, Weikun,
                 Carnegie Mellon Univ, Pittsburgh, PA 15213 USA. Yang,
                 Weikun, Peking Univ, Dept Comp Sci, Beijing, Peoples R
                 China.",
  author-email = "yoongu.kim@gmail.com wkyjyy@gmail.com
                 omutlu@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF; SRC",
  funding-text = "We thank the SAFARI group members who have contributed
                 to the development of Ramulator, including Kevin Chang,
                 Saugata Ghose, Donghyuk Lee, Tianshi Li, and Vivek
                 Seshadri. We also thank the anonymous reviewers for
                 feedback. This work was supported by NSF, SRC, and
                 gifts from our industrial partners, including Google,
                 Intel, Microsoft, Nvidia, Samsung, Seagate and VMware.
                 Ramulator can be freely downloaded from
                  https://github.com/CMU-SAFARI/ramulator
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "circuit simulation; digital simulation; DRAM; DRAM
                 chips; DRAM simulator; DRAM standard; emerging
                 technologies; experimental methods; Hardware design
                 languages; Main memory; memory scaling; memory systems;
                 Nonvolatile memory; performance evaluation; performance
                 evaluation, experimental methods, emerging
                 technologies, memory systems, memory scaling;
                 Proposals; Ramulator; Random access memory; Runtime;
                 simulation; software tool; standards; Standards;
                 standards; Timing",
  keywords-plus = "LATENCY DRAM; RETHINKING",
  number-of-cited-references = "38",
  research-areas = "Computer Science",
  times-cited =  "29",
  unique-id =    "Kim:2016:RFE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Olson:2016:SIT,
  author =       "Lena E. Olson and Simha Sethumadhavan and Mark D.
                 Hill",
  title =        "Security Implications of Third-Party Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "50--53",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2445337",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Third-party accelerators offer system designers high
                 performance and low energy without the market delay of
                 in-house development. However, complex third-party
                 accelerators may include vulnerabilities due to design
                 flaws or malicious intent that are hard to expose
                 during verification. Rather than react to each new
                 vulnerability, it is better to proactively build
                 defenses for classes of attacks. To inspire future work
                 on defenses, this paper develops a taxonomy of
                 accelerator vulnerabilities. We consider the cross
                 product of threat types (confidentiality, integrity,
                 and availability) with risk categories (configuration,
                 computation, termination, accelerator memory accesses,
                 system memory accesses, microarchitecture/coherence,
                 exceptions/interrupts, and power), as well as whether
                 processes can be vulnerable only if they use the
                 offending accelerator (accelerator-scope threat) or
                 even when running in the same system (system-scope
                 threat). Our taxonomy draws attention to a grave
                 problem that needs immediate attention from computer
                 architects.",
  acknowledgement = ack-nhfb,
  affiliation =  "Olson, LE (Reprint Author), Univ Wisconsin, Dept Comp
                 Sci, 1210 W Dayton St, Madison, WI 53706 USA. Olson,
                 Lena E.; Hill, Mark D., Univ Wisconsin, Dept Comp Sci,
                 1210 W Dayton St, Madison, WI 53706 USA. Sethumadhavan,
                 Simha, Columbia Univ, Dept Comp Sci, New York, NY 10026
                 USA.",
  author-email = "lena@cs.wisc.edu simha@cs.columbia.edu
                 markhill@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [1054844]; Alfred P. Sloan Foundation;
                 [FA8750-10-2-0253]; [FA8650-11-C-7190]",
  funding-text = "This work is supported through grants
                 FA8750-10-2-0253, FA8650-11-C-7190, NSF 1054844 and the
                 Alfred P. Sloan Foundation. Opinions, findings,
                 conclusions and recommendations expressed in this
                 material are those of the authors and may not reflect
                 the views of the funding entities. The authors thank
                 Eric Sedlar, Dan Gibson, Multifacet, and UW-Madison
                 Computer Architecture Affiliates for valuable
                 feedback.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator architectures; accelerator
                 vulnerabilities; accelerator-scope threat; Coherence;
                 computer architecture; Computer bugs; Computer
                 security; Cryptography; Hardware; malicious intent;
                 market delay; Registers; risk categories; risk
                 management; system-scope threat; Taxonomy; third-party
                 accelerators",
  number-of-cited-references = "20",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Olson:2016:SIT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jacob:2016:CVC,
  author =       "Bruce Jacob",
  title =        "The Case for {VLIW--CMP} as a Building Block for
                 Exascale",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "54--57",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2424699",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Current ultra-high-performance computers execute
                 instructions at the rate of roughly 10 PFLOPS (10
                 quadrillion floating-point operations per second) and
                 dissipate power in the range of 10 MW. The next
                  generation will need to execute instructions at EFLOPS
                  rates --- 100x as fast as today's --- but without
                  dissipating any more power. To achieve this challenging
                  goal, the
                 emphasis is on power-efficient execution, and for this
                 we propose VLIW-CMP as a general architectural approach
                 that improves significantly on the power efficiency of
                 existing solutions. Compared to manycore architectures
                 using simple, single-issue cores, VLIW-CMP reduces both
                 power and die area, improves single-thread performance,
                 and maintains aggregate FLOPS per die. To improve
                 further on the power advantages of VLIW, we describe a
                 mechanism that reduces power dissipation of both data
                 forwarding and register-file activity.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jacob, B (Reprint Author), Univ Maryland, Dept Elect
                 \& Comp Engn, College Pk, MD 20742 USA. Jacob, Bruce,
                 Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD
                 20742 USA.",
  author-email = "blj@ece.umd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Computer architectures;
                 Computer architectures, high-performance computing,
                 energy efficiency, multicore; data forwarding activity;
                 EFLOPS rates; energy efficiency; high-performance
                 computing; manycore architectures; multicore;
                 multiprocessing systems; parallel architectures;
                 performance evaluation; PFLOPS; Pipelines; Ports
                 (Computers); power aware computing; power dissipation;
                 power-efficient execution; quadrillion floating-point
                 operations-per-second; Radio frequency; register-file
                 activity; Registers; single-thread performance
                 improvement; Software; ultra-high-performance
                 computers; VLIW; VLIW-CMP",
  keywords-plus = "REGISTER LIFETIME; ARCHITECTURE",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Jacob:2016:CVC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kleanthous:2016:TML,
  author =       "Marios Kleanthous and Yiannakis Sazeides and Emre Ozer
                 and Chrysostomos Nicopoulos and Panagiota Nikolaou and
                 Zacharias Hadjilambrou",
  title =        "Toward Multi-Layer Holistic Evaluation of System
                 Designs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "58--61",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2445877",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The common practice for quantifying the benefit(s) of
                 design-time architectural choices of server processors
                 is often limited to the chip- or server-level. This
                 quantification process invariably entails the use of
                  salient metrics, such as performance, power, and
                  reliability, which capture --- in a tangible manner --- a
                  design's overall ramifications. This paper argues for
                 the necessity of a more holistic evaluation approach,
                 which considers metrics across multiple integration
                 levels (chip, server and datacenter). In order to
                 facilitate said comprehensive evaluation, we utilize an
                  aggregate metric, e.g. the Total Cost of Ownership
                  (TCO), to harness the complexity of comparing multiple
                 metrics at multiple levels. We motivate our proposition
                 for holistic evaluation with a case study that compares
                 a 2D processor to a 3D processor at various design
                 integration levels. We show that while a 2D processor
                 is clearly the best choice at the processor level, the
                 conclusion is reversed at the data-center level, where
                 the 3D processor becomes a better choice. This result
                 emanates mainly from the performance benefits of
                 processor-DRAM 3D integration, and the ability to
                 amortize (at the datacenter-level) the higher 3D
                 per-server cost and lower reliability by requiring
                 fewer 3D servers to match the same performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kleanthous, M (Reprint Author), Univ Cyprus, Dept Comp
                 Sci, Nicosia, Cyprus. Kleanthous, Marios; Sazeides,
                 Yiannakis; Nikolaou, Panagiota; Hadjilambrou,
                 Zacharias, Univ Cyprus, Dept Comp Sci, Nicosia, Cyprus.
                 Nicopoulos, Chrysostomos, Univ Cyprus, Dept Elect \&
                 Comp Engn, Nicosia, Cyprus. Ozer, Emre, ARM Ltd, Res,
                 Cambridge CB19NJ, England.",
  author-email = "marios@kleanthous.info yanos@cs.ucy.ac.cy
                 emre.ozer@arm.com nicopoulos@ucy.ac.cy
                 nikolaou@cs.ucy.ac.cy zhadji01@cs.ucy.ac.cy",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "European Commission [612069 HARPA, 247779
                 EuroCloud]",
  funding-text = "This work was supported by the European Commission FP7
                 projects ``Harnessing Performance Variability'' (No:
                 612069 HARPA) and ``Energy-conscious 3D Server-on-Chip
                 for Green Cloud Services'' (No: 247779 EuroCloud).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "2D processor; 3D processor; Chip; chip; Chip; chip;
                 Computational modeling; computer centres; data-center
                 level; Datacenter; datacenter; Datacenter; datacenter;
                 design integration levels; Design-Space Exploration;
                 design-space exploration; Design-Space Exploration;
                 design-space exploration; design-time architectural
                 choices; DRAM chips; Evaluation Metrics; evaluation
                 metrics; Evaluation Metrics; Holistic evaluation;
                 Holistic Evaluation; Holistic evaluation; Holistic
                 Evaluation; Holistic evaluation; integrated circuit
                 reliability; Measurement; microprocessor chips;
                 multilayer holistic evaluation; multiple integration
                 levels; performance evaluation; processor-DRAM 3D
                 integration; Program processors; ramifications;
                 Reliability; reliability; Reliability; Server; server;
                 Server; server processors; Servers; system designs;
                 System-on-chip; Three-dimensional displays",
  keywords-plus = "PERFORMANCE",
  number-of-cited-references = "23",
  ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Kleanthous:2016:TML",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Daya:2016:THP,
  author =       "Bhavya K. Daya and Li-Shiuan Peh and Anantha P.
                 Chandrakasan",
  title =        "Towards High-Performance Bufferless {NoCs} with
                 {SCEPTER}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "62--65",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2428699",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In the many-core era, the network on-chip (NoC) is
                 playing a larger role in meeting performance, area and
                 power goals, as router buffers contribute greatly to
                 NoC area and power usage. Proposals have advocated
                 bufferless NoCs, however a performance wall has been
                 reached such that high throughput performance has not
                 been extracted. We present SCEPTER, a high-performance
                 bufferless mesh NoC that sets up single-cycle virtual
                 express paths dynamically across the chip, allowing
                 deflected packets to go through non-minimal paths with
                 no latency penalty. For a 64 node network, we
                 demonstrate an average 62 percent reduction in latency
                 and an average $ 1.3 \times $ higher throughput over a
                 baseline bufferless NoC for synthetic traffic patterns;
                 with comparable performance to a single-cycle multihop
                 buffered mesh network with six flit buffers, per input
                 port, in each router.",
  acknowledgement = ack-nhfb,
  affiliation =  "Daya, BK (Reprint Author), MIT, Dept EECS, 77
                 Massachusetts Ave, Cambridge, MA 02139 USA. Daya,
                 Bhavya K.; Peh, Li-Shiuan; Chandrakasan, Anantha P.,
                 MIT, Dept EECS, 77 Massachusetts Ave, Cambridge, MA
                 02139 USA.",
  author-email = "bdaya@mit.edu peh@csail.mit.edu anantha@mtl.mit.edu",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "64 node network; bufferless router; bypassing;
                 Computer architecture; deflection routing;
                 high-performance bufferless mesh NoC; latency
                 reduction; multiprocessor interconnection;
                 Multiprocessor interconnection; multiprocessor
                 interconnection; multiprocessor interconnection
                 networks; Multiprocessor interconnection, on-chip mesh
                 networks, bufferless router, deflection routing,
                 bypassing; network routing; network-on-chip; nonminimal
                 paths; on-chip mesh networks; performance evaluation;
                 Pipelines; Ports (Computers); power aware computing;
                 power usage; Resource management; router buffers;
                 Routing; SCEPTER; single-cycle express path traversal
                 for efficient routing; single-cycle virtual express
                 paths; Switches; synthetic traffic patterns;
                 Throughput",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Daya:2016:THP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2016:IICc,
  author =       "Anonymous",
  title =        "Introducing {IEEE Collabratec}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "66--66",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578800",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:IICd,
  author =       "Anonymous",
  title =        "Introducing {IEEE Collabratec}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "66--66",
  month =        jan # "\slash " # jun,
  year =         "2016",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578800",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 08:36:31 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "IEEE Collabratec is a new, integrated online community
                 where IEEE members, researchers, authors, and
                 technology professionals with similar fields of
                 interest can network and collaborate, as well as create
                 and manage content. Featuring a suite of powerful
                 online networking and collaboration tools, IEEE
                 Collabratec allows you to connect according to
                 geographic location, technical interests, or career
                 pursuits. You can also create and share a professional
                 identity that showcases key accomplishments and
                 participate in groups focused around mutual interests,
                 actively learning from and contributing to
                 knowledgeable communities. All in one place! Learn
                 about IEEE Collabratec at ieeecollabratec.org.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:ENM,
  author =       "Anonymous",
  title =        "Experience the Newest and Most Advanced Thinking in
                 Big Data Analytics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "67--67",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2581058",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement, IEEE.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:ICS,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Cyber Security}}}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "68--68",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2581078",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement, IEEE.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:TCa,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "C1--C1",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578758",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the table of contents for this issue of the
                 publication.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:Ca,
  author =       "Anonymous",
  title =        "Cover",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "C2--C2",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578759",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:Cb,
  author =       "Anonymous",
  title =        "Cover",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "C2--C2",
  month =        jan # "\slash " # jun,
  year =         "2016",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578759",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 08:36:31 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of board members, committee
                 members, editors, and society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:Cc,
  author =       "Anonymous",
  title =        "Cover",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "C3--C3",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578760",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:Cd,
  author =       "Anonymous",
  title =        "Cover",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "C3--C3",
  month =        jan # "\slash " # jun,
  year =         "2016",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578760",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 08:36:31 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "These instructions give guidelines for preparing
                 papers for this publication. Presents information for
                 authors publishing in this journal.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:BC,
  author =       "Anonymous",
  title =        "{[Back} cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "C4--C4",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2578761",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the table of contents for this issue of the
                 publication.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Liang:2016:CGR,
  author =       "Shuang Liang and Shouyi Yin and Leibo Liu and Yike Guo
                 and Shaojun Wei",
  title =        "A Coarse-Grained Reconfigurable Architecture for
                 Compute-Intensive {MapReduce} Acceleration",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "69--72",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2458318",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Large-scale workloads often show parallelism of
                  different levels, which offers acceleration potential
                  for clusters and parallel processors. Although
                 processors such as GPGPUs and FPGAs show good
                 performance of speedup, there is still vacancy for a
                 low power, high efficiency and dynamically
                 reconfigurable one, and coarse-grained reconfigurable
                 architecture (CGRA) seems to be one possible choice. In
                 this paper, we introduce how we use our CGRA fabric
                 Chameleon to realize a dynamically reconfigurable
                 acceleration to MapReduce-based (MR-based)
                 applications. A FPGA-shell-CGRA-core (FSCC)
                 architecture is designed for the acceleration
                 PCI-Express board, and a programming model with
                 compilation flow for CGRA is presented. With the
                 supports above, a small evaluation cluster with Hadoop
                 framework is set up, and experiments on
                 compute-intensive applications show that the
                  programming process is significantly simplified, with
                  a 30--60x speedup offered under low power.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yin, SY (Reprint Author), Tsinghua Univ, Inst
                 Microelect, Beijing 100084, Peoples R China. Liang,
                 Shuang; Yin, Shouyi; Liu, Leibo; Wei, Shaojun, Tsinghua
                 Univ, Inst Microelect, Beijing 100084, Peoples R China.
                 Guo, Yike, Imperial Coll London, Dept Comp, London,
                 England.",
  author-email = "s-liang11@mails.tsinghua.edu.cn yinsy@tsinghua.edu.cn
                 liulb@mail.tsinghua.edu.cn fiascoo@gmail.com
                 wsj@tsinghua.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Nature Science foundation of China
                 [61274131]; International S\&T Cooperation Project of
                 China [2012DFA11170]; Tsinghua Indigenous Research
                 Project [20111080997]; China National High Technologies
                 Research Program [2012-AA012701]",
  funding-text = "This work was supported by the National Nature Science
                 foundation of China (No. 61274131), the International
                 S\&T Cooperation Project of China (No. 2012DFA11170),
                 the Tsinghua Indigenous Research Project (No.
                 20111080997) and the China National High Technologies
                 Research Program (No. 2012-AA012701). S. Yin is the
                 corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator; Accelerators; Computer architecture;
                 Field programmable gate arrays; Hardware; MapReduce;
                 Programming; Reconfigurable architectures;
                 Reconfigurable computing; Servers",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Liang:2016:CGR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper (vol. 15, no. 2, pp. 73--76) defining a ``Data Reuse
%%% Degree'' metric to quantify data reuse in SIMT (GPU-style)
%%% applications, evaluated on an abstracted SIMT processor model.
%%% NOTE(review): the title as recorded may be missing a word
%%% (``Method to [Study?] Data Reuse Patterns''; the abstract reads
%%% ``quantitative method to study the data reuse'') --- verify
%%% against the publisher record for DOI 10.1109/LCA.2015.2491279.
@Article{Lai:2016:QMD,
  author =       "Bo-Cheng Charles Lai and Luis Garrido Platero and
                 Hsien-Kai Kuo",
  title =        "A Quantitative Method to Data Reuse Patterns of {SIMT}
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "73--76",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2491279",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Understanding data reuse patterns of a computing
                 system is crucial to effective design optimization. The
                 emerging Single Instruction Multiple Threads (SIMT)
                 processor adopts a programming model that is
                 fundamentally disparate from conventional scalar
                 processors. There is a lack of analytical approaches to
                 quantify the data reuse of SIMT applications. This
                 paper presents a quantitative method to study the data
                 reuse inherent to SIMT applications. A metric, Data
                 Reuse Degree, is defined to measure the amount of
                 reused data between memory references, and associate
                 each data reuse degree to a temporal distance
                 representing the virtual time of the execution process.
                 The experiments are performed on an abstracted SIMT
                 processor that considers the programming model and
                 runtime specifics. The experiments illustrate diverse
                 data reuse patterns of SIMT applications and explore
                 the impacts of architectural limitations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lai, BCC (Reprint Author), Natl Chiao Tung Univ, Dept
                 Elect Engn, Hsinchu 300, Taiwan. Lai, Bo-Cheng Charles,
                 Natl Chiao Tung Univ, Dept Elect Engn, Hsinchu 300,
                 Taiwan. Platero, Luis Garrido, Barcelona Super Comp
                 Ctr, Barcelona, Spain. Kuo, Hsien-Kai, MediaTek Inc,
                 Hsinchu, Taiwan.",
  author-email = "bclai@mail.nctu.edu.tw luis.garrido.platero@gmail.com
                 hsienkai.kuo@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "MOST [104-2221-E-009-079]",
  funding-text = "This project was supported by MOST grant
                 104-2221-E-009-079.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural limitations; cache memory; Cache memory;
                 computing system; data analysis; data reuse degree;
                 data reuse patterns; design optimization; execution
                 process; Graphics processing units; Instruction sets;
                 Measurement; Memory management; multi-threading;
                 Parallel architectures; Parallel architectures, cache
                 memory, parallel processing; parallel processing;
                 Parallel processing; programming model; scalar
                 processors; SIMT applications; SIMT processors;
                 single-instruction multiple-threads processors; virtual
                 time",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Lai:2016:QMD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper (vol. 15, no. 2, pp. 77--80) proposing Cyclic
%%% Power-Gating (CPG) on state-retentive power-gated cores as an
%%% alternative to DVFS for fine-grained power control.
%%% NOTE(review): corrected the fourth author's name to the accented
%%% form ``Luj{\'a}n'' (matching the classic-BibTeX accent usage
%%% already present in this entry, e.g. {\c{C}}akmak{\c{c}}i), and
%%% repaired the garbled phrase ``Mike Lujan an is funded'' in the
%%% funding-text field; raw Web-of-Science fields (affiliation)
%%% are otherwise left in their unaccented ASCII form.
@Article{Cakmakci:2016:CPG,
  author =       "Yaman {\c{C}}akmak{\c{c}}i and Will Toms and Javier
                 Navaridas and Mikel Luj{\'a}n",
  title =        "Cyclic Power-Gating as an Alternative to Voltage and
                 Frequency Scaling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "77--80",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2478784",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Dynamic Voltage and Frequency Scaling is the most
                 commonly used power management technique in modern
                 processors. However, the ability of an individual chip
                 to operate under reduced supply voltage can no longer
                 be predetermined at the design stage and may even
                 change over time. This paper presents Cyclic
                 Power-Gating (CPG), a novel power management strategy
                 where the power consumption of a core can be finely
                 controlled without scaling the supply voltage. CPG
                 builds on state-retentive power-gating which allows the
                 power supply to a core to be switched off and on again
                 at high speed (tens of clock cycles) with minimal
                 disruption to running programs. The power-gating is
                 cyclic, by altering the ratio of time spent powered-on
                 and off in each power-gating period the effective
                 operating frequency and power consumption of a core can
                 be controlled. The overheads in delay and power
                 consumption of CPG for an out-of-order core in a 14 nm
                 technology are accurately modelled and compared to the
                 performance and power consumption of Voltage/Frequency
                 pairs in the same technology. The proposed power gating
                 method reduces average power consumption by 4 percent
                 over voltage and frequency scaling with only a 2
                 percent degradation in performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), Univ
                 Manchester, Sch Comp Sci, Manchester M13 9PL, Lancs,
                 England. {\c{C}}akmak{\c{c}}i, Yaman; Toms, Will;
                 Navaridas, Javier; Lujan, Mikel, Univ Manchester, Sch
                 Comp Sci, Manchester M13 9PL, Lancs, England.",
  author-email = "cakmakcy@cs.man.ac.uk tomsw@cs.man.ac.uk
                 javier.navaridas@manchester.ac.uk
                 mikel.lujan@manchester.ac.uk",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "EPSRC [DOME EP/J016330/1, PAMELA
                 EP/K008730/1]; Royal Society University Research
                 Fellowship; Engineering and Physical Sciences Research
                 Council [EP/K008730/1, EP/J016330/1]",
  funding-text = "This work was supported by EPSRC grants DOME
                 EP/J016330/1 and PAMELA EP/K008730/1. Mikel Lujan is
                 funded by a Royal Society University Research
                 Fellowship. The authors thank Timothy Jones for his
                 comments on the draft version of this paper.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Capacitance; Computer architecture;
                 CPG; cyclic power-gating; Energy efficiency; frequency
                 scaling; leakage reduction; power aware computing;
                 power consumption; Power demand; Power efficient
                 design; power management; power management strategy;
                 state-retentive power-gating; Voltage measurement;
                 voltage scaling",
  number-of-cited-references = "12",
  oa =           "Bronze",
  ORCID-numbers = "Navaridas Palma, Javier/0000-0001-7272-6597",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Cakmakci:2016:CPG",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper (vol. 15, no. 2, pp. 81--84) arguing that core
%%% diversity should be an explicit design goal for single-ISA
%%% heterogeneous processors, and proposing the Kolmogorov--Smirnov
%%% (KS) statistical test as an evaluation metric.
@Article{Tomusk:2016:DDG,
  author =       "Erik Tomusk and Christophe Dubach and Michael
                 O'Boyle",
  title =        "Diversity: a Design Goal for Heterogeneous
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "81--84",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2499739",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A growing number of processors have CPU cores that
                 implement the same instruction set architecture (ISA)
                 using different microarchitectures. The underlying
                 motivation for single-ISA heterogeneity is that a
                 diverse set of cores can enable runtime flexibility.
                 Modern processors are subject to strict power budgets,
                 and heterogeneity provides the runtime scheduler with
                 more latitude to decide the level of performance a
                 program should have based on the amount of power that
                 can be spent. We argue that selecting a diverse set of
                 heterogeneous cores to enable flexible operation at
                 runtime is a non-trivial problem due to diversity in
                 program behavior. We further show that common
                 evaluation methods lead to false conclusions about
                 diversity. Finally, we suggest the KS statistical test
                 as an evaluation metric. The KS test is the first step
                 toward a heterogeneous design methodology that
                 optimizes for runtime flexibility.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tomusk, E (Reprint Author), Univ Edinburgh, Edinburgh,
                 Midlothian, Scotland. Tomusk, Erik; Dubach, Christophe;
                 O'Boyle, Michael, Univ Edinburgh, Edinburgh,
                 Midlothian, Scotland.",
  author-email = "e.tomusk@ed.ac.uk christophe.dubach@ed.ac.uk
                 mob@inf.ed.ac.uk",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; core
                 selection; CPU cores; design goal; Diversity;
                 flexibility; heterogeneity; heterogeneous cores;
                 heterogeneous design methodology; heterogeneous
                 processors; instruction set architecture; instruction
                 sets; integrated circuit design; ISA;
                 Kolmogorov-Smirnov test; KS statistical test;
                 Measurement; metrics; Microarchitecture;
                 microarchitectures; microprocessor chips; power aware
                 computing; Program processors; Runtime; runtime
                 flexibility; runtime scheduler; statistical testing",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Tomusk:2016:DDG",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper (vol. 15, no. 2, pp. 85--88) on energy-efficient
%%% execution of bursty, event-driven applications (web browsers):
%%% consolidating threads onto one core and dynamic hardware-driven
%%% thread migration/scheduling for activity bursts.
@Article{Hashemi:2016:EEB,
  author =       "Milad Hashemi and Debbie Marr and Doug Carmean and
                 Yale N. Patt",
  title =        "Efficient Execution of Bursty Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "85--88",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2456013",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The performance of user-facing applications is
                 critical to client platforms. Many of these
                 applications are event-driven and exhibit ``bursty''
                 behavior: the application is generally idle but
                 generates bursts of activity in response to human
                 interaction. We study one example of a bursty
                 application, web-browsers, and produce two important
                 insights: (1) Activity bursts contain false
                 parallelism, bringing many cores out of a deep sleep to
                 inefficiently render a single webpage, and (2) these
                 bursts are highly compute driven, and thus scale nearly
                 linearly with frequency. We show average performance
                 gains/energy reductions of 14\%/17\% respectively on
                 real hardware by statically moving threads from
                 multiple cores to a single core. We then propose
                 dynamic hardware driven thread migration and scheduling
                 enhancements that detect these bursts, leading to
                 further benefits.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hashemi, M (Reprint Author), Univ Texas Austin, Elect
                 \& Comp Engn, Austin, TX 78701 USA. Hashemi, Milad;
                 Patt, Yale N., Univ Texas Austin, Elect \& Comp Engn,
                 Austin, TX 78701 USA. Marr, Debbie, Intel Corp, Intel
                 Labs, Portland, OR USA. Carmean, Doug, Microsoft,
                 Microsoft Res, Seattle, WA USA.",
  author-email = "miladh@hps.utexas.edu debbie.marr@intel.com
                 dcarmean@microsoft.com patt@hps.utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Intel Corporation; Cockrell Foundation; HPS
                 Research Group",
  funding-text = "The authors thank Intel Corporation and the Cockrell
                 Foundation for their continued generous financial
                 support of the HPS Research Group.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Browsers; bursty applications; dynamic hardware;
                 Energy; energy reductions; Hardware; human computer
                 interaction; human interaction; Instruction sets;
                 Internet; Loading; multi-threading; Multicore
                 processing; multiple cores; multiprocessing systems;
                 online front-ends; Operating systems; performance;
                 performance evaluation; performance gains; power aware
                 computing; thread migration; thread scheduling;
                 Web-browsers; Webpage; webpages; webpages, thread
                 scheduling",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hashemi:2016:EEB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper (vol. 15, no. 2, pp. 89--92) proposing Energy Aware
%%% Persistence (EAP): energy-efficient durability mechanisms
%%% (flexible logging, relaxed durability ACI-RD) for byte-addressable
%%% nonvolatile memory.
%%% NOTE(review): corrected the second author's name from the
%%% misspelled ``Moinudin'' to ``Moinuddin'' (Moinuddin Qureshi,
%%% Georgia Tech) in both the author and affiliation fields; verify
%%% against the publisher record for DOI 10.1109/LCA.2015.2472410.
@Article{Kannan:2016:EAP,
  author =       "Sudarsun Kannan and Moinuddin Qureshi and Ada
                 Gavrilovska and Karsten Schwan",
  title =        "Energy Aware Persistence: Reducing the Energy
                 Overheads of Persistent Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "89--92",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2472410",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Next generation byte addressable nonvolatile memory
                 (NVM) technologies like PCM are attractive for end-user
                 devices as they offer memory scalability as well as
                 fast persistent storage. In such environments, NVM's
                 limitations of slow writes and high write energy are
                 magnified for applications that need atomic,
                 consistent, isolated and durable (ACID) updates. This
                 is because, for satisfying correctness (ACI),
                 application state must be frequently flushed from all
                 intermediate buffers, including processor cache, and to
                 support durability (D) guarantees, that state must be
                 logged. This increases NVM access and more importantly
                 results in additional CPU instructions. This paper
                 proposes Energy Aware Persistence (EAP). To develop
                 EAP, we first show that the energy related overheads
                 for maintaining durability are significant. We then
                 propose energy-efficient durability principles that
                 mitigate those costs, an example being flexible logging
                 that switch between performance and energy-efficient
                 modes and a memory management technique that trades
                 capacity for energy. Finally, we propose relaxed
                 durability (ACI-RD) mechanism used under critical low
                 energy conditions that do not affect correctness. The
                 initial results for several realistic applications and
                 benchmark show up to 2x reduction in CPU and NVM energy
                 usage relative to a traditional ACID-based
                 persistence.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kannan, S (Reprint Author), Georgia Inst Technol,
                 Atlanta, GA 30332 USA. Kannan, Sudarsun; Qureshi,
                 Moinuddin; Gavrilovska, Ada; Schwan, Karsten, Georgia
                 Inst Technol, Atlanta, GA 30332 USA.",
  author-email = "sudarsun@gatech.edu moin@ece.gatech.edu
                 ada@cc.gatech.edu schwan@cc.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ACI-RD mechanism; ACID; ACID updates; ACID-based
                 persistence; atomic-consistent-isolated-durable
                 updates; Benchmark testing; cache storage; CPU energy
                 usage; CPU instructions; EAP; end-user devices; energy
                 aware persistence; Energy management; energy overhead
                 reduction; energy overheads; energy-efficient
                 durability principles; energy-efficient modes;
                 heap-based persistence; logging; memory management;
                 microprocessor chips; next generation byte addressable
                 nonvolatile memory; next generation byte addressable
                 NVM; Nonvolatile memory; NVM; NVM access; NVM energy
                 usage; Optimization; performance evaluation; persistent
                 memory; power aware computing; processor cache; Random
                 access memory; random-access storage; Resource
                 management; storage management",
  keywords-plus = "PHASE-CHANGE MEMORY",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kannan:2016:EAP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper (vol. 15, no. 2, pp. 93--96) on microarchitectural
%%% techniques that spread and reduce cell flips in the L1 data cache
%%% to mitigate Hot Carrier Injection (HCI) aging. Duplicated keyword
%%% variants (e.g. ``Hot Carrier Injection'' repeated) are preserved
%%% verbatim from the raw Web-of-Science record.
@Article{Valero:2016:ELD,
  author =       "Alejandro Valero and Negar Miralaei and Salvador Petit
                 and Julio Sahuquillo and Timothy M. Jones",
  title =        "Enhancing the {L1} Data Cache Design to Mitigate
                 {HCI}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "93--96",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2460736",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Over the lifetime of a microprocessor, the Hot Carrier
                 Injection (HCI) phenomenon degrades the threshold
                 voltage, which causes slower transistor switching and
                 eventually results in timing violations and faulty
                 operation. This effect appears when the memory cell
                 contents flip from logic `0' to `1' and vice versa. In
                 caches, the majority of cell flips are concentrated
                 into only a few of the total memory cells that make up
                 each data word. In addition, other researchers have
                 noted that zero is the most commonly-stored data value
                 in a cache, and have taken advantage of this behavior
                 to propose data compression and power reduction
                 techniques. Contrary to these works, we use this
                 information to extend the lifetime of the caches by
                 introducing two microarchitectural techniques that
                 spread and reduce the number of flips across the
                 first-level (L1) data cache cells. Experimental results
                 show that, compared to the conventional approach, the
                 proposed mechanisms reduce the highest cell flip peak
                 up to 65.8 percent, whereas the threshold voltage
                 degradation savings range from 32.0 to 79.9 percent
                 depending on the application.",
  acknowledgement = ack-nhfb,
  affiliation =  "Valero, A (Reprint Author), Univ Politecn Valencia,
                 Dept Comp Engn, Valencia, Spain. Valero, Alejandro;
                 Petit, Salvador; Sahuquillo, Julio, Univ Politecn
                 Valencia, Dept Comp Engn, Valencia, Spain. Miralaei,
                 Negar; Jones, Timothy M., Univ Cambridge, Comp Lab,
                 Cambridge, England.",
  author-email = "alvabre@gap.upv.es negar.miralaei@cl.cam.ac.uk
                 spetit@disca.upv.es jsahuqui@disca.upv.es
                 timothy.jones@cl.cam.ac.uk",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministerio de Economia y
                 Competitividad (MINECO); FEDER funds
                 [TIN2012-38341-C04-01]; Intel Early Career Faculty
                 Honor Program Award; HiPEAC Collaboration Grant-FP7
                 HiPEAC Network of Excellence [287759]; Engineering and
                 Physical Sciences Research Council (EPSRC)
                 [EP/K026399/1, EP/J016284/1]; Engineering and Physical
                 Sciences Research Council [EP/J016284/1,
                 EP/K026399/1]",
  funding-text = "This work has been supported by the Spanish Ministerio
                 de Economia y Competitividad (MINECO), by FEDER funds
                 through Grant TIN2012-38341-C04-01, by the Intel Early
                 Career Faculty Honor Program Award, by a HiPEAC
                 Collaboration Grant funded by the FP7 HiPEAC Network of
                 Excellence under grant agreement 287759, and by the
                 Engineering and Physical Sciences Research Council
                 (EPSRC) through Grants EP/K026399/1 and EP/J016284/1.
                 Additional data related to this publication are
                 available in the data repository at
                 https://www.repository.cam.ac.uk/handle/1810/249006.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cache memories; Cache memory; cache storage; cell flip
                 peaks; cell flips; commonly-stored data value; data
                 compression; Degradation; faulty operation; first-level
                 data cache cells; HCI mitigation; Hot carrier effects;
                 Hot Carrier Injection; hot carrier injection; Hot
                 Carrier Injection; hot carriers; Human computer
                 interaction; L1 data cache design; memory architecture;
                 memory cells; microarchitectural techniques;
                 microprocessor chips; microprocessor lifetime;
                 Microprocessors; power aware computing; power
                 reduction; Program processors; threshold voltage
                 degradation; transistor switching; Voltage
                 measurement",
  number-of-cited-references = "10",
  oa =           "Green Accepted, Green Published",
  ORCID-numbers = "Valero, Alejandro/0000-0002-0824-5833",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Valero:2016:ELD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letters paper (vol. 15, no. 2, pp. 97--100) on estimating per-core
%%% GPGPU power from total power via work/core ``footprint'' models
%%% tracked with two hardware counters per GPU core, enabling
%%% power-performance Pareto frontiers for per-core DVFS.
@Article{Sen:2016:GFM,
  author =       "Rathijit Sen and David A. Wood",
  title =        "{GPGPU} Footprint Models to Estimate per-Core Power",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "97--100",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2456909",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We explore the problem of how to easily estimate the
                 per-core power distribution of GPGPUs from the total
                 power of all cores. We show that the dynamic energy
                 consumption of a core for a given kernel, represented
                 by its work footprint, is approximately proportional to
                 the total time taken by all work units executing on
                 that core, and the static power, represented by its
                 core footprint, is proportional to the time that the
                 core has assigned work. Footprints can be easily
                 tracked using two hardware counters per GPU core. We
                 also show how per-core power estimates can be used to
                 compute power-performance pareto frontiers that
                 identify opportunities for saving power and energy in
                 cases of non-uniform work distribution by exploiting
                 per-core DVFS support for GPGPUs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sen, R (Reprint Author), Univ Wisconsin, Dept Comp
                 Sci, 1210 W Dayton St, Madison, WI 53706 USA. Sen,
                 Rathijit; Wood, David A., Univ Wisconsin, Dept Comp
                 Sci, 1210 W Dayton St, Madison, WI 53706 USA.",
  author-email = "rathijit@cs.wisc.edu david@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation (NSF)
                 [CCF-1218323, CNS-1302260]",
  funding-text = "The authors thank Srilatha Manne, Indrani Paul, and
                 Wei Huang for discussions about per-core DVFS support
                 in GPUs and Mark Hill, Jason Power, anonymous
                 reviewers, and the Associate Editor for helpful review
                 comments. This work was supported in part with US
                 National Science Foundation (NSF) grants CCF-1218323
                 and CNS-1302260. The views expressed herein are not
                 necessarily those of the NSF. Wood has significant
                 financial interests in AMD and Google.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; Computational modeling; DVFS;
                 dynamic energy consumption; energy consumption;
                 footprint; GPGPU; GPGPU footprint models; GPGPU
                 per-core power distribution; Graphics processing units;
                 graphics processing units; Mathematical model; Pareto
                 analysis; pareto frontier; Pareto optimization;
                 per-core DVFS support; per-core power estimation;
                 power; power aware computing; Power distribution;
                 power-performance Pareto frontiers; Predictive models;
                 static power",
  keywords-plus = "PERFORMANCE",
  number-of-cited-references = "12",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Sen:2016:GFM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jung:2016:LPS,
  author =       "Daejin Jung and Sheng Li and Jung Ho Ahn",
  title =        "Large Pages on Steroids: Small Ideas to Accelerate Big
                 Memory Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2495103",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Utilizing small (e.g., 4 KB) pages incurs frequent TLB
                 misses on modern big memory applications, substantially
                 degrading the performance of the system. Large (e.g., 1
                 GB) pages or direct segments can alleviate this penalty
                 due to page table walks, but at the same time such a
                 strategy exposes the organizational and operational
                 details of modern DRAM-based memory systems to
                 applications. Row-buffer conflicts caused by accesses
                 heading to the same DRAM bank but different rows from
                 multiple threads are regarded as the main culprits
                 behind the very large gaps between peak and achieved
                 main memory throughput, but hardware-based approaches
                 in memory controllers have achieved only limited
                 success whereas existing proposals that change memory
                 allocators cannot be applied to large pages or direct
                 segments. In this paper, we propose a set of
                 application-level techniques to improve the effective
                 main memory bandwidth. The techniques stem from the two
                 key observations that (1) each thread of an application
                 exclusively accesses certain datasets for a short or
                 long period of time, and (2) superfluous memory reads
                 originating from a cache's write allocation policy can
                 be avoided if scatters during the data shuffling pass
                 through intermediate cache-friendly buffers.
                 Experiments with a contemporary x86 server show that
                 combining large pages with the proposed address
                 linearization, bank coloring, and write streaming
                 techniques improves the performance of the three big
                 memory applications of high-throughput key-value store,
                 fast-Fourier transform, and radix sort by 37.6, 22.9,
                 and 68.1 percent, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jung, D (Reprint Author), Seoul Natl Univ, Dept
                 Transdisciplinary Studies, Seoul, South Korea. Jung,
                 Daejin; Ahn, Jung Ho, Seoul Natl Univ, Dept
                 Transdisciplinary Studies, Seoul, South Korea. Li,
                 Sheng, Intel Labs, Santa Clara, CA USA. Ahn, Jung Ho,
                 Seoul Natl Univ, Big Data Inst, Seoul, South Korea.",
  author-email = "haidj@snu.ac.kr sheng.r.li@intel.com gajh@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Research Foundation of Korea -
                 Korea government [NRF-2014R1A2A1A11052936,
                 NRF-2012M3A9D1054622]",
  funding-text = "The authors thank Jongwook Chung and Jaeyoon Choi on
                 their contributions to application writing and
                 experiments. This work was partially supported by the
                 National Research Foundation of Korea grant funded by
                 the Korea government (NRF-2014R1A2A1A11052936 and
                 NRF-2012M3A9D1054622). Jung Ho Ahn is also with Big
                 Data Institute, Seoul National University.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address linearization; application-level techniques;
                 Bandwidth; bank coloring; big memory applications;
                 cache storage; cache write allocation policy;
                 cache-friendly buffers; data shuffling; DRAM bank; DRAM
                 chips; DRAM-based memory; fast-Fourier transform;
                 high-throughput key-value store; Instruction sets;
                 large pages; memory allocators; memory bandwidth;
                 memory controllers; Memory management; memory
                 throughput; multi-threading; multiple threads;
                 Performance gain; Physical-to-DRAM address mapping;
                 radix sort; Random access memory; row-buffer conflicts;
                 Servers; superfluous memory reads; write streaming",
  number-of-cited-references = "14",
  ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Jung:2016:LPS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Verdu:2016:PSA,
  author =       "Javier Verdu and Alex Pajuelo",
  title =        "Performance Scalability Analysis of {JavaScript}
                 Applications with {Web} Workers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2494585",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/java2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Web applications are getting closer to the performance
                 of native applications taking advantage of new
                 standard-based technologies. The recent HTML5 standard
                 includes, among others, the Web Workers API that allows
                 executing JavaScript applications on multiple threads,
                 or workers. However, the internals of the browser's
                  JavaScript virtual machine do not expose a direct
                  relation between workers and running threads in the
                 browser and the utilization of logical cores in the
                 processor. As a result, developers do not know how
                 performance actually scales on different environments
                 and therefore what is the optimal number of workers on
                 parallel JavaScript codes. This paper presents the
                 first performance scalability analysis of parallel web
                 apps with multiple workers. We focus on two case
                 studies representative of different worker execution
                 models. Our analyses show performance scaling on
                 different parallel processor microarchitectures and on
                 three major web browsers in the market. Besides, we
                 study the impact of co-running applications on the web
                 app performance. The results provide insights for
                 future approaches to automatically find out the optimal
                 number of workers that provide the best tradeoff
                 between performance and resource usage to preserve
                 system responsiveness and user experience, especially
                 on environments with unexpected changes on system
                 workload.",
  acknowledgement = ack-nhfb,
  affiliation =  "Verdu, J (Reprint Author), BarcelonaTECH UPC, Dept
                 Comp Architecture, Barcelona, Spain. Verdu, Javier;
                 Pajuelo, Alex, BarcelonaTECH UPC, Dept Comp
                 Architecture, Barcelona, Spain.",
  author-email = "jverdu@ac.upc.edu mpajuelo@ac.upc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Economy and
                 Competitiveness (MINECO) [TIN2012-34557]",
  funding-text = "This work has been supported by the Spanish Ministry
                 of Economy and Competitiveness (MINECO) under contract
                 TIN2012-34557.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application program interfaces; Benchmark testing;
                 Browsers; Computer architecture; HTML5; HTML5 standard;
                 hypermedia markup languages; Internet; Java;
                 javascript; JavaScript applications; Message systems;
                 Microarchitecture; multithreading; Multithreading;
                 multithreading; online front-ends; parallel processing;
                 parallel processor microarchitectures; parallel Web
                 apps; parallelism; performance scalability analysis;
                 resource usage; Scalability; standard-based
                 technologies; system responsiveness preservation; user
                 experience; Web applications; web apps; Web browsers;
                 web workers; Web workers API; worker execution models",
  number-of-cited-references = "12",
  oa =           "Green Published",
  ORCID-numbers = "Pajuelo, Alex/0000-0002-5510-6860 Verdu Mula,
                 Javier/0000-0003-4485-2419",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Verdu:2016:PSA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Delimitrou:2016:SID,
  author =       "Christina Delimitrou and Christos Kozyrakis",
  title =        "Security Implications of Data Mining in Cloud
                 Scheduling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "109--112",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2461215",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cloud providers host an increasing number of popular
                 applications, on the premise of resource flexibility
                 and cost efficiency. Most of these systems expose
                 virtualized resources of different types and sizes. As
                 instances share the same physical host to increase
                 utilization, they contend on hardware resources, e.g.,
                 last-level cache, making them vulnerable to
                 side-channel attacks from co-scheduled applications. In
                 this work we show that using data mining techniques can
                 help an adversarial user of the cloud determine the
                 nature and characteristics of co-scheduled applications
                 and negatively impact their performance through
                 targeted contention injections. We design Bolt, a
                 simple runtime that extracts the sensitivity of
                 co-scheduled applications to various types of
                 interference and uses this signal to determine the type
                 of these applications by applying a set of data mining
                 techniques. We validate the accuracy of Bolt on a
                 39-server cluster. Bolt correctly identifies the type
                 and characteristics of 81 percent out of 108 victim
                 applications, and constructs specialized contention
                 signals that degrade their performance. We also use
                 Bolt to find the most commonly-run applications on EC2.
                 We hope that underlining such security vulnerabilities
                 in modern cloud facilities will encourage cloud
                 providers to introduce stronger resource isolation
                 primitives in their systems.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delimitrou, C (Reprint Author), Stanford Univ, Dept
                 Elect Engn, Stanford, CA 94305 USA. Delimitrou,
                 Christina; Kozyrakis, Christos, Stanford Univ, Dept
                 Elect Engn, Stanford, CA 94305 USA.",
  author-email = "cdel@stanford.edu kozyraki@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "39-server cluster; application studies resulting in
                 better multiple-processor systems; Bolt; Cloud
                 computing; cloud computing; cloud facilities; cloud
                 providers; co-scheduled applications; Computer crime;
                 cost efficiency; cryptography; data mining; Data
                 mining; Degradation; Interference; resource allocation;
                 resource flexibility; resource isolation primitives;
                 scheduling and task partitioning; security and privacy
                 protection; security vulnerabilities; Servers;
                 side-channel attacks; specialized contention signals;
                 Super (very large) computers; virtualized resources",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Delimitrou:2016:SID",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wang:2016:SMF,
  author =       "Zhenning Wang and Jun Yang and Rami Melhem and Bruce
                 Childers and Youtao Zhang and Minyi Guo",
  title =        "Simultaneous Multikernel: Fine-Grained Sharing of
                 {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "113--116",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2477405",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Studies show that non-graphics programs can be less
                 optimized for the GPU hardware, leading to significant
                 resource under-utilization. Sharing the GPU among
                 multiple programs can effectively improve utilization,
                 which is particularly attractive to systems (e.g.,
                 cloud computing) where many applications require access
                 to the GPU. However, current GPUs lack proper
                 architecture features to support sharing. Initial
                 attempts are very preliminary in that they either
                 provide only static sharing, which requires
                 recompilation or code transformation, or they do not
                 effectively improve GPU resource utilization. We
                 propose Simultaneous Multikernel (SMK), a fine-grained
                 dynamic sharing mechanism, that fully utilizes
                 resources within a streaming multiprocessor by
                 exploiting heterogeneity of different kernels. We
                 extend the GPU hardware to support SMK, and propose
                 several resource allocation strategies to improve
                 system throughput while maintaining fairness. Our
                 evaluation of 45 shared workloads shows that SMK
                 improves GPU throughput by 34 percent over non-shared
                 execution and 10 percent over a state-of-the-art
                 design.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, ZN (Reprint Author), Shanghai Jiao Tong Univ,
                 Dept Comp Sci, Shanghai, Peoples R China. Wang,
                 Zhenning; Guo, Minyi, Shanghai Jiao Tong Univ, Dept
                 Comp Sci, Shanghai, Peoples R China. Yang, Jun, Univ
                 Pittsburgh, Elect \& Comp Engn Dept, Pittsburgh, PA
                 15260 USA. Melhem, Rami; Childers, Bruce; Zhang,
                 Youtao, Univ Pittsburgh, Dept Comp Sci, Pittsburgh, PA
                 15260 USA.",
  author-email = "znwang@sjtu.edu.cn juy9@pitt.edu melhem@cs.pitt.edu
                 childers@cs.pitt.edu zhangyt@cs.pitt.edu
                 guo-my@cs.sjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Basic Research 973 Program of
                 China [2015CB352403]; National Natural Science
                 Foundation of China (NSFC) [61261160502, 61272099]; CSC
                 scholarship; US National Science Foundation (NSF)
                 [CNS-1012070, CNS-1305220, CCF-1422331]",
  funding-text = "This work is supported in part by the National Basic
                 Research 973 Program of China (No. 2015CB352403), the
                 National Natural Science Foundation of China (NSFC)
                 (Nos. 61261160502, 61272099), the CSC scholarship, US
                 National Science Foundation (NSF) grants CNS-1012070,
                 CNS-1305220, and CCF-1422331.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Context switch; fine-grained dynamic sharing
                 mechanism; GPU; GPU hardware; GPU resource utilization
                 improvement; graphics processing units; Graphics
                 processing units; multiprocessing programs;
                 multiprocessor streaming; multitasking; Multitasking;
                 multitasking; nongraphic programs; resource allocation;
                 Resource management; resource under-utilization; SMK;
                 static sharing; Switches; Throughput",
  number-of-cited-references = "17",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Wang:2016:SMF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhang:2016:SIW,
  author =       "Chulian Zhang and Hamed Tabkhi and Gunar Schirner",
  title =        "Studying Inter-Warp Divergence Aware Execution on
                 {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "117--120",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2478778",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This letter quantitatively studies the benefits of
                 inter-warp divergence aware execution on GPUs. To that
                 end, the letter first proposes a novel approach to
                 quantify the inter-warp divergence by measuring the
                 temporal similarity in execution progress of concurrent
                 warps, which we call Warp Progression Similarity (WPS).
                 Based on the WPS metric, this letter proposes a
                 WPS-aware Scheduler (WPSaS) to optimize GPU throughput.
                 The aim is to manage inter-warp divergence to hide
                 memory access latency and minimize resource conflicts
                 and temporal under-utilization in compute units
                 allowing GPUs to achieve their peak throughput. Our
                 results demonstrate that WPSaS improves throughput by
                 10 percent with a pronounced reduction in resource
                 conflicts and temporal under-utilization.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhang, CL (Reprint Author), Northeastern Univ, Dept
                 Elect \& Comp Engn, Boston, MA 02115 USA. Zhang,
                 Chulian; Tabkhi, Hamed; Schirner, Gunar, Northeastern
                 Univ, Dept Elect \& Comp Engn, Boston, MA 02115 USA.",
  author-email = "zhang.chul@husky.neu.edu tabkhi@ece.neu.edu
                 schirner@ece.neu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [1319501]",
  funding-text = "This material is based upon work supported by the
                 National Science Foundation under Award No. 1319501.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; concurrent
                 warps; GPU scheduler; GPU throughput optimization;
                 Graphics processing units; graphics processing units;
                 Histograms; Inter-warp divergence; interwarp divergence
                 aware execution; interwarp divergence management;
                 Measurement; memory access latency hiding; Processor
                 scheduling; resource allocation; resource conflict
                 minimization; scheduling; temporal similarity
                 measurement; temporal underutilization; Throughput;
                 warp progression similarity; warp progression
                 similarity (WPS); WPS metric; WPS-aware scheduler;
                 WPSaS",
  number-of-cited-references = "8",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Zhang:2016:SIW",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Tavakkol:2016:TTB,
  author =       "Arash Tavakkol and Pooyan Mehrvarzy and Hamid
                 Sarbazi-Azad",
  title =        "{TBM}: Twin Block Management Policy to Enhance the
                 Utilization of Plane-Level Parallelism in {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "121--124",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2461162",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The internal architecture of a SSD provides channel-,
                 chip-, die- and plane-level parallelism levels, to
                 concurrently perform multiple data accesses and
                 compensate for the performance gap between a single
                 flash chip and host interface. Although a good striping
                 strategy can effectively exploit the first three
                 levels, parallel I/O accesses at plane-level can be
                 performed only for operations of the same types and
                 page addresses. In this work, we propose the Twin Block
                 Management (TBM) policy that symmetrically conducts
                 usage and recycling of the flash block addresses on the
                 planes of a die, thus enhancing the utilization of
                 plane-level parallelism for reads, writes and erases.
                 Evaluation results show that TBM improves IOPS and
                 response time by up to 73 and 42 percent,
                 respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tavakkol, A (Reprint Author), Sharif Univ Technol,
                 Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol,
                 Arash; Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept
                 Comp Engn, HPCAN Lab, Tehran, Iran. Mehrvarzy, Pooyan;
                 Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
                 Comp Sci, Tehran, Iran.",
  author-email = "tavakkol@ce.sharif.edu p.mehrvarzy@ipm.ir
                 azad@ipm.ir",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "channel-level parallelism level; chip-level
                 parallelism level; die-level parallelism level; flash
                 block; flash chip; flash memories; Flash memory;
                 garbage collection; host interface; IOPS; memory
                 architecture; multiple data accesses; parallel
                 processing; Parallel processing; performance
                 evaluation; plane-level parallelism; plane-level
                 parallelism level; Recycling; Resource management;
                 response time; Solid state circuits; solid-state drive;
                 SSD internal architecture; TBM; Time factors; twin
                 block management",
  number-of-cited-references = "11",
  ORCID-numbers = "Tavakkol, Arash/0000-0003-3859-1259",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Tavakkol:2016:TTB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jacob:2016:PPT,
  author =       "Bruce Jacob",
  title =        "The 2 {PetaFLOP}, 3 Petabyte, 9 {TB/s}, 90 {kW}
                 Cabinet: a System Architecture for Exascale and Big
                 Data",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "125--128",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2451652",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present a system architecture that uses
                 high-efficiency processors as opposed to
                 high-performance processors, NAND flash as
                 byte-addressable main memory, and high-speed DRAM as a
                 cache front-end for the flash. The main memory system
                 is interconnected and presents a unified global address
                 space to the client microprocessors. A single cabinet
                 contains 2,550 nodes, networked in a highly redundant
                 modified Moore graph that yields a bisection bandwidth
                 of 9.1 TB/s and a worst-case latency of four hops from
                 any node to any other. At a per-cabinet level, the
                 system supports a minimum of 2.6 petabytes of main
                 memory, dissipates 90 kW, and achieves 2.2 PetaFLOPS.
                 The system architecture provides several features
                 desirable in today's large-scale systems, including a
                 global shared physical address space (and optional
                 support for a global shared virtual space as well), the
                 ability to partition the physical space unequally among
                 clients as in a unified cache architecture (e.g., so as
                 to support multiple VMs in a datacenter), pairwise
                 system-wide sequential consistency on user-specified
                 address sets, built-in checkpointing via journaled
                 non-volatile main memory, memory cost-per-bit
                 approaching that of NAND flash, and memory performance
                 approaching that of pure DRAM.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jacob, B (Reprint Author), Univ Maryland, Elect \&
                 Comp Engn, College Pk, MD 20742 USA. Jacob, Bruce, Univ
                 Maryland, Elect \& Comp Engn, College Pk, MD 20742
                 USA.",
  author-email = "blj@umd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Big Data; Big data; Big Data; bisection
                 bandwidth; built-in checkpointing; byte-addressable
                 main memory; cache storage; checkpointing; DRAM chips;
                 exascale computing; extremely large; extremely large,
                 high radix network topologies; flash memories; High
                 performance computing; high-efficiency processors
                 high-performance processors; High-performance
                 computing; high-radix network topologies; high-speed
                 DRAM; journaled main memory; memory architecture;
                 Memory management; memory performance; microprocessor
                 chips; microprocessors; NAND flash; Network topology;
                 nonvolatile main memory; pairwise system-wide
                 sequential consistency; parallel architectures;
                 PetaFLOP; Ports (Computers); Program processors; Random
                 access memory; redundant modified Moore graph; system
                 architecture; user-specified address sets",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Jacob:2016:PPT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Xiao:2016:TAC,
  author =       "He Xiao and Wen Yueh and Saibal Mukhopadhyay and
                 Sudhakar Yalamanchili",
  title =        "Thermally Adaptive Cache Access Mechanisms for {3D}
                 Many-Core Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "129--132",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2495125",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A compelling confluence of technology and application
                 trends in which the cost, execution time, and energy of
                 applications are being dominated by the memory system
                 is driving the industry to 3D packages for future
                 microarchitectures. However, these packages result in
                 high heat fluxes and increased thermal coupling
                 challenging current thermal solutions. Conventional
                 design approaches utilize design margins that
                 correspond to worst case temperatures and process
                 corners leading to a significant impact on system level
                 performance. This paper advocates a design approach
                 based on microarchitecture adaptation to device-level
                 temperature-dependent delay variations to realize
                 average case performance that is superior to which can
                 be achieved by using worst case design margins. We
                 demonstrate this approach with adaptation principles
                 for the last level cache (LLC) in a 3D many-core
                 architecture. We propose and evaluate two adaptation
                 mechanisms. In the first case, the access time to the
                 LLC from the L1 tracks the LLC's temperature-delay
                 variations. In the second case, the processor DVFS
                 state tracks the LLC temperature as a negative
                 feedback. Compared to a worst case design baseline, the
                 full system simulation results show that both
                 approaches increase the IPC by over 20 percent, and
                 improve the energy efficiency by up to 3 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xiao, H (Reprint Author), Georgia Inst Technol, Sch
                 Elect \& Comp Engn, Atlanta, GA 30332 USA. Xiao, He;
                 Yueh, Wen; Mukhopadhyay, Saibal; Yalamanchili,
                 Sudhakar, Georgia Inst Technol, Sch Elect \& Comp Engn,
                 Atlanta, GA 30332 USA.",
  author-email = "hxiao@gatech.edu wyueh3@gatech.edu
                 saibal.mukhopadhyay@ece.gatech.edu
                 sudha.yalamanchili@ece.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Semiconductor Research Corporation under
                 SRC [2318.001]; National Science Foundation
                 [CNS-0855110]",
  funding-text = "This research is supported and sponsored by the
                 Semiconductor Research Corporation under SRC task
                 2318.001, and the National Science Foundation under
                 grant CNS-0855110.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D IC; 3D IC, SRAM cache, adaptive architecture,
                 performance gain, energy efficiency; 3D many-core
                 architectures; Adaptation models; adaptive
                 architecture; Cache memory; cache storage; Computer
                 architecture; device-level temperature-dependent delay
                 variations; energy efficiency; integrated circuit
                 design; Integrated circuit modeling; last level cache;
                 LLC temperature; memory architecture;
                 Microarchitecture; microarchitecture adaptation;
                 microarchitectures; multiprocessing systems;
                 performance evaluation; performance gain; power aware
                 computing; processor DVFS state; Random access memory;
                 SRAM cache; system level performance; thermal coupling
                 challenging current thermal solutions; thermally
                 adaptive cache access mechanisms; Three-dimensional
                 displays",
  number-of-cited-references = "13",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Xiao:2016:TAC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hu:2016:TDM,
  author =       "Qi Hu and Peng Liu and Michael C. Huang",
  title =        "Threads and Data Mapping: Affinity Analysis for
                 Traffic Reduction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "133--136",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2451172",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Modern processors spend significant amount of time and
                 energy moving data. With the increase in core count,
                 the relative importance of such latency and energy
                 expenditure will only increase with time. Inter-core
                 communication traffic when executing a multithreaded
                 application is one such source of latency and energy
                 expenditure. This traffic is influenced by the mapping
                 of threads and data onto multicore systems. This paper
                 investigates the impact of threads and data mapping on
                 traffic in a chip-multiprocessor, and exploits the
                 potential for traffic reduction through threads and
                 data mapping. Based on the analysis and estimation of
                 the lowest traffic, we propose a threads and data
                 mapping mechanism to approach the lowest traffic. The
                 mapping takes both the correlation among threads and
                 the affinity of data with individual threads into
                 account, and results in significant traffic reduction
                 and energy savings.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, P (Reprint Author), Zhejiang Univ, Coll Informat
                 Sci \& Elect Engn, Hangzhou 310027, Peoples R China.
                 Hu, Qi; Liu, Peng, Zhejiang Univ, Coll Informat Sci \&
                 Elect Engn, Hangzhou 310027, Peoples R China. Huang,
                 Michael C., Univ Rochester, Dept Elect \& Comp Engn,
                 601 Elmwood Ave, Rochester, NY 14627 USA.",
  author-email = "huqi\_isee@zju.edu.cn liupeng@zju.edu.cn
                 michael.huang@rochester.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EH9MM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSFC [61028004]; US National Science
                 Foundation (NSF) [1217662, 1255729]; Open Project
                 Program of the State Key Laboratory of Mathematical
                 Engineering and Advanced Computing [2014A08, 2015A09]",
  funding-text = "This work was supported by NSFC under grant 61028004,
                 and also in part by US National Science Foundation
                 (NSF) under grants 1217662 and 1255729, and the Open
                 Project Program of the State Key Laboratory of
                 Mathematical Engineering and Advanced Computing under
                 grants 2014A08 and 2015A09. P. Liu is the corresponding
                 author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "affinity analysis; chip-multiprocessor; Correlation;
                 data mapping; energy conservation; energy savings;
                 Instruction sets; intercore communication traffic;
                 Mapping; memory; Message systems; microprocessor chips;
                 modern processors; multi-threading; multicore;
                 Multicore processing; multicore systems;
                 multiprocessing systems; multithreaded application;
                 network-on-chip; Network-on-chip;
                 Statistical analysis; thread mapping; traffic; traffic
                 reduction",
  keywords-plus = "NETWORKS; CACHES; CHIP",
  number-of-cited-references = "11",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hu:2016:TDM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2016:TCb,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "C1--C1",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2628298",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:Ce,
  author =       "Anonymous",
  title =        "Cover",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "C2--C2",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2628299",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:Cf,
  author =       "Anonymous",
  title =        "Cover",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "C3--C3",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2628301",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2016:TCBa,
  author =       "Anonymous",
  title =        "Table of contents [back cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "2",
  pages =        "C4--C4",
  month =        jul # "\slash " # dec,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2628302",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Beckmann:2017:CCM,
  author =       "Nathan Beckmann and Daniel Sanchez",
  title =        "Cache Calculus: Modeling Caches through Differential
                 Equations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "1--5",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2512873",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Caches are critical to performance, yet their behavior
                 is hard to understand and model. In particular, prior
                 work does not provide closed-form solutions of cache
                 performance, i.e., simple expressions for the miss rate
                 of a specific access pattern. Existing cache models
                 instead use numerical methods that, unlike closed-form
                 solutions, are computationally expensive and yield
                 limited insight. We present cache calculus, a technique
                 that models cache behavior as a system of ordinary
                 differential equations, letting standard calculus
                 techniques find simple and accurate solutions of cache
                 performance for common access patterns.",
  acknowledgement = ack-nhfb,
  affiliation =  "Beckmann, N (Reprint Author), MIT CSAIL, Cambridge, MA
                 02139 USA. Beckmann, Nathan; Sanchez, Daniel, MIT
                 CSAIL, Cambridge, MA 02139 USA.",
  author-email = "beckmann@csail.mit.edu sanchez@csail.mit.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-1318384]; Qatar Computing Research
                 Institute",
  funding-text = "This work was supported in part by NSF grant
                 CCF-1318384 and a grant from the Qatar Computing
                 Research Institute.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arrays; cache behavior models; cache calculus; cache
                 memory; cache storage; closed-form solutions;
                 Closed-form solutions; Computational modeling;
                 Computer architecture; computer architecture;
                 differential equations; Differential equations;
                 mathematical model; Mathematical model; miss rate;
                 Numerical models; ordinary differential equations",
  number-of-cited-references = "8",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Beckmann:2017:CCM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2017:IIC,
  author =       "Anonymous",
  title =        "2016 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 15",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "1--6",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2653771",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zhan:2017:CCS,
  author =       "Xin Zhan and Reza Azimi and Svilen Kanev and David
                 Brooks and Sherief Reda",
  title =        "{CARB}: a {C}-State Power Management Arbiter for
                 Latency-Critical Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "6--9",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2537802",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Latency-critical workloads in datacenters have tight
                 response time requirements to meet service-level
                 agreements (SLAs). Sleep states (c-states) enable
                 servers to reduce their power consumption during idle
                 times; however entering and exiting c-states is not
                 instantaneous, leading to increased transaction
                 latency. In this paper we propose a c-state arbitration
                 technique, CARB, that minimizes response time, while
                 simultaneously realizing the power savings that could
                 be achieved from enabling c-states. CARB adapts to
                 incoming request rates and processing times and
                 activates the smallest number of cores for processing
                 the current load. CARB reshapes the distribution of
                 c-states and minimizes the latency cost of sleep by
                 avoiding going into deep sleeps too often. We quantify
                 the improvements from CARB with memcached running on an
                 8-core Haswell-based server.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhan, X (Reprint Author), Brown Univ, Providence, RI
                 02906 USA. Zhan, Xin; Azimi, Reza; Reda, Sherief, Brown
                 Univ, Providence, RI 02906 USA. Kanev, Svilen; Brooks,
                 David, Harvard Univ, Cambridge, MA 02138 USA.",
  author-email = "xin\_zhan@brown.edu reza\_azimi@brown.edu
                 skanev@eecs.harvard.edu dbrooks@eecs.harvard.edu
                 sherief\_reda@brown.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [1305148, 1438958]",
  funding-text = "The authors would like to thank the anonymous
                 reviewers for their comments. The research of X. Zhan,
                 R. Azimi, and S. Reda was supported by NSF under Grants
                 1305148 and 1438958.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "c-state; c-state arbitration technique; c-state
                 distribution; c-state power management arbiter; cache
                 storage; CARB; computer centres; contracts;
                 datacenters; Delays; energy-efficient; feedback
                 controller; Haswell-based server; idle times; latency
                 cost minimization; Latency-critical workloads;
                 latency-critical workloads; memcached; Monitoring;
                 Optimization; power aware computing; power consumption;
                 Power demand; power savings; processing times; request
                 rates; response time minimization; Servers;
                 service-level agreements; SLA; sleep states; Time
                 factors; workload consolidation",
  number-of-cited-references = "10",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Zhan:2017:CCS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jeon:2017:CCA,
  author =       "Dong-Ik Jeon and Ki-Seok Chung",
  title =        "{CasHMC}: a Cycle-Accurate Simulator for Hybrid Memory
                 Cube",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "10--13",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2600601",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "3D-stacked DRAM has been actively studied to overcome
                 the limits of conventional DRAM. The Hybrid Memory Cube
                 (HMC) is a type of 3D-stacked DRAM that has drawn great
                 attention because of its usability for server systems
                 and processing-in-memory (PIM) architecture. Since HMC
                 is not directly stacked on the processor die where the
                 central processing units (CPUs) and graphic processing
                 units (GPUs) are integrated, HMC has to be linked to
                 other processor components through high speed serial
                 links. Therefore, the communication bandwidth and
                 latency should be carefully estimated to evaluate the
                 performance of HMC. However, most existing HMC
                 simulators employ only simple HMC modeling. In this
                 paper, we propose a cycle-accurate simulator for hybrid
                 memory cube called CasHMC. It provides a cycle-by-cycle
                 simulation of every module in an HMC and generates
                 analysis results including a bandwidth graph and
                 statistical data. Furthermore, CasHMC is implemented in
                 C++ as a single wrapped object that includes an HMC
                 controller, communication links, and HMC memory.
                 Instantiating this single wrapped object facilitates
                 simultaneous simulation in parallel with other
                 simulators that generate memory access patterns such as
                 a processor simulator or a memory trace generator.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jeon, DI (Reprint Author), Hanyang Univ, Dept Elect \&
                 Comp Engn, Seoul 04763, South Korea. Jeon, Dong-Ik;
                 Chung, Ki-Seok, Hanyang Univ, Dept Elect \& Comp Engn,
                 Seoul 04763, South Korea.",
  author-email = "estwingz@naver.com kchung@hanyang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Institute for Information \& communications
                 Technology Promotion (IITP) --- Korea government (MSIP)
                 [R7119-16-1009]",
  funding-text = "This work was supported by Institute for Information
                 \& communications Technology Promotion (IITP) grant
                 funded by the Korea government (MSIP) (R7119-16-1009,
                 Development of Intelligent Semiconductor Core
                 Technologies for IoT Devices based on Harvest Energy).
                 Ki-Seok Chung is the corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D-stacked DRAM; Analytical models; Bandwidth;
                 bandwidth graph; Benchmark testing; CasHMC; central
                 processing units; communication bandwidth;
                 communication links; Computational modeling; Computer
                 architecture; CPU; cycle-accurate simulator;
                 cycle-by-cycle simulation; C++; DRAM
                 chips; GPU; graph theory; graphic processing units;
                 high-speed serial links; HMC controller; HMC memory;
                 HMC simulators; hybrid memory cube; latency; memory
                 access patterns; memory architecture; Memory control
                 and access; memory design; memory trace generator;
                 modeling of computer architecture; performance
                 evaluation; PIM architecture; processing-in-memory
                 architecture; processor simulator; Random access
                 memory; server systems; simulation; Simulation;
                 simulation; single-wrapped object instantiation;
                 statistical analysis; statistical data",
  number-of-cited-references = "10",
  ORCID-numbers = "CHUNG, KI-SEOK/0000-0002-2908-8443 Jeon,
                 Dong-Ik/0000-0002-8572-4184",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Jeon:2017:CCA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wu:2017:CSB,
  author =       "Hao Wu and Fangfei Liu and Ruby B. Lee",
  title =        "Cloud Server Benchmark Suite for Evaluating New
                 Hardware Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "14--17",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2597818",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Adding new hardware features to a cloud computing
                 server requires testing both the functionality and the
                 performance of the new hardware mechanisms. However,
                 commonly used cloud computing server workloads are not
                 well-represented by the SPEC integer and floating-point
                 benchmark and Parsec suites typically used by the
                 computer architecture community. Existing cloud
                 benchmark suites for scale-out or scale-up computing
                 are not representative of the most common cloud usage,
                 and are very difficult to run on a cycle-accurate
                 simulator that can accurately model new hardware, like
                 gem5. In this paper, we present PALMScloud, a suite of
                 cloud computing benchmarks for performance evaluation
                 of cloud servers, that is ready to run on the gem5
                 cycle-accurate simulator. We conduct a behavior
                 characterization and analysis of the benchmarks. We
                 hope that these cloud benchmarks, ready to run on a
                 dual-machine gem5 simulator or on real machines, can be
                 useful to other researchers interested in improving
                 hardware micro-architecture and cloud server
                 performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wu, H (Reprint Author), Princeton Univ, Princeton, NJ
                 08544 USA. Wu, Hao; Liu, Fangfei; Lee, Ruby B.,
                 Princeton Univ, Princeton, NJ 08544 USA.",
  author-email = "haow.princeton@gmail.com fangfeil@princeton.edu
                 rblee@princeton.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "DHS/AFRL [FA8750-12-2-0295]; National
                 Science Foundation [CNS-1218817]",
  funding-text = "This work was supported in part by DHS/AFRL
                 FA8750-12-2-0295 and US National Science Foundation
                 CNS-1218817.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "behavior characterization; Benchmark testing;
                 benchmarks; Cloud Computing; Cloud computing; cloud
                 computing; cloud computing benchmarks; cloud computing
                 server workloads; cloud server benchmark; cloud
                 servers; cloud usage; computer architecture; computer
                 architecture community; cycle accurate simulator; dual
                 machine gem5 simulator; floating-point benchmark; gem5;
                 Hardware; new hardware architectures; new hardware
                 mechanisms; Parsec; performance evaluation; Performance
                 evaluation; scale-out computing; scale-up computing;
                 simulation; SPEC integer",
  number-of-cited-references = "8",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Wu:2017:CSB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Seyedzadeh:2017:CBT,
  author =       "Seyed Mohammad Seyedzadeh and Alex K. Jones and Rami
                 Melhem",
  title =        "Counter-Based Tree Structure for Row Hammering
                 Mitigation in {DRAM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "18--21",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2614497",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Scaling down DRAM technology degrades cell reliability
                 due to increased coupling between adjacent DRAM cells,
                 commonly referred to as crosstalk. Moreover, high
                 access frequency of certain cells (hot cells) may cause
                 data loss in neighboring cells in adjacent rows due to
                 crosstalk, which is known as row hammering. In this
                 work, the goal is to mitigate row hammering in DRAM
                 cells through a Counter-Based Tree (CBT) approach. This
                 approach uses a tree of counters to detect hot rows and
                 then refreshes neighboring cells. In contrast to
                 existing deterministic solutions, CBT utilizes fewer
                 counters that makes it practically feasible to be
                 implemented on-chip. Compared to existing probabilistic
                 approaches, CBT more precisely refreshes rows
                 vulnerable to row hammering based on their access
                 frequency. Experimental results on workloads from three
                 benchmark suites show that CBT can reduce the refresh
                 energy by more than 60 percent and nearly 70 percent in
                 comparison to leading probabilistic and deterministic
                 approaches, respectively. Furthermore, hardware
                 evaluation shows that CBT can be easily implemented
                 on-chip with only a nominal overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seyedzadeh, SM (Reprint Author), Univ Pittsburgh, Dept
                 Comp Sci, Pittsburgh, PA 15260 USA. Seyedzadeh, Seyed
                 Mohammad; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci,
                 Pittsburgh, PA 15260 USA. Jones, Alex K., Univ
                 Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA
                 15260 USA.",
  author-email = "seyedzadeh@cs.pitt.edu akjones@pitt.edu
                 melhem@cs.pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-1064976]; SGMI grant from Samsung
                 electronics",
  funding-text = "This work is supported by NSF grants CCF-1064976 and
                 an SGMI grant from Samsung electronics. We thank the
                 anonymous reviewers for their feedback.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "CBT; Computer architecture; counter-based tree
                 structure; crosstalk; Crosstalk; crosstalk; DRAM; DRAM
                 chips; dynamic random-access memory; Microprocessors;
                 Radiation detectors; Random access memory; reliability;
                 Reliability; reliability; row hammering mitigation;
                 System-on-chip",
  keywords-plus = "REFRESH; MEMORY",
  number-of-cited-references = "17",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Seyedzadeh:2017:CBT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Naghibijouybari:2017:CCG,
  author =       "Hoda Naghibijouybari and Nael Abu-Ghazaleh",
  title =        "Covert Channels on {GPGPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "22--25",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2590549",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "GPUs are increasingly used to accelerate the
                 performance of not only graphics workloads, but also
                 data intensive applications. In this paper, we explore
                 the feasibility of covert channels in General Purpose
                 Graphics Processing Units (GPGPUs). We consider the
                 possibility of two colluding malicious applications
                 using the GPGPU as a covert channel to communicate, in
                 the absence of a direct channel between them. Such a
                 situation may arise in cloud environments, or in
                 environments employing containment mechanisms such as
                 dynamic information flow tracking. We reverse engineer
                 the block placement algorithm to understand
                 co-residency of blocks from different applications on
                 the same Streaming Multiprocessor (SM) core, or on
                 different SMs concurrently. In either mode, we identify
                 the shared resources that may be used to create
                 contention. We demonstrate the bandwidth of two example
                 channels: one that uses the L1 constant memory cache to
                 enable communication on the same SM, and another that
                 uses the L2 constant memory caches to enable
                 communication between different SMs. We also examine
                 the possibility of increasing the bandwidth of the
                 channel by using the available parallelism on the GPU,
                 achieving a bandwidth of over 400 Kbps. This study
                 demonstrates that GPGPUs are a feasible medium for
                 covert communication.",
  acknowledgement = ack-nhfb,
  affiliation =  "Naghibijouybari, H (Reprint Author), Univ Calif
                 Riverside, Dept Comp Sci \& Engn, Riverside, CA 92521
                 USA. Naghibijouybari, Hoda; Abu-Ghazaleh, Nael, Univ
                 Calif Riverside, Dept Comp Sci \& Engn, Riverside, CA
                 92521 USA.",
  author-email = "hnagh001@ucr.edu naelag@ucr.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [CNS-1422401]",
  funding-text = "This work is partially supported by US National
                 Science Foundation grant CNS-1422401.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; Bandwidth; block placement algorithm;
                 cache storage; Computer architecture; covert channel;
                 general purpose graphics processing units; GPGPU;
                 Graphics processing units; graphics processing units;
                 Kernel; L1 constant memory cache; L2 constant memory
                 caches; malicious applications; multiprocessing
                 systems; Security; security of data; SM core; streaming
                 multiprocessor core; Trojan horses",
  number-of-cited-references = "23",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Naghibijouybari:2017:CCG",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Song:2017:EPU,
  author =       "Wonjun Song and Hyung-Joon Jung and Jung Ho Ahn and
                 Jae W. Lee and John Kim",
  title =        "Evaluation of Performance Unfairness in {NUMA} System
                 Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "26--29",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2602876",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "NUMA (Non-uniform memory access) system architectures
                 are commonly used in high-performance computing and
                 datacenters. Within each architecture, a
                 processor-interconnect is used for communication
                 between the different sockets and examples of such
                 interconnect include Intel QPI and AMD HyperTransport.
                 In this work, we explore the impact of the
                 processor-interconnect on overall performance --- in
                 particular, we explore the impact on performance
                 fairness from the processor-interconnect arbitration.
                 It is well known that locally-fair arbitration does not
                 guarantee globally-fair bandwidth sharing as closer
                 nodes receive more bandwidth in a multi-hop network.
                 However, this paper is the first to demonstrate the
                 opposite can occur in commodity NUMA servers where
                 remote nodes receive higher bandwidth (and perform
                 better). This problem occurs because router
                 micro-architectures for processor-interconnects
                 commonly employ external concentration. While accessing
                 remote memory can occur in any NUMA system, performance
                 unfairness (or performance variation) is more critical
                 in cloud computing and virtual machines with shared
                 resources. We demonstrate how this unfairness creates
                 significant performance variation when executing
                 workload on the Xen virtualization platform. We then
                 provide analysis using synthetic workloads to better
                 understand the source of unfairness.",
  acknowledgement = ack-nhfb,
  affiliation =  "Song, W (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Daejeon, South Korea. Song, Wonjun; Jung,
                 Hyung-Joon; Kim, John, Korea Adv Inst Sci \& Technol,
                 Daejeon, South Korea. Ahn, Jung Ho; Lee, Jae W., Seoul
                 Natl Univ, Seoul, South Korea.",
  author-email = "iamwonjunsong@kaist.edu hans7taiji@kaist.edu
                 gajh@snu.ac.kr jaewlee@snu.ac.kr jjk12@kaist.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Google Faculty Research Award, National
                 Research Foundation of Korea [NRF-2013R1A2A2A01069132,
                 NRF-2014R1A2A1A11052936, NRF-2015M3C4A7065647]; MSIP
                 under the ITRC [IITP-2016-H8501-16-1005]",
  funding-text = "This work was supported in part by Google Faculty
                 Research Award, National Research Foundation of Korea
                 (NRF-2013R1A2A2A01069132, NRF-2014R1A2A1A11052936, and
                 NRF-2015M3C4A7065647), and in part by MSIP under the
                 ITRC (IITP-2016-H8501-16-1005).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AMD HyperTransport; Bandwidth; cloud computing;
                 globally-fair bandwidth sharing; high-performance
                 computing; Intel QPI; locally-fair arbitration; memory
                 architecture; Micromechanical devices; multihop
                 network; Multiprocessor interconnection; nonuniform
                 memory access system architectures; NUMA; NUMA system
                 architecture; parallel processing; performance
                 unfairness evaluation; processor-interconnect;
                 processor-interconnect arbitration;
                 processor-interconnects; router microarchitectures;
                 Servers; shared resources; Sockets; System-on-chip;
                 unfairness; virtual machines; Virtual machining; Xen
                 virtualization platform",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  researcherid-numbers = "Kim, John/C-1792-2011",
  times-cited =  "1",
  unique-id =    "Song:2017:EPU",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Verner:2017:EAL,
  author =       "Uri Verner and Avi Mendelson and Assaf Schuster",
  title =        "Extending {Amdahl's Law} for Multicores with Turbo
                 Boost",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "30--33",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2512982",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Rewriting sequential programs to make use of multiple
                 cores requires considerable effort. For many years,
                 Amdahl's law has served as a guideline to assess the
                 performance benefits of parallel programs over
                 sequential ones, but recent advances in multicore
                 design introduced variability in the performance of the
                 cores and motivated the reexamination of the underlying
                 model. This paper extends Amdahl's law for multicore
                 processors with built-in dynamic frequency scaling
                 mechanisms such as Intel's Turbo Boost. Using a model
                 that captures performance dependencies between cores,
                 we present tighter upper bounds for the speedup and
                 reduction in energy consumption of a parallel program
                 over a sequential one on a given multicore processor
                 and validate them on Haswell and Sandy Bridge Intel
                 CPUs. Previous studies have shown that from a processor
                 design perspective, Turbo Boost mitigates the speedup
                 limitations obtained under Amdahl's law by providing
                 higher performance for the same energy budget. However,
                 our new model and evaluation show that from a software
                 development perspective, Turbo Boost aggravates these
                 limitations by making parallelization of sequential
                 codes less profitable.",
  acknowledgement = ack-nhfb,
  affiliation =  "Verner, U (Reprint Author), Technion, Dept Comp Sci,
                 Haifa, Israel. Verner, Uri; Mendelson, Avi; Schuster,
                 Assaf, Technion, Dept Comp Sci, Haifa, Israel.",
  author-email = "uriv@cs.technion.ac.il avi.mendelson@cs.technion.ac.il
                 assaf@cs.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Amdahl law; Amdahl's law; Bridges; code
                 parallelization; Computational modeling; dynamic
                 frequency scaling mechanisms; energy consumption;
                 Energy consumption; energy consumption; Haswell;
                 multicore; multicore design; Multicore processing;
                 multicore processors; multiple cores; multiprocessing
                 systems; parallel programming; parallel programs;
                 Performance modeling; Power demand; Program processors;
                 Sandy Bridge Intel CPU; sequential code
                 parallelization; sequential program rewriting; software
                 development perspective; software engineering; Time
                 measurement; turbo boost; Turbo Boost; turbo boost",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Verner:2017:EAL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Sasaki:2017:HTP,
  author =       "Hiroshi Sasaki and Fang-Hsiang Su and Teruo Tanimoto
                 and Simha Sethumadhavan",
  title =        "Heavy Tails in Program Structure",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "34--37",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2574350",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Designing and optimizing computer systems require deep
                 understanding of the underlying system behavior.
                 Historically many important observations that led to
                 the development of essential hardware and software
                 optimizations were driven by empirical observations
                 about program behavior. In this paper, we report an
                 interesting property of program structures by viewing
                 dynamic program execution as a changing network. By
                 analyzing the communication network created as a result
                 of dynamic program execution, we find that
                 communication patterns follow heavy-tailed
                 distributions. In other words, a few instructions have
                 consumers that are orders of magnitude larger than most
                 instructions in a program. Surprisingly, these
                 heavy-tailed distributions follow the iconic power law
                 previously seen in man-made and natural networks. We
                 provide empirical measurements based on the SPEC
                 CPU2006 benchmarks to validate our findings as well as
                 perform semantic analysis of the source code to reveal
                 the causes of such behavior.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sasaki, H (Reprint Author), Columbia Univ, Dept Comp
                 Sci, New York, NY 10027 USA. Sasaki, Hiroshi; Su,
                 Fang-Hsiang; Sethumadhavan, Simha, Columbia Univ, Dept
                 Comp Sci, New York, NY 10027 USA. Tanimoto, Teruo,
                 Kyushu Univ, Grad Sch Informat Sci \& Elect Engn,
                 Fukuoka 8190395, Japan.",
  author-email = "sasaki@cs.columbia.edu mikefhsu@cs.columbia.edu
                 teruo.tanimoto@cpc.ait.kyushu-u.ac.jp
                 simha@cs.columbia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "JSPS Postdoctoral Fellowships for Research
                 Abroad; US National Science Foundation [1302269];
                 Alfred P. Sloan Fellowship",
  funding-text = "This work is sponsored in part by JSPS Postdoctoral
                 Fellowships for Research Abroad, US National Science
                 Foundation award number 1302269 and Alfred P. Sloan
                 Fellowship. This work was done while Teruo Tanimoto was
                 a visiting student at Columbia University.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Communication networks; computer
                 systems; Computers; dynamic program execution;
                 empirical studies; hardware optimization; heavy-tailed
                 distribution; Image edge detection; Optimization;
                 Program characterization; program diagnostics; program
                 structure; Registers; semantic analysis; Shape;
                 software optimization; SPEC CPU2006 benchmarks;
                 statistical distribution; statistical distributions;
                 system behavior",
  number-of-cited-references = "9",
  oa =           "Bronze",
  research-areas = "Computer Science",
  researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019",
  times-cited =  "1",
  unique-id =    "Sasaki:2017:HTP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Feng:2017:HHC,
  author =       "Liang Feng and Hao Liang and Sharad Sinha and Wei
                 Zhang",
  title =        "{HeteroSim}: a Heterogeneous {CPU--FPGA} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "38--41",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2615617",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Heterogeneous Computing is a promising direction to
                 address the challenges of performance and power walls
                 in high-performance computing, where CPU-FPGA
                 architectures are particularly promising for
                 application acceleration. However, the development of
                 such architectures associated with optimal memory
                 hierarchies is challenging due to the absence of an
                 integrated simulator to support full system simulation
                 and architectural exploration. In this work, we present
                 HeteroSim, a full system simulator supporting x86
                 multi-cores integrated with an FPGA via bus connection.
                 It can support fast architectural exploration with
                 respect to number of cores, number of accelerated
                 kernels on FPGA, and different memory hierarchies
                 between CPU and FPGA. Various performance metrics are
                 returned for further performance analysis and
                 architectural configuration optimization.",
  acknowledgement = ack-nhfb,
  affiliation =  "Feng, L (Reprint Author), Hong Kong Univ Sci \&
                 Technol, Kowloon, Hong Kong, Peoples R China. Feng,
                 Liang; Liang, Hao; Sinha, Sharad; Zhang, Wei, Hong Kong
                 Univ Sci \& Technol, Kowloon, Hong Kong, Peoples R
                 China.",
  author-email = "lfengad@connect.ust.hk hliangac@connect.ust.hk
                 sharad\_sinha@ieee.org wei.zhang@ust.hk",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; architectural configuration
                 optimization; bus connection; Computational modeling;
                 Computer architecture; CPU-FPGA architectures; digital
                 simulation; Field programmable gate arrays; field
                 programmable gate arrays; FPGA; full system simulator;
                 Hardware design languages; heterogeneous computing;
                 heterogeneous CPU-FPGA simulator; heterogeneous system;
                 HeteroSim; high-performance computing; Kernel;
                 microprocessor chips; multiprocessing systems; optimal
                 memory hierarchies; parallel architectures; performance
                 analysis; performance metrics; Registers; Simulator;
                 x86 multicores",
  number-of-cited-references = "11",
  ORCID-numbers = "SINHA, SHARAD/0000-0002-4532-2017",
  research-areas = "Computer Science",
  researcherid-numbers = "SINHA, SHARAD/J-6775-2019 SINHA,
                 SHARAD/R-2575-2017",
  times-cited =  "1",
  unique-id =    "Feng:2017:HHC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhao:2017:LIC,
  author =       "Xia Zhao and Yuxi Liu and Almutaz Adileh and Lieven
                 Eeckhout",
  title =        "{LA-LLC}: Inter-Core Locality-Aware Last-Level Cache
                 to Exploit Many-to-Many Traffic in {GPGPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "42--45",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2611663",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The reply network is a severe performance bottleneck
                 in General Purpose Graphic Processing Units (GPGPUs),
                 as the communication path from memory controllers (MC)
                 to cores is often congested. In this paper, we find
                 that instead of relying on the congested communication
                 path between MCs and cores, the unused core-to-core
                 communication path can be leveraged to transfer data
                 blocks between cores. We propose the inter-core
                 Locality-Aware Last-Level Cache (LA-LLC), which
                 requires only few bits per cache block and enables a
                 core to fetch shared data from another core's private
                 cache instead of the LLC. Leveraging inter-core
                 communication, LA-LLC transforms few-to-many traffic to
                 many-to-many traffic, thereby mitigating the reply
                 network bottleneck. For a set of applications
                 exhibiting varying degrees of inter-core locality,
                 LA-LLC reduces memory access latency and increases
                 performance by 21.1 percent on average and up to 68
                 percent, with negligible hardware cost.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhao, X (Reprint Author), Univ Ghent, Ghent, Belgium.
                 Zhao, Xia; Liu, Yuxi; Adileh, Almutaz; Eeckhout,
                 Lieven, Univ Ghent, Ghent, Belgium.",
  author-email = "xia.zhao@ugent.be yuxi.liu@ugent.be
                 almutaz.adileh@ugent.be lieven.eeckhout@ugent.be",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Benchmark testing; cache storage; congested
                 communication path; core-to-core communication path;
                 few-to-many traffic; general purpose graphic processing
                 units; GPGPU; GPGPUs; Graphics processing units;
                 graphics processing units; inter-core locality;
                 intercore communication; intercore locality-aware
                 last-level cache; LA-LLC; LLC; many-to-many traffic;
                 memory access latency; memory controllers;
                 Multiprocessor interconnection; network-on-chip; NoC;
                 Ports (Computers); private cache; reply network; shared
                 data fetching; System recovery",
  number-of-cited-references = "16",
  oa =           "Green Published",
  ORCID-numbers = "Zhao, Xia/0000-0001-6479-9200",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Zhao:2017:LIC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Boroumand:2017:LEC,
  author =       "Amirali Boroumand and Saugata Ghose and Minesh Patel
                 and Hasan Hassan and Brandon Lucia and Kevin Hsieh and
                 Krishna T. Malladi and Hongzhong Zheng and Onur Mutlu",
  title =        "{LazyPIM}: an Efficient Cache Coherence Mechanism for
                 Processing-in-Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "46--50",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2577557",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Processing-in-memory (PIM) architectures cannot use
                 traditional approaches to cache coherence due to the
                 high off-chip traffic consumed by coherence messages.
                 We propose LazyPIM, a new hardware cache coherence
                 mechanism designed specifically for PIM. LazyPIM uses a
                 combination of speculative cache coherence and
                 compressed coherence signatures to greatly reduce the
                 overhead of keeping PIM coherent with the processor. We
                 find that LazyPIM improves average performance across a
                 range of PIM applications by 49.1 percent over the best
                 prior approach, coming within 5.5 percent of an ideal
                 PIM mechanism.",
  acknowledgement = ack-nhfb,
  affiliation =  "Boroumand, A (Reprint Author), Carnegie Mellon Univ,
                 Pittsburgh, PA 15213 USA. Boroumand, Amirali; Ghose,
                 Saugata; Patel, Minesh; Hassan, Hasan; Lucia, Brandon;
                 Hsieh, Kevin; Mutlu, Onur, Carnegie Mellon Univ,
                 Pittsburgh, PA 15213 USA. Hassan, Hasan, TOBB ETU
                 Sogutozu, TR-06560 Ankara, Turkey. Malladi, Krishna T.;
                 Zheng, Hongzhong, Samsung Semicond Inc, Milpitas, CA
                 95035 USA. Mutlu, Onur, ETH, Ramistr, CH-8092 Zurich,
                 Switzerland.",
  author-email = "amirali@cmu.edu ghose@cmu.edu mineshp@andrew.cmu.edu
                 hhasan@etu.edu.tr blucia@andrew.cmu.edu
                 tsuwangh@andrew.cmu.edu k.tej@ssi.samsung.com
                 hz.zheng@ssi.samsung.com omutlu@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; cache coherence mechanism; cache storage;
                 Coherence; coherence messages; compressed coherence;
                 Computer architecture; Kernel; LazyPIM mechanism;
                 Message systems; PIM architecture;
                 processing-in-memory; Programming; Random access
                 memory; speculative cache coherence",
  keywords-plus = "CONSISTENCY",
  number-of-cited-references = "30",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "8",
  unique-id =    "Boroumand:2017:LEC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Gottscho:2017:MIM,
  author =       "Mark Gottscho and Mohammed Shoaib and Sriram Govindan
                 and Bikash Sharma and Di Wang and Puneet Gupta",
  title =        "Measuring the Impact of Memory Errors on Application
                 Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "51--55",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2599513",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory reliability is a key factor in the design of
                 warehouse-scale computers. Prior work has focused on
                 the performance overheads of memory fault-tolerance
                 schemes when errors do not occur at all, and when
                 detected but uncorrectable errors occur, which result
                 in machine downtime and loss of availability. We focus
                 on a common third scenario, namely, situations when
                 hard but correctable faults exist in memory; these may
                 cause an ``avalanche'' of errors to occur on affected
                 hardware. We expose how the hardware/software
                 mechanisms for managing and reporting memory errors can
                 cause severe performance degradation in systems
                 suffering from hardware faults. We inject faults in
                 DRAM on a real cloud server and quantify the
                 single-machine performance degradation for both batch
                 and interactive workloads. We observe that for SPEC
                 CPU2006 benchmarks, memory errors can slow down average
                 execution time by up to 2.5x. For an interactive
                 web-search workload, average query latency degrades by
                 up to 2.3x for a light traffic load, and up to an
                 extreme 3746x under peak load. Our analyses of the
                 memory error-reporting stack reveals architecture,
                 firmware, and software opportunities to improve
                 performance consistency by mitigating the worst-case
                 behavior on faulty hardware.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gottscho, M (Reprint Author), Univ Calif Los Angeles,
                 Dept Elect Engn, Los Angeles, CA 90095 USA. Gottscho,
                 Mark; Gupta, Puneet, Univ Calif Los Angeles, Dept Elect
                 Engn, Los Angeles, CA 90095 USA. Shoaib, Mohammed;
                 Wang, Di, Microsoft Res, Redmond, WA 98052 USA.
                 Govindan, Sriram; Sharma, Bikash, Microsoft, Redmond,
                 WA 98052 USA.",
  author-email = "mgottscho@ucla.edu shoaib@microsoft.com
                 srgovin@microsoft.com bsharma@microsoft.com
                 wangdi@microsoft.com puneet@ee.ucla.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF Variability Expedition Grant
                 [CCF-1029030]",
  funding-text = "This work was conducted jointly between Microsoft
                 Corporation and the NanoCAD Lab of the Electrical
                 Engineering Department at the University of California,
                 Los Angeles (UCLA). The authors thank Dr. Jie Liu of
                 Microsoft Research, and Dr. Badriddine Khessib and Dr.
                 Kushagra Vaid of Microsoft for supporting this work
                 while Mr. Gottscho was an intern at Microsoft Research
                 in 2015. Funding came partly from the NSF Variability
                 Expedition Grant No. CCF-1029030.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application performance; availability; cloud;
                 Degradation; DRAM; dynamic random-access storage;
                 error-handling; fault tolerant computing; Hardware;
                 hardware/software interface; hardware/software
                 mechanisms; Instruction sets; interactive web-search
                 workload; Main memory; memory errors; memory
                 fault-tolerance schemes; memory reliability;
                 performance consistency; Random access memory;
                 random-access storage; RAS; reliability; Reliability;
                 servers; Servers; servers; warehouse-scale computer
                 design",
  keywords-plus = "VARIABILITY; RELIABILITY; SYSTEMS",
  number-of-cited-references = "32",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Gottscho:2017:MIM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@article{Adileh:2017:MPH,
  author = {Almutaz Adileh and Stijn Eyerman and Aamer Jaleel and
    Lieven Eeckhout},
  title = {Mind The Power Holes: Sifting Operating Points in
    Power-Limited Heterogeneous Multicores},
  journal = j-IEEE-COMPUT-ARCHIT-LETT,
  volume = {16},
  number = {1},
  pages = {56--59},
  month = jan # "\slash " # jun,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1109/LCA.2016.2616339},
  ISSN = {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L = {1556-6056},
  bibdate = {Tue Jun 25 07:41:05 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract = {Heterogeneous chip multicore processors (HCMPs)
    equipped with multiple voltage-frequency (V-F)
    operating points provide a wide spectrum of
    power-performance tradeoff opportunities. This work
    targets the performance of HCMPs under a power cap. We
    show that for any performance optimization technique to
    work under power constraints, the default set of V-F
    operating points in HCMPs must be first filtered based
    on the application's power and performance
    characteristics. Attempting to find operating points of
    maximum performance by naively walking the default set
    of operating points leads the application to
    inefficient operating points which drain power without
    significant performance benefit. We call these points
    Power Holes (PH). Contrary to intuition, we show that
    even using a power-performance curve of Pareto-optimal
    operating points still degrades performance
    significantly for the same reason. We propose
    PH-Sifter, a fast and scalable technique that sifts the
    default set of operating points and eliminates power
    holes. We show significant performance improvement of
    PH-Sifter compared to Pareto sifting for three use
    cases: (i) maximizing performance for a single
    application, (ii) maximizing system throughput for
    multi-programmed workloads, and (iii) maximizing
    performance of a system in which a fraction of the
    power budget is reserved for a high-priority
    application. Our results show performance improvements
    of 13, 27, and 28 percent on average that reach up to
    52, 91 percent, and 2.3x, respectively, for the three
    use cases.},
  acknowledgement = ack-nhfb,
  affiliation = {Adileh, A (Reprint Author), Univ Ghent, B-9052 Ghent,
    East Flanders, Belgium. Adileh, Almutaz; Eeckhout,
    Lieven, Univ Ghent, B-9052 Ghent, East Flanders,
    Belgium. Eyerman, Stijn, Intel Belgium, B-2550 Leuven,
    Kontich, Belgium. Jaleel, Aamer, Nvidia Res, Boston, MA
    01886 USA.},
  author-email = {almutaz.adileh@ugent.be stijn.eyerman@elis.ugent.be
    ajaleel@nvidia.com lieven.eeckhout@elis.ugent.be},
  da = {2019-06-20},
  doc-delivery-number = {EY5PB},
  eissn = {1556-6064},
  fjournal = {IEEE Computer Architecture Letters},
  funding-acknowledgement = {European Research Council under the
    European Community's Seventh Framework Programme
    (FP7)/ERC grant [259295]},
  funding-text = {We thank the anonymous reviewers for their thoughtful
    feedback. This research is supported in part through
    the European Research Council under the European
    Community's Seventh Framework Programme
    (FP7/2007-2013)/ERC grant agreement no. 259295. This
    work was done while Stijn Eyerman was at Ghent
    University.},
  journal-iso = {IEEE Comput. Archit. Lett.},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords = {HCMP performance; heterogeneous chip multicore
    processors; Heterogeneous multicores; high-priority
    application; Indexes; Legged locomotion; Multicore
    processing; multiple voltage-frequency operating
    points; multiprocessing systems; multiprogramming;
    optimal operating points; Optimization; Pareto
    optimisation; Pareto-optimal operating points;
    performance evaluation; performance maximization;
    performance optimization; PH-Sifter; power aware
    computing; Power Holes; power management; power-limited
    processors; power-performance curve; power-performance
    tradeoff opportunities; Program processors; Schedules;
    system throughput maximization; Throughput; V-F
    operating points},
  keywords-plus = {PERFORMANCE; DVFS},
  number-of-cited-references = {9},
  research-areas = {Computer Science},
  times-cited = {1},
  unique-id = {Adileh:2017:MPH},
  web-of-science-categories = {Computer Science, Hardware \&
    Architecture},
}

@Article{Sasaki:2017:MPC,
  author =       "Hiroshi Sasaki and Alper Buyuktosunoglu and Augusto
                 Vega and Pradip Bose",
  title =        "Mitigating Power Contention: A Scheduling Based
                 Approach",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "60--63",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2572080",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Shared resource contention has been a major
                 performance issue for CMPs. In this paper, we tackle
                 the power contention problem in power constrained CMPs
                 by considering and treating power as a first-class
                 shared resource. Power contention occurs when multiple
                 processes compete for power, and leads to degraded
                 system performance. In order to solve this problem, we
                 develop a shared resource contention-aware scheduling
                 algorithm that mitigates the contention for power and
                 the shared memory subsystem at the same time. The
                 proposed scheduler improves system performance by
                 balancing the shared resource usage among scheduling
                 groups. Evaluation results across a variety of
                 multiprogrammed workloads show performance improvements
                 over a state-of-the-art scheduling policy which only
                 considers memory subsystem contention.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sasaki, H (Reprint Author), Columbia Univ, Dept Comp
                 Sci, New York, NY 10027 USA. Sasaki, Hiroshi, Columbia
                 Univ, Dept Comp Sci, New York, NY 10027 USA.
                 Buyuktosunoglu, Alper; Vega, Augusto; Bose, Pradip, IBM
                 TJ Watson Res Ctr, New York, NY 10598 USA.",
  author-email = "sasaki@cs.columbia.edu alperb@us.ibm.com
                 ajvega@us.ibm.com pbose@us.ibm.com",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "JSPS Postdoctoral Fellowships for Research
                 Abroad; Defense Advanced Research Projects Agency
                 (DARPA), Microsystems Technology Office (MTO)
                 [HR0011-13-C-0022]",
  funding-text = "This work is sponsored, in part, by JSPS Postdoctoral
                 Fellowships for Research Abroad, and Defense Advanced
                 Research Projects Agency (DARPA), Microsystems
                 Technology Office (MTO), under contract number
                 HR0011-13-C-0022. The views expressed are those of the
                 authors and do not reflect the official policy or
                 position of the Department of Defense or the U.S.
                 Government.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; chip multiprocessors;
                 energy-efficient systems; first-class shared resource;
                 Memory management; memory subsystem contention;
                 multi-core processors; multiprogrammed workloads;
                 performance evaluation; power aware computing; power
                 capping; power constrained CMP; Power contention; power
                 contention problem; Power demand; process scheduling;
                 processor scheduling; Processor scheduling; Random
                 access memory; resource allocation; Scheduling;
                 scheduling-based approach; shared memory systems;
                 shared resource contention-aware scheduling algorithm;
                 System performance",
  keywords-plus = "PERFORMANCE",
  number-of-cited-references = "15",
  oa =           "Bronze",
  research-areas = "Computer Science",
  researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019",
  times-cited =  "1",
  unique-id =    "Sasaki:2017:MPC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@article{Marquez:2017:MCH,
  author = {David Gonzalez Marquez and Adrian Cristal Kestelman
    and Esteban Mocskos},
  title = {{Mth}: Codesigned Hardware\slash Software Support for
    Fine Grain Threads},
  journal = j-IEEE-COMPUT-ARCHIT-LETT,
  volume = {16},
  number = {1},
  pages = {64--67},
  month = jan # "\slash " # jun,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1109/LCA.2016.2606383},
  ISSN = {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L = {1556-6056},
  bibdate = {Tue Jun 25 07:41:05 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
    https://www.math.utah.edu/pub/tex/bib/multithreading.bib},
  abstract = {Multi-core processors are ubiquitous in all market
    segments from embedded to high performance computing,
    but only few applications can efficiently utilize them.
    Existing parallel frameworks aim to support
    thread-level parallelism in applications, but the
    imposed overhead prevents their usage for small problem
    instances. This work presents Micro-threads (Mth) a
    hardware-software proposal focused on a shared thread
    management model enabling the use of parallel resources
    in applications that have small chunks of parallel code
    or small problem inputs by a combination of software
    and hardware: delegation of the resource control to the
    application, an improved mechanism to store and fill
    processor's context, and an efficient synchronization
    system. Four sample applications are used to test our
    proposal: HSL filter (trivially parallel), FFT Radix2
    (recursive algorithm), LU decomposition (barrier every
    cycle) and Dantzig algorithm (graph based, matrix
    manipulation). The results encourage the use of Mth and
    could smooth the use of multiple cores for applications
    that currently can not take advantage of the
    proliferation of the available parallel resources in
    each chip.},
  acknowledgement = ack-nhfb,
  affiliation = {Marquez, DG (Reprint Author), Univ Buenos Aires, Fac
    Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
    RA-1053 Buenos Aires, DF, Argentina. Marquez, David
    Gonzalez; Mocskos, Esteban, Univ Buenos Aires, Fac
    Ciencias Exactas \& Nat, Dept Comp Sci, C1428EGA,
    RA-1053 Buenos Aires, DF, Argentina. Mocskos, Esteban,
    CSC CONICET, C1425FQD, RA-2390 Buenos Aires, DF,
    Argentina. Kestelman, Adrian Cristal, CSIC, IIIA,
    Barcelona Supercomp Ctr, ES-08034 Barcelona, Spain.
    Kestelman, Adrian Cristal, Univ Politecn Cataluna, Dept
    Comp Architecture, ES-08034 Barcelona, Spain.},
  author-email = {dmarquez@dc.uba.ar adrian.cristal@bsc.es
    emocskos@dc.uba.ar},
  da = {2019-06-20},
  doc-delivery-number = {EY5PB},
  eissn = {1556-6064},
  fjournal = {IEEE Computer Architecture Letters},
  funding-acknowledgement = {Universidad de Buenos Aires [UBACyT
    20020130200096BA]; CONICET [PIP 11220110100379]},
  funding-text = {This work was partially funded by grants from
    Universidad de Buenos Aires (UBACyT 20020130200096BA)
    and CONICET (PIP 11220110100379). The authors thank
    specially Osman Unsal for reading this article with
    fruitful criticism.},
  journal-iso = {IEEE Comput. Archit. Lett.},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords = {barrier every cycle; codesigned hardware-software
    support; Dantzig algorithm; digital arithmetic;
    embedded processors; fast Fourier transforms; FFT
    Radix2 algorithm; fine grain threads; graph based
    algorithm; graph theory; hardware-software codesign;
    high performance computing; HSL filter; LU
    decomposition; matrix decomposition; matrix
    manipulation; Message systems; microthreads; Mirrors;
    Mth hardware/software support; multi-threading;
    multicore processing; multicore processors;
    multithreading; Parallel architectures; parallel
    architectures; Parallel architectures; parallel code;
    parallel frameworks; Parallel processing; parallel
    programming; parallel resources; Program processors;
    Proposals; recursive algorithm; Registers; resource
    control; shared memory systems; shared thread
    management model; Synchronization; synchronization
    system; thread-level parallelism support; trivially
    parallel filter},
  keywords-plus = {PARALLELISM},
  number-of-cited-references = {11},
  ORCID-numbers = {Mocskos, Esteban/0000-0002-6473-7672},
  research-areas = {Computer Science},
  times-cited = {0},
  unique-id = {Marquez:2017:MCH},
  web-of-science-categories = {Computer Science, Hardware \&
    Architecture},
}

@article{Morad:2017:ORO,
  author = {Tomer Y. Morad and Gil Shomron and Mattan Erez and
    Avinoam Kolodny and Uri C. Weiser},
  title = {Optimizing Read-Once Data Flow in Big-Data
    Applications},
  journal = j-IEEE-COMPUT-ARCHIT-LETT,
  volume = {16},
  number = {1},
  pages = {68--71},
  month = jan # "\slash " # jun,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1109/LCA.2016.2520927},
  ISSN = {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L = {1556-6056},
  bibdate = {Tue Jun 25 07:41:05 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract = {Memory hierarchies in modern computing systems work
    well for workloads that exhibit temporal data locality.
    Data that is accessed frequently is brought closer to
    the computing cores, allowing faster access times,
    higher bandwidth, and reduced transmission energy. Many
    applications that work on big data, however, read data
    only once. When running these applications on modern
    computing systems, data that is not reused is
    nevertheless transmitted and copied into all memory
    hierarchy levels, leading to energy and bandwidth
    waste. In this paper we evaluate workloads dealing with
    read-once data and measure their energy consumption. We
    then modify the workloads so that data that is known to
    be used only once is transferred directly from storage
    into the CPU's last level cache, effectively bypassing
    DRAM and avoiding keeping unnecessary copies of the
    data. Our measurements on a real system show savings of
    up to 5 Watts in server power and up to 3.9 percent
    reduction in server energy when 160 GB of read-once
    data bypasses DRAM.},
  acknowledgement = ack-nhfb,
  affiliation = {Morad, TY (Reprint Author), Cornell Tech, Jacobs
    Technion Cornell Inst, 111 8th Ave, New York, NY 10011
    USA. Morad, Tomer Y.; Shomron, Gil; Kolodny, Avinoam;
    Weiser, Uri C., Technion Israel Inst Technol, Dept
    Elect Engn, IL-32000 Haifa, Israel. Morad, Tomer Y.,
    Cornell Tech, Jacobs Technion Cornell Inst, 111 8th
    Ave, New York, NY 10011 USA. Erez, Mattan, Univ Texas
    Austin, Dept Elect \& Comp Engn, 201 E 24th St, C0803,
    POB 6-248, Austin, TX 78712 USA.},
  author-email = {tomerm@tx.technion.ac.il gilsho@tx.technion.ac.il
    mattan.erez@utexas.edu kolodny@ee.technion.ac.il
    uri.weiser@ee.technion.ac.il},
  da = {2019-06-20},
  doc-delivery-number = {EY5PB},
  eissn = {1556-6064},
  fjournal = {IEEE Computer Architecture Letters},
  funding-acknowledgement = {Intel Collaborative Research Institute for
    Computational Intelligence (ICRI-CI)},
  funding-text = {This research was supported by the Intel Collaborative
    Research Institute for Computational Intelligence
    (ICRI-CI).},
  journal-iso = {IEEE Comput. Archit. Lett.},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords = {Bandwidth; bandwidth wastage; Big Data; Big-Data
    applications; cache storage; computing cores; CPU
    last-level cache; data access time; data flow
    computing; DRAM; energy consumption measure; Energy
    efficiency; Energy measurement; energy wastage; memory
    architecture; memory hierarchy levels; Memory
    management; Performance evaluation; Prefetching; Random
    access memory; read-once data flow optimization;
    reduced transmission energy; server energy reduction;
    Servers; temporal data locality},
  number-of-cited-references = {9},
  research-areas = {Computer Science},
  times-cited = {0},
  unique-id = {Morad:2017:ORO},
  web-of-science-categories = {Computer Science, Hardware \&
    Architecture},
}

@Article{Yasoubi:2017:PEA,
  author =       "Ali Yasoubi and Reza Hojabr and Mehdi Modarressi",
  title =        "Power-Efficient Accelerator Design for Neural Networks
                 Using Computation Reuse",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "72--75",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2521654",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Applications of neural networks in various fields of
                 research and technology have expanded widely in recent
                 years. In particular, applications with inherent
                 tolerance to accuracy loss, such as signal processing
                 and multimedia applications, are highly suited to the
                 approximation property of neural networks. This
                 approximation property has been exploited in many
                 existing neural network accelerators to trade-off
                 accuracy for power-efficiency and speed. In addition to
                 the power saving obtained by approximation, we observed
                 that a considerable amount of arithmetic operations in
                 neural networks are repetitive and can be eliminated to
                 further decrease power consumption. Given this
                 observation, we propose CORN, COmputation Reuse-aware
                 Neural network accelerator that allows neurons to share
                 their computation results, effectively eliminating the
                 power usage of redundant computations. We will show
                 that CORN lowers power consumption by 26 percent on
                 average over low-power neural network accelerators.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yasoubi, A (Reprint Author), Univ Tehran, Dept Elect
                 \& Comp Engn, Coll Engn, Tehran, Iran. Yasoubi, Ali;
                 Hojabr, Reza; Modarressi, Mehdi, Univ Tehran, Dept
                 Elect \& Comp Engn, Coll Engn, Tehran, Iran.",
  author-email = "a.yosoubi@ut.ac.ir r.hojabr@ut.ac.ir
                 modarressi@ut.ac.ir",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "approximation; arithmetic operations; Biological
                 neural networks; Buffer storage; computation reuse;
                 computation reuse-aware neural network accelerator;
                 Computer architecture; CORN; energy conservation;
                 hardware accelerator; low-power neural network
                 accelerators; neural nets; Neural network; Neurons;
                 power aware computing; Power demand; power usage
                 elimination; power-efficiency; power-efficient
                 accelerator design; Redundancy; redundant
                 computations",
  keywords-plus = "RECOGNITION",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Yasoubi:2017:PEA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@article{Son:2017:SAS,
  author = {Young Hoon Son and Hyunyoon Cho and Yuhwan Ro and Jae
    W. Lee and Jung Ho Ahn},
  title = {{SALAD}: Achieving Symmetric Access Latency with
    Asymmetric {DRAM} Architecture},
  journal = j-IEEE-COMPUT-ARCHIT-LETT,
  volume = {16},
  number = {1},
  pages = {76--79},
  month = jan # "\slash " # jun,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1109/LCA.2016.2525760},
  ISSN = {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L = {1556-6056},
  bibdate = {Tue Jun 25 07:41:05 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract = {Memory access latency has significant impact on
    application performance. Unfortunately, the random
    access latency of DRAM has been scaling relatively
    slowly, and often directly affects the critical path of
    execution, especially for applications with
    insufficient locality or memory-level parallelism. The
    existing low-latency DRAM organizations either incur
    significant area overhead or burden the software stack
    with non-uniform access latency. This paper proposes
    SALAD, a new DRAM device architecture that provides
    symmetric access latency with asymmetric DRAM bank
    organizations. Since local banks have lower data
    transfer time due to their proximity to the I/O pads,
    SALAD applies high aspect-ratio (i.e., low-latency)
    mats only to remote banks to offset the difference in
    data transfer time, thus providing uniformly low access
    time (tAC) over the whole device. Our evaluation
    demonstrates that SALAD improves the IPC by 13 percent
    (10 percent) without any software modifications, while
    incurring only 6 percent (3 percent) area overhead.},
  acknowledgement = ack-nhfb,
  affiliation = {Son, YH (Reprint Author), Seoul Natl Univ, Seoul,
    South Korea. Son, Young Hoon; Cho, Hyunyoon; Ro,
    Yuhwan; Ahn, Jung Ho, Seoul Natl Univ, Seoul, South
    Korea. Lee, Jae W., Sungkyunkwan Univ, Seoul, South
    Korea.},
  author-email = {yhson96@snu.ac.kr sumk40@snu.ac.kr yuhwanro@snu.ac.kr
    jaewlee@skku.edu gajh@snu.ac.kr},
  da = {2019-06-20},
  doc-delivery-number = {EY5PB},
  eissn = {1556-6064},
  fjournal = {IEEE Computer Architecture Letters},
  funding-acknowledgement = {National Research Foundation of Korea -
    Korea government [NRF-2015M3C4A7065647]; ICT R\&D
    program of MSIP/IITP [KI001810041244]},
  funding-text = {This work was partially supported by the National
    Research Foundation of Korea grant funded by the Korea
    government (NRF-2015M3C4A7065647) and ICT R\&D program
    of MSIP/IITP (KI001810041244).},
  journal-iso = {IEEE Comput. Archit. Lett.},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords = {asymmetric bank organizations; asymmetric DRAM bank
    organizations; Data transfer; data transfer time; DRAM;
    DRAM chips; DRAM device architecture; I/O pads; memory
    architecture; Memory management; microarchitecture;
    Organizations; Parallel processing; Random access
    memory; SALAD; Software; symmetric access latency with
    asymmetric DRAM; uniformly low access time},
  number-of-cited-references = {20},
  research-areas = {Computer Science},
  times-cited = {1},
  unique-id = {Son:2017:SAS},
  web-of-science-categories = {Computer Science, Hardware \&
    Architecture},
}

@article{Judd:2017:SBS,
  author = {Patrick Judd and Jorge Albericio and Andreas
    Moshovos},
  title = {{Stripes}: Bit-Serial Deep Neural Network Computing},
  journal = j-IEEE-COMPUT-ARCHIT-LETT,
  volume = {16},
  number = {1},
  pages = {80--83},
  month = jan # "\slash " # jun,
  year = {2017},
  CODEN = {????},
  DOI = {https://doi.org/10.1109/LCA.2016.2597140},
  ISSN = {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L = {1556-6056},
  bibdate = {Tue Jun 25 07:41:05 2019},
  bibsource = {https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract = {The numerical representation precision required by the
    computations performed by Deep Neural Networks (DNNs)
    varies across networks and between layers of a same
    network. This observation motivates a precision-based
    approach to acceleration which takes into account both
    the computational structure and the required numerical
    precision representation. This work presents Stripes
    (STR), a hardware accelerator that uses bit-serial
    computations to improve energy efficiency and
    performance. Experimental measurements over a set of
    state-of-the-art DNNs for image classification show
    that STR improves performance over a state-of-the-art
    accelerator from 1.35x to 5.33x and by 2.24x on
    average. STR's area and power overhead are estimated at
    5 percent and 12 percent respectively. STR is 2.00x
    more energy efficient than the baseline.},
  acknowledgement = ack-nhfb,
  affiliation = {Judd, P (Reprint Author), Univ Toronto, Edward S
    Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S 3H7,
    Canada. Judd, Patrick; Albericio, Jorge; Moshovos,
    Andreas, Univ Toronto, Edward S Rogers Sr Dept Elect \&
    Comp Engn, Toronto, ON M5S 3H7, Canada.},
  author-email = {patrick.judd@mail.utoronto.ca jorge@ece.utoronto.ca
    moshovos@eecg.toronto.edu},
  da = {2019-06-20},
  doc-delivery-number = {EY5PB},
  eissn = {1556-6064},
  fjournal = {IEEE Computer Architecture Letters},
  journal-iso = {IEEE Comput. Archit. Lett.},
  journal-URL = {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords = {Artificial neural networks; bit-serial computations;
    bit-serial deep neural network computing; convolution;
    deep learning; deep neural networks; energy efficiency;
    Graphics processing units; Hardware acceleration; image
    classification; learning (artificial intelligence);
    neural nets; Neurons; Nickel; numerical representation;
    Parallel processing; precision-based approach; serial
    computing; STR; Stripes; Three-dimensional displays},
  number-of-cited-references = {19},
  research-areas = {Computer Science},
  times-cited = {5},
  unique-id = {Judd:2017:SBS},
  web-of-science-categories = {Computer Science, Hardware \&
    Architecture},
}

@Article{Ravi:2017:TSM,
  author =       "Gokul Subramanian Ravi and Mikko Lipasti",
  title =        "Timing Speculation in Multi-Cycle Data Paths",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "1",
  pages =        "84--87",
  month =        jan # "\slash " # jun,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2580501",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Modern processors set timing margins conservatively at
                 design time to support extreme variations in workload
                 and environment, in order to operate reliably and
                 produce expected outputs. Unfortunately, the
                 conservative guard bands set to achieve this
                 reliability are detrimental to processor performance
                 and energy efficiency. In this paper, we propose the
                 use of processors with internal transparent pipelines,
                 which allow data to flow between stages without
                 latching, to maximize timing speculation efficiency as
                 they are inherently suited to slack conservation. We
                 design a synchronous tracking mechanism which runs in
                 parallel with the multi-cycle data path to estimate the
                 accumulated slack across instructions/pipeline stages
                 and then appropriately clock synchronous boundaries
                 early to minimize wasted slack and achieve maximum
                 clock cycle savings. Preliminary evaluations atop the
                 CRIB processor show performance improvements of greater
                 than 10\% on average and as high as 30\% for an assumed
                 25\% slack per clock cycle.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ravi, GS (Reprint Author), Univ Wisconsin, Dept Elect
                 \& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA.
                 Ravi, Gokul Subramanian; Lipasti, Mikko, Univ
                 Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr,
                 Madison, WI 53706 USA.",
  author-email = "gravi@wisc.edu mikko@engr.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "EY5PB",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "clock cycle savings; clocks; Clocks; CRIB; CRIB
                 processor; internal transparent pipelines;
                 microprocessor chips; multi-cycle datapath; multicycle
                 data paths; parallel processing; parallel synchronous
                 tracking mechanism; pipeline processing; Pipelines;
                 Program processors; Proposals; Registers; Reliability;
                 slack; Timing; Timing speculation; timing speculation",
  number-of-cited-references = "8",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Ravi:2017:TSM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Khan:2017:CMC,
  author =       "Samira Khan and Chris Wilkerson and Donghyuk Lee and
                 Alaa R. Alameldeen and Onur Mutlu",
  title =        "A Case for Memory Content-Based Detection and
                 Mitigation of Data-Dependent Failures in {DRAM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "88--93",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2624298",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "DRAM cells in close proximity can fail depending on
                 the data content in neighboring cells. These failures
                 are called data-dependent failures. Detecting and
                 mitigating these failures online while the system is
                 running in the field enables optimizations that improve
                 reliability, latency, and energy efficiency of the
                 system. All these optimizations depend on accurately
                 detecting every possible data-dependent failure that
                 could occur with any content in DRAM. Unfortunately,
                 detecting all data-dependent failures requires the
                 knowledge of DRAM internals specific to each DRAM chip.
                 As internal DRAM architecture is not exposed to the
                 system, detecting data-dependent failures at the
                 system-level is a major challenge. Our goal in this
                 work is to decouple the detection and mitigation of
                 data-dependent failures from physical DRAM organization
                 such that it is possible to detect failures without
                 knowledge of DRAM internals. To this end, we propose
                 MEMCON, a memory content-based detection and mitigation
                 mechanism for data-dependent failures in DRAM. MEMCON
                 does not detect every possible data-dependent failure.
                 Instead, it detects and mitigates failures that occur
                 with the current content in memory while the programs
                 are running in the system. Using experimental data from
                 real machines, we demonstrate that MEMCON is an
                 effective and low-overhead system-level detection and
                 mitigation technique for data-dependent failures in
                 DRAM.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khan, S (Reprint Author), Univ Virginia,
                 Charlottesville, VA 22903 USA. Khan, Samira, Univ
                 Virginia, Charlottesville, VA 22903 USA. Wilkerson,
                 Chris; Alameldeen, Alaa R., Intel Labs, Santa Clara, CA
                 95054 USA. Lee, Donghyuk; Mutlu, Onur, Carnegie Mellon
                 Univ, Pittsburgh, PA 15213 USA. Mutlu, Onur, ETH,
                 CH-8092 Zurich, Switzerland.",
  author-email = "samirakhan@virginia.edu chris.wilkerson@intel.com
                 donghyu1@cmu.edu alaa.r.alameldeen@intel.com
                 onur@cmu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "ISTC-CC, an US National Science Foundation
                 [CCF-0953246]; US National Science Foundation
                 [CCF-1212962, CNS-1320531, CCF-1566483]",
  funding-text = "We thank anonymous reviewers and SAFARI group members
                 for feedback. We acknowledge the support of Google,
                 Intel, Nvidia, Seagate, and Samsung. This research was
                 supported in part by the ISTC-CC, an US National
                 Science Foundation CAREER Award (CCF-0953246), and US
                 National Science Foundation grants (CCF-1212962,
                 CNS-1320531, and CCF-1566483).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Content management; data content; data dependent
                 failure; data-dependent failures; DRAM; DRAM cells;
                 DRAM chips; DRAM internals; DRAM, data dependent
                 failure, system-level testing; failure analysis;
                 Failure analysis; integrated circuit reliability;
                  Interference; low-overhead system-level detection
                  technique; low-overhead system-level mitigation
                  technique; MEMCON; memory content-based detection;
                  memory content-based mitigation; neighboring cells;
                 optimisation; physical DRAM organization; System-level
                 design; system-level testing; Testing",
  keywords-plus = "NOISE",
  number-of-cited-references = "42",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Khan:2017:CMC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Mittal:2017:ARD,
  author =       "Sparsh Mittal and Jeffrey S. Vetter and Lei Jiang",
  title =        "Addressing Read-Disturbance Issue in {STT--RAM} by
                 Data Compression and Selective Duplication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "94--98",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2645207",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In deep sub-micron region, spin transfer torque RAM
                 (STT-RAM) shows read-disturbance error (RDE) which
                 presents a crucial reliability challenge. We present
                 SHIELD, a technique to mitigate RDE in STT-RAM last
                 level caches (LLCs). SHIELD uses data compression to
                 reduce cache-write traffic and restore requirement.
                 Also, SHIELD keeps two copies of data blocks compressed
                 to less than half the block size and since several LLC
                 blocks are only accessed once, this approach avoids
                 several restore operations. SHIELD consumes smaller
                 energy than two previous RDE-mitigation techniques,
                 namely high-current restore required read (HCRR, also
                 called restore-after-read) and low-current long latency
                 read (LCLL) and even an ideal RDE-free STT-RAM cache.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mittal, S (Reprint Author), IIT Hyderabad, Sangareddy
                 502285, Telangana, India. Mittal, Sparsh, IIT
                 Hyderabad, Sangareddy 502285, Telangana, India. Vetter,
                 Jeffrey S., Oak Ridge Natl Lab, Oak Ridge, TN 37830
                 USA. Jiang, Lei, Indiana Univ, Bloomington, IN 47405
                 USA.",
  author-email = "sparsh0mittal@gmail.com vetter@ornl.gov
                 jiang60@iu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "U.S. Department of Energy, Office of
                 Science, Advanced Scientific Computing Research",
  funding-text = "Support for this work was provided by the U.S.
                 Department of Energy, Office of Science, Advanced
                 Scientific Computing Research.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; data blocks; data compression; Data
                 compression; data compression; deep sub-micron region;
                 duplication; Encoding; Error analysis; Error correction
                 codes; HCRR; ideal RDE-free STT-RAM cache; integrated
                 circuit reliability; last level cache; last level
                 caches; LCLL; LLC; low-current long latency read;
                 Magnetic tunneling; Non-volatile memory; Nonvolatile
                 memory; Random access memory; random-access storage;
                 read disturbance error; read-disturbance error;
                 restore-after-read; selective duplication; SHIELD; spin
                 transfer torque RAM; STT-RAM; transfer torque RAM",
  number-of-cited-references = "14",
  ORCID-numbers = "Mittal, Sparsh/0000-0002-2908-993X",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Mittal:2017:ARD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Bakhshalipour:2017:ETD,
  author =       "Mohammad Bakhshalipour and Pejman Lotfi-Kamran and
                 Hamid Sarbazi-Azad",
  title =        "An Efficient Temporal Data Prefetcher for {L1}
                 Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "99--102",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2654347",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Server workloads frequently encounter L1-D cache
                 misses, and hence, lose significant performance
                 potential. One way to reduce the number of L1-D misses
                 or their effect is data prefetching. As L1-D access
                 sequences have high temporal correlations, temporal
                 prefetching techniques are promising for L1 caches.
                 State-of-the-art temporal prefetching techniques are
                 effective at reducing the number of L1-D misses, but we
                 observe that there is a significant gap between what
                 they offer and the opportunity. This work aims to
                 improve the effectiveness of temporal prefetching
                 techniques. To overcome the deficiencies of existing
                 temporal prefetchers, we introduce Domino prefetching.
                 Domino prefetcher is a temporal prefetching technique
                 that looks up the history to find the last occurrence
                 of the last one or two L1-D miss addresses for
                 prefetching. We show that Domino prefetcher captures
                 more than 87 percent of the temporal opportunity at
                 L1-D. Through evaluation of a 16-core processor on a
                 set of server workloads, we show that Domino prefetcher
                 improves system performance by 26 percent (up to 56
                 percent).",
  acknowledgement = ack-nhfb,
  affiliation =  "Bakhshalipour, M (Reprint Author), Sharif Univ
                 Technol, Dept Comp Engn, Tehran 1458889694, Iran.
                 Bakhshalipour, Mohammad; Sarbazi-Azad, Hamid, Sharif
                 Univ Technol, Dept Comp Engn, Tehran 1458889694, Iran.
                 Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
                 Comp Sci, Tehran 1956836681, Iran.",
  author-email = "bakhshalipour@ce.sharif.edu plotfi@ipm.ir
                 azad@ipm.ir",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; Correlation; data prefetching; Domino
                 prefetcher captures; efficient temporal data
                 prefetcher; high temporal correlations; L1-D access
                 sequences; L1-D cache misses; L1-D miss addresses; L1-D
                 misses; multiprocessing systems; Prefetching; Server
                 workloads; Servers; storage management; Streaming
                 media; temporal correlation; temporal opportunity;
                 temporal prefetching technique; Web search",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Bakhshalipour:2017:ETD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Martinez:2017:SII,
  author =       "Jorge A. Mart{\'\i}nez and Juan Antonio Maestro and
                 Pedro Reviriego",
  title =        "A Scheme to Improve the Intrinsic Error Detection of
                 the Instruction Set Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "103--106",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2623628",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The Instruction Set Architecture (ISA) determines the
                 effect that a soft error on an instruction can have on
                 the processor. Previous works have shown that the ISA
                 has some intrinsic capability of detecting errors. For
                 example, errors that change a valid instruction into an
                 invalid instruction encoding or into an instruction
                 that causes an exception. The percentage of detectable
                 errors varies widely for each bit in the ISA. For
                 example, errors on bits that are used for immediate or
                 register values are unlikely to be detected while those
                 that are used for the opcode are more likely to lead to
                 an exception. In this paper, this is exploited by
                 introducing a simple encoding of the instructions that
                 does not require additional bits. The idea is that the
                 decoding propagates the error so that it affects the
                 most sensitive bit of the ISA and therefore it is more
                 likely to be detected. As no additional bits are
                 required, no changes or overheads are needed in the
                 memory. The proposed scheme is useful when the memory
                 is not protected with parity or Error Correction Codes.
                  The only cost of implementing the technique is simple
                 encoder and decoder circuits that are similar to a
                 parity computation. This technique is applicable to any
                 ISA, no matter the length of the opcodes or their
                 location in the instruction encoding. The effectiveness
                 of the proposed scheme has been evaluated on the ARM
                 Cortex M0 ISA resulting in an increase in the error
                 detection capability of up to 1.64x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Martinez, JA (Reprint Author), Univ Antonio Nebrija, C
                 Pirineos 55, Madrid 28040, Spain. Martinez, Jorge A.;
                 Antonio Maestro, Juan; Reviriego, Pedro, Univ Antonio
                 Nebrija, C Pirineos 55, Madrid 28040, Spain.",
  author-email = "jmartine@nebrija.es jmaestro@nebrija.es
                 previrie@nebrija.es",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ARM Cortex M0 ISA; Circuit faults; Computer
                 architecture; decoder circuits; detectable errors;
                 detecting errors; Encoding; Error analysis; error
                 correction codes; Error Correction Codes; error
                 detection; error detection capability; instruction set
                 architecture; Instruction sets; instruction sets;
                 intrinsic capability; intrinsic error detection;
                 invalid instruction encoding; microprocessor chips;
                 simple encoder; simple encoding; Soft error; soft
                 error; Soft error",
  number-of-cited-references = "10",
  ORCID-numbers = "Maestro, Juan Antonio/0000-0001-7133-9026",
  research-areas = "Computer Science",
  researcherid-numbers = "Maestro, Juan Antonio/L-6091-2014",
  times-cited =  "3",
  unique-id =    "Martinez:2017:SII",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wang:2017:DAS,
  author =       "Rujia Wang and Sparsh Mittal and Youtao Zhang and Jun
                 Yang",
  title =        "{Decongest}: Accelerating Super-Dense {PCM} Under
                 Write Disturbance by Hot Page Remapping",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "107--110",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2675883",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "At small feature sizes, phase change memory (PCM)
                 shows write disturbance (WD) error (WDE) and this issue
                 can eclipse the density and energy efficiency advantage
                 of PCM. We propose `Decongest', a technique to address
                 WD errors in main memory designed with super-dense
                 (4F(2) cell size) PCM. Decongest works by identifying
                 and remapping write-intensive hot pages to a WD-free
                 spare area, which avoids WD to nearby pages due to
                 writing these hot pages, and WD to these hot pages from
                 writing nearby pages. Compared to a WD-affected
                 super-dense PCM baseline, Decongest improves the
                 performance by 14.0 percent, and saves 21.8 percent
                 energy.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, RJ (Reprint Author), Univ Pittsburgh,
                 Pittsburgh, PA 15260 USA. Wang, Rujia; Zhang, Youtao;
                 Yang, Jun, Univ Pittsburgh, Pittsburgh, PA 15260 USA.
                 Mittal, Sparsh, IIT Hyderabad, Kandi 502285, Telangana,
                 India.",
  author-email = "rujia.w@pitt.edu sparsh0mittal@gmail.com
                 youtao@pitt.edu juy9@pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US NSF CCF [1617071]; IIT, Hyderabad,
                 India",
  funding-text = "This work is partially supported by US NSF
                 CCF\#1617071 and a seed-grant from IIT, Hyderabad,
                 India.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Diseases; Energy management;
                 energy saving; main memory; Microprocessors; page
                 remapping; Phase change materials; Phase change memory;
                 Radiation detectors; reliability; write disturbance",
  number-of-cited-references = "13",
  ORCID-numbers = "Mittal, Sparsh/0000-0002-2908-993X",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Wang:2017:DAS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Tanimoto:2017:EDG,
  author =       "Teruo Tanimoto and Takatsugu Ono and Koji Inoue and
                 Hiroshi Sasaki",
  title =        "Enhanced Dependence Graph Model for Critical Path
                 Analysis on Modern Out-of-Order Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "111--114",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2684813",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The dependence graph model of out-of-order (OoO)
                 instruction execution is a powerful representation used
                  for the critical path analysis. However, most, if not
                 all, of the previous models are out-of-date and lack
                 enough detail to model modern OoO processors, or are
                  too specific and complicated, which limit their
                 generality and applicability. In this paper, we propose
                 an enhanced dependence graph model which remains simple
                 but greatly improves the accuracy over prior models.
                 The evaluation results using the gem5 simulator show
                 that the proposed enhanced model achieves CPI error of
                 2.1 percent which is a 90.3 percent improvement against
                 the state-of-the-art model.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tanimoto, T (Reprint Author), Kyushu Univ, Fukuoka
                 8190395, Japan. Tanimoto, Teruo; Ono, Takatsugu; Inoue,
                 Koji, Kyushu Univ, Fukuoka 8190395, Japan. Sasaki,
                 Hiroshi, Columbia Univ, New York, NY 10027 USA.",
  author-email = "teruo.tanimoto@cpc.ait.kyushu-u.ac.jp
                 takatsugu.ono@cpc.ait.kyushu-u.ac.jp
                 inoue@ait.kyushu-u.ac.jp sasaki@cs.columbia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "CREST, JST",
  funding-text = "This work was supported in part by CREST, JST. We
                 would like to express our thanks to RIIT of Kyushu
                 University for providing us the resource to conduct the
                 experiments in this paper.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; Benchmark testing; computer
                 architecture; critical path analysis; Delays;
                 Dependence graph model; enhanced dependence graph
                 model; graph theory; Hidden Markov models;
                 Microarchitecture; modern OoO processors; out-of-order
                 instruction execution; out-of-order processors;
                 parallel architectures; Path planning; pipeline
                 processing; Program processors",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  researcherid-numbers = "Sasaki, Hiroshi/N-8579-2019",
  times-cited =  "0",
  unique-id =    "Tanimoto:2017:EDG",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lee:2017:FFE,
  author =       "Junghee Lee and Kalidas Ganesh and Hyuk-Jun Lee and
                 Youngjae Kim",
  title =        "{FESSD}: a Fast Encrypted {SSD} Employing On-Chip
                 Access-Control Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "115--118",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2667639",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cryptography is one of the most popular methods for
                 protecting data stored in storage devices such as
                 solid-state drives (SSDs). To maintain integrity of
                 data, one of the popular techniques is that all
                 incoming data are encrypted before they are stored,
                 however, in this technique, the encryption overhead is
                 non-negligible and it can increase I/O service time. In
                 order to mitigate the negative performance impact
                 caused by the data encryption, a write buffer can be
                 used to hide the long latency by encryption. Using the
                 write buffer, incoming unencrypted data can be
                 immediately returned as soon as they are written in the
                 buffer. They will get encrypted and synchronized with
                 flash memory. However, if the write buffer itself is
                 not encrypted, unencrypted secret data might leak
                 through this insecure write buffer. On the other hand,
                 if the entire write buffer is fully encrypted, it
                 incurs significant performance overhead. To address
                  this problem, we propose an on-chip access control
                  memory (ACM) and present a fast encrypted SSD, called
                  FESSD, that implements a secure write buffering
                 mechanism using the ACM. The ACM does not require a
                 memory-level full encryption mechanism, thus not only
                 solving the unencrypted data leaking problem, but also
                 offering relatively fast I/O service. Our simulation
                 results show that the I/O response time of FESSD can be
                 improved by up to 56 percent over a baseline where
                 encrypted data are stored in the normal write buffer.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lee, J (Reprint Author), Univ Texas San Antonio, San
                 Antonio, TX 78249 USA. Lee, Junghee; Ganesh, Kalidas,
                 Univ Texas San Antonio, San Antonio, TX 78249 USA. Lee,
                 Hyuk-Jun; Kim, Youngjae, Sogang Univ, Seoul 121742,
                 South Korea.",
  author-email = "junghee.lee@my.utsa.edu dyk567@my.utsa.edu
                 hyukjunl@sogang.ac.kr youkim@sogang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Research Foundation of Korea (NRF)
                 --- Korea Government (MISP) [2015R1C1A1A0152105]",
  funding-text = "This work was supported by the National Research
                 Foundation of Korea (NRF) grant funded by the Korea
                 Government (MISP) (No. 2015R1C1A1A0152105). This
                 research also used resources of The University of Texas
                 at San Antonio, San Antonio, TX. Youngjae Kim is the
                 corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ACM; authorisation; cryptography; data encryption;
                 encrypted data; encryption; Encryption; encryption;
                  encryption overhead; fast encrypted SSD; FESSD; flash
                 memories; flash memory; Hardware; negative performance
                 impact; Nonvolatile memory; normal write buffer;
                 on-chip access control memory; on-chip access-control
                 memory; on-chip memory; Registers; security;
                 Solid-state drive (SSD); solid-state drives; storage
                 devices; storage management; System-on-chip;
                 unencrypted data leaking problem; unencrypted secret
                 data",
  keywords-plus = "SECURITY",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Lee:2017:FFE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Badawy:2017:GLO,
  author =       "Abdel-Hameed A. Badawy and Donald Yeung",
  title =        "Guiding Locality Optimizations for Graph Computations
                 via Reuse Distance Analysis",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "119--122",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2695178",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This work addresses the problem of optimizing
                 graph-based programs for multicore processors. We use
                 three graph benchmarks and three input data sets to
                 characterize the importance of properly partitioning
                 graphs among cores at multiple levels of the cache
                 hierarchy. We also exhaustively explore a large design
                 space comprised of different parallelization schemes
                 and graph partitionings via detailed simulation to show
                 how much gain we can obtain over a baseline legacy
                 scheme that partitions for the L1 cache only. Our
                 results demonstrate the legacy approach is not the best
                 choice, and that our proposed parallelization /
                 locality techniques can perform better (by up to 20
                 percent). We then use a performance prediction model
                 based on multicore reuse distance (RD) profiles to rank
                 order the different parallelization / locality schemes
                 in the design space. We compare the best configuration
                 as predicted by our model against the actual best
                 identified by our exhaustive simulations. For one
                 benchmark and data input, we show our model can achieve
                 79.5 percent of the performance gain achieved by the
                 actual best. Across all benchmarks and data inputs, our
                 model achieves 48 percent of the maximum performance
                 gain. Our work demonstrates a new use case for
                 multicore RD profiles --- i.e., as a tool for helping
                 program developers and compilers to optimize
                 graph-based programs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Badawy, AHA (Reprint Author), New Mexico State Univ,
                 Klipsch Sch Elect \& Comp Engn, Las Cruces, NM 88003
                 USA. Badawy, Abdel-Hameed A., New Mexico State Univ,
                 Klipsch Sch Elect \& Comp Engn, Las Cruces, NM 88003
                 USA. Yeung, Donald, Univ Maryland, Dept Elect \& Comp
                 Engn, College Pk, MD 20742 USA.",
  author-email = "badawy@nmsu.edu yeung@umd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "baseline legacy scheme; Benchmark testing; cache
                 hierarchy; cache storage; Computational modeling; graph
                 benchmarks; graph computations; graph partitionings;
                 graph theory; legacy approach; locality optimization;
                 memory system; Multicore processing; multicore
                 processors; multicore RD profiles; multicore reuse
                 distance profiles; multiprocessing systems;
                 Optimization; partitioning; performance prediction
                 model; prediction; Predictive models; profiling;
                 program developers; Program processors; reuse distance;
                 reuse distance analysis; Runtime",
  keywords-plus = "BIOMOLECULAR SIMULATION",
  number-of-cited-references = "11",
  ORCID-numbers = "Badawy, Abdel-Hameed/0000-0001-8027-1449",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Badawy:2017:GLO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zha:2017:IFM,
  author =       "Yue Zha and Jing Li",
  title =        "{IMEC}: a Fully Morphable In-Memory Computing Fabric
                 Enabled by Resistive Crossbar",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "123--126",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2672558",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "In this paper, we propose a fully morphable In-MEmory
                 Computing (IMEC) fabric to better implement the concept
                 of processing inside memory (PIM). Enabled by emerging
                 nonvolatile memory, i.e., RRAM and its monolithic 3D
                 integration, IMEC can be configured into one or a
                 combination of four distinct functions, (1) logic, (2)
                 ternary content addressable memory, (3) memory, and (4)
                 interconnect. Thus, IMEC exploits a continuum of PIM
                 capabilities across the whole spectrum, ranging from 0
                 percent (pure data storage) to 100 percent (pure
                 compute engine), or intermediate states in between.
                 IMEC can be modularly integrated into the DDRx memory
                 subsystem, communicating with processors by the
                 ordinary DRAM commands. Additionally, to reduce the
                 programming burden, we provide a complete framework to
                 compile applications written in high-level programming
                 language (e.g., OpenCL) onto IMEC. This framework also
                 enables code portability across different platforms for
                 heterogeneous computing. By using this framework,
                 several benchmarks are mapped onto IMEC for evaluating
                 its performance, energy and resource utilization. The
                 simulation results show that, IMEC reduces the energy
                 consumption by 99.6 percent, and achieves 644x speedup,
                 compared to a baseline CPU system. We further compare
                 IMEC with FPGA architecture, and demonstrate that the
                 performance improvement is not simply obtained by
                 replacing SRAM cells with denser RRAM cells.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zha, Y (Reprint Author), Univ Wisconsin, Elect \& Comp
                 Engn Dept, Madison, WI 53706 USA. Zha, Yue; Li, Jing,
                 Univ Wisconsin, Elect \& Comp Engn Dept, Madison, WI
                 53706 USA.",
  author-email = "yzha3@wisc.edu jli587@wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Decoding; Energy efficiency; energy-efficiency
                 computing; Field programmable gate arrays; Non-volatile
                 memory; Nonvolatile memory; processing-in-memory;
                 Program processors; TCAM",
  keywords-plus = "ARCHITECTURE",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Zha:2017:IFM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Chen:2017:IGP,
  author =       "Li-Jhan Chen and Hsiang-Yun Cheng and Po-Han Wang and
                 Chia-Lin Yang",
  title =        "Improving {GPGPU} Performance via Cache Locality Aware
                 Thread Block Scheduling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "127--131",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2693371",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Modern GPGPUs support the concurrent execution of
                 thousands of threads to provide an energy-efficient
                 platform. However, the massive multi-threading of
                 GPGPUs incurs serious cache contention, as the cache
                 lines brought by one thread can easily be evicted by
                 other threads in the small shared cache. In this paper,
                 we propose a software-hardware cooperative approach
                 that exploits the spatial locality among different
                 thread blocks to better utilize the precious cache
                 capacity. Through dynamic locality estimation and
                 thread block scheduling, we can capture more
                 performance improvement opportunities than prior work
                 that only explores the spatial locality between
                 consecutive thread blocks. Evaluations across diverse
                 GPGPU applications show that, on average, our
                 locality-aware scheduler provides 25 and 9 percent
                 performance improvement over the commonly-employed
                 round-robin scheduler and the state-of-the-art
                 scheduler, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, LJ (Reprint Author), Natl Taiwan Univ, Taipei
                 10617, Taiwan. Chen, Li-Jhan; Wang, Po-Han; Yang,
                 Chia-Lin, Natl Taiwan Univ, Taipei 10617, Taiwan.
                 Cheng, Hsiang-Yun, Acad Sinica, Taipei 11529, Taiwan.",
  author-email = "r03922026@csie.ntu.edu.tw hycheng@citi.sinica.edu.tw
                 f96922002@csie.ntu.edu.tw yangc@csie.ntu.edu.tw",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Ministry of Science and Technology of
                 Taiwan [MOST-105-2221-E-002-156-MY2,
                 MOST-105-2622-8-002-002, MOST-105-2218-E-002-025];
                 MediaTek Inc., Hsin-chu, Taiwan",
  funding-text = "This work is supported in part by research grants from
                 the Ministry of Science and Technology of Taiwan
                 (MOST-105-2221-E-002-156-MY2, MOST-105-2622-8-002-002,
                 and MOST-105-2218-E-002-025), and sponsored by MediaTek
                 Inc., Hsin-chu, Taiwan.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache lines; cache locality; cache locality aware
                 thread block scheduling; Cache memory; cache storage;
                 consecutive thread blocks; Dispatching; dynamic
                 locality estimation; energy-efficient platform; GPGPU;
                 GPGPU performance; graphics processing units; Graphics
                 processing units; Instruction sets; locality-aware
                 scheduler; multi-threading; performance improvement
                 opportunities; precious cache capacity; processor
                 scheduling; serious cache contention; shared cache;
                 thread block scheduling; Two dimensional displays",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Chen:2017:IGP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Garland:2017:LCM,
  author =       "James Garland and David Gregg",
  title =        "Low Complexity Multiply Accumulate Unit for
                 Weight-Sharing Convolutional Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "132--135",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2656880",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Convolutional Neural Networks (CNNs) are one of the
                 most successful deep machine learning technologies for
                 processing image, voice and video data. CNNs require
                 large amounts of processing capacity and memory, which
                 can exceed the resources of low power mobile and
                 embedded systems. Several designs for hardware
                 accelerators have been proposed for CNNs which
                 typically contain large numbers of Multiply Accumulate
                 (MAC) units. One approach to reducing data sizes and
                 memory traffic in CNN accelerators is ``weight
                 sharing'', where the full range of values in a trained
                 CNN are put in bins and the bin index is stored instead
                 of the original weight value. In this paper we propose
                 a novel MAC circuit that exploits binning in
                 weight-sharing CNNs. Rather than computing the MAC
                 directly we instead count the frequency of each weight
                 and place it in a bin. We then compute the accumulated
                 value in a subsequent multiply phase. This allows
                 hardware multipliers in the MAC circuit to be replaced
                 with adders and selection logic. Experiments show that
                 for the same clock speed our approach results in fewer
                 gates, smaller logic, and reduced power.",
  acknowledgement = ack-nhfb,
  affiliation =  "Garland, J (Reprint Author), Trinity Coll Dublin,
                 Dublin 2, Ireland. Garland, James; Gregg, David,
                 Trinity Coll Dublin, Dublin 2, Ireland.",
  author-email = "jgarland@tcd.ie david.gregg@cs.tcd.ie",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Science Foundation Ireland [12/IA/1381]",
  funding-text = "This research is supported by Science Foundation
                 Ireland, Project 12/IA/1381.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adders; arithmetic hardware circuits; bin index; CNN
                 accelerators; convolution; Convolutional neural
                 network; Convolutional neural networks; deep machine
                 learning technologies; embedded systems; Energy
                 efficiency; feedforward neural nets; hardware
                 accelerators; hardware multipliers; learning
                 (artificial intelligence); Logic gates; MAC circuit;
                 Machine learning; memory traffic; multiply accumulate;
                 multiply accumulate units; multiplying circuits; Neural
                 networks; original weight value; power efficiency;
                 subsequent multiply phase; video data; weight-sharing
                 CNN; weight-sharing convolutional neural networks",
  number-of-cited-references = "9",
  ORCID-numbers = "Garland, James/0000-0002-8688-9407",
  research-areas = "Computer Science",
  researcherid-numbers = "Garland, James/L-1294-2019",
  times-cited =  "2",
  unique-id =    "Garland:2017:LCM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jung:2017:NIP,
  author =       "Myoungsoo Jung",
  title =        "{NearZero}: an Integration of Phase Change Memory with
                 Multi-Core Coprocessor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "136--140",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2694828",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Multi-core based coprocessors have become powerful
                 research vehicles to analyze a large amount of data.
                 Even though they can accelerate data processing by
                 using a hundred cores, the data unfortunately exist on
                 an external storage device. The separation of
                 computation and storage introduces redundant memory
                 copies and unnecessary data transfers over different
                 physical device boundaries, which limit the benefits of
                 coprocessor-accelerated data processing. In addition,
                 the coprocessors need assistance from host-side
                 resources to access the external storage, which can
                 require additional system context switches. To address
                 these challenges, we propose NearZero, a novel
                 DRAM-less coprocessor architecture that precisely
                 integrates a state-of-the-art phase change memory into
                 its multi-core accelerator. In this work, we implement
                 an FPGA-based memory controller that extracts important
                 device parameters from real phase change memory chips,
                 and apply them to a commercially available hardware
                 platform that employs multiple processing elements over
                 a PCIe fabric. The evaluation results reveal that
                 NearZero achieves on average 47 percent better
                 performance than advanced coprocessor approaches that
                 use direct I/Os (between storage and coprocessors),
                 while consuming only 19 percent of the total energy of
                 such advanced coprocessors.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jung, M (Reprint Author), Yonsei Univ, Seoul 03722,
                 South Korea. Jung, Myoungsoo, Yonsei Univ, Seoul 03722,
                 South Korea.",
  author-email = "m.jung@yonsei.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NRF [2016R1C1B2015312, DE-AC02-05CH 11231];
                 MSIP [IITP-2017-2017-0-01015]; [MemRay 2015-11-1731]",
  funding-text = "The author thanks MemRay Corporation, Samsung, TI for
                 their research sample donation and technical support.
                 The author also thanks J. Zhang, H. Jeong and G. Park
                 who help him prepare to set up preliminary evaluation
                 environment. This research is supported by MemRay
                 2015-11-1731. This work is also supported in part by
                 NRF 2016R1C1B2015312, DE-AC02-05CH 11231 and MSIP
                 IITP-2017-2017-0-01015.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerators; additional system context; advanced
                 coprocessor approaches; Computer architecture;
                 coprocessors; Coprocessors; data processing; Data
                 storage; Data transfer; DRAM chips; DRAM-less
                 coprocessor architecture; external storage device;
                 Field programmable gate arrays; field programmable gate
                 arrays; hardware architecture; host-side resources;
                 hybrid systems; important device parameters; mass
                 storage; memory structures; multicore accelerator;
                 multicore-based coprocessors; multiple processing
                 elements; multiprocessing systems; multiprocessors;
                 NearZero; Network architecture; non-volatile memory;
                 Nonvolatile memory; parallel architectures; phase
                 change memories; phase change memory chips; Phase
                 change random access memory; powerful research
                 vehicles; redundant memory copies; Storage devices;
                 storage management; unnecessary data transfers",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  researcherid-numbers = "Jung, Myoungsoo/F-4565-2019",
  times-cited =  "2",
  unique-id =    "Jung:2017:NIP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yavits:2017:RAD,
  author =       "Leonid Yavits and Uri Weiser and Ran Ginosar",
  title =        "Resistive Address Decoder",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "141--144",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2670539",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardwired dynamic NAND address decoders are widely
                 used in random access memories to decode parts of the
                 address. Replacing wires by resistive elements allows
                 storing and reprogramming the addresses and matching
                 them to an input address. The resistive address decoder
                 thus becomes a content addressable memory, while the
                 read latency and dynamic energy remain almost identical
                 to those of a hardwired address decoder. One
                 application of the resistive address decoder is a fully
                 associative TLB with read latency and energy
                 consumption similar to those of a one-way associative
                 TLB. Another application is a many-way associative
                 cache with read latency and energy consumption similar
                 to those of a direct mapped one. A third application is
                 elimination of physical addressing and using virtual
                 addresses throughout the entire memory hierarchy by
                 introducing the resistive address decoder into the main
                 memory.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yavits, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
                 Yavits, Leonid; Weiser, Uri; Ginosar, Ran, Technion
                 Israel Inst Technol, Dept Elect Engn, IL-3200000 Haifa,
                 Israel.",
  author-email = "yavits@tx.technion.ac.il uri.weiser@ee.technion.ac.il
                 ran@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Address decoder; cache; cache storage; CAM; content
                 addressable memory; content-addressable storage;
                 Decoding; decoding; dynamic energy; energy consumption;
                 Energy consumption; fully associative TLB; hardwired
                 address decoder; hardwired dynamic NAND address
                 decoders; Logic gates; many-way associative cache;
                 memory hierarchy; memristors; Memristors; memristors;
                 NAND circuits; Network address translation; one-way
                 associative TLB; physical address; physical addressing
                 using virtual addresses; Programming; RAM; random
                 access memories; Random access memory; random-access
                 storage; read latency; resistive address decoder;
                 resistive memory; TLB; virtual address; virtual
                 addresses",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Yavits:2017:RAD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Manivannan:2017:RAG,
  author =       "Madhavan Manivannan and Miquel Peric{\`a}s and
                 Vassilis Papaefstathiou and Per Stenstr{\"o}m",
  title =        "Runtime-Assisted Global Cache Management for
                 Task-Based Parallel Programs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "145--148",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2606593",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Dead blocks are handled inefficiently in multi-level
                 cache hierarchies because the decision as to whether a
                 block is dead has to be taken locally at each cache
                 level. This paper introduces runtime-assisted global
                 cache management to quickly deem blocks dead across
                 cache levels in the context of task-based parallel
                 programs. The scheme is based on a cooperative
                 hardware/software approach that leverages static and
                 dynamic information about future data region reuse(s)
                 available to runtime systems for task-based parallel
                 programming models. We show that our proposed
                 runtime-assisted global cache management approach
                 outperforms previously proposed local dead-block
                 management schemes for task-based parallel programs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Manivannan, M (Reprint Author), Chalmers Univ Technol,
                 Dept Comp Sci \& Engn, S-41258 Gothenburg, Sweden.
                 Manivannan, Madhavan; Pericas, Miquel; Papaefstathiou,
                 Vassilis; Stenstrom, Per, Chalmers Univ Technol, Dept
                 Comp Sci \& Engn, S-41258 Gothenburg, Sweden.",
  author-email = "madhavan@chalmers.se miquelp@chalmers.se
                 vaspap@chalmers.se per.stenstrom@chalmers.se",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Swedish Foundation for Strategic Research
                 (SSF) under SCHEME project [RIT10-0033]; European
                 Research Council (ERC) under MECCA project [340328]",
  funding-text = "This research is supported by grants from the Swedish
                 Foundation for Strategic Research (SSF) under the
                 SCHEME project (RIT10-0033) and the European Research
                 Council (ERC) under the MECCA project (contract
                 340328). The simulations were run on the resources
                 provided by the Swedish National Infrastructure for
                 Computing (SNIC) at C3SE.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache level; Cache memory; cache storage; Data models;
                 dead blocks; dead-block management schemes; Multi-level
                 cache hierarchies; multilevel cache hierarchies;
                 Optimization; parallel programming; Parallel
                 programming; parallel programming models; parallel
                 programs; prediction; Predictive models; run-time
                 system; Runtime; runtime systems; runtime-assisted
                 global cache management; Semantics; storage
                 management",
  keywords-plus = "REPLACEMENT; PREDICTION",
  number-of-cited-references = "20",
  oa =           "Bronze",
  ORCID-numbers = "Stenstrom, Per/0000-0002-4280-3843",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Manivannan:2017:RAG",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Perais:2017:SFM,
  author =       "Arthur Perais and Andr{\'e} Seznec",
  title =        "Storage-Free Memory Dependency Prediction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "149--152",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2628379",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory Dependency Prediction (MDP) is paramount to
                 good out-of-order performance, but decidedly not
                  trivial as all instances of a given static load may
                 not necessarily depend on all instances of a given
                 static store. As a result, for a given load, MDP should
                 predict the exact store instruction the load depends
                 on, and not only whether it depends on an inflight
                 store or not, i.e., ideally, prediction should not be
                 binary. However, we first argue that given the high
                 degree of sophistication of modern branch predictors,
                 the fact that a given dynamic load depends on an
                 inflight store can be captured using the binary
                 prediction capabilities of the branch predictor,
                 providing coarse MDP at zero storage overhead. Second,
                 by leveraging hysteresis counters, we show that the
                 precise producer store can in fact be identified. This
                 embodiment of MDP yields performance levels that are on
                 par with state-of-the-art, and requires less than 70
                 additional bits of storage over a baseline without MDP
                 at all.",
  acknowledgement = ack-nhfb,
  affiliation =  "Perais, A (Reprint Author), INRIA IRISA, F-35000
                 Rennes, France. Perais, Arthur; Seznec, Andre, INRIA
                 IRISA, F-35000 Rennes, France.",
  author-email = "arthur.perais@inria.fr andre.seznec@inria.fr",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "binary prediction capabilities; branch prediction
                 space-efficiency; branch predictor; cache storage;
                 coarse MDP; instruction sets; MDP yields performance
                 levels; Memory dependency prediction; memory dependency
                 prediction; Memory management; modern branch
                 predictors; Out of order; out-of-order performance;
                 precise producer store; Predictive models; storage
                 management; storage-free memory dependency prediction;
                 zero storage overhead",
  keywords-plus = "COMMUNICATION; QUEUE",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Perais:2017:SFM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Mirhosseini:2017:SPB,
  author =       "Amirhossein Mirhosseini and Aditya Agrawal and Josep
                 Torrellas",
  title =        "{Survive}: Pointer-Based In-{DRAM} Incremental
                 Checkpointing for Low-Cost Data Persistence and
                 Rollback-Recovery",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "153--157",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2646340",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper introduces the Survive DRAM architecture
                 for effective in-memory micro-checkpointing. Survive
                 implements low-cost incremental checkpointing, enabling
                 fast rollback that can be used in various architectural
                 techniques such as speculation, approximation, or low
                 voltage operation. Survive also provides crash
                 consistency when used as the frontend of a hybrid
                 DRAM-NVM memory system. This is accomplished by
                 carefully copying the incremental checkpoints generated
                 in the DRAM frontend to the NVM backend. Simulations
                 show that Survive only imposes an average 3.5 percent
                 execution time overhead over an unmodified DRAM
                 main-memory system with no checkpointing, while
                 reducing the number of NVM writes by 89 percent over an
                 NVM-only main-memory system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mirhosseini, A (Reprint Author), Univ Michigan, Ann
                 Arbor, MI 48109 USA. Mirhosseini, Amirhossein, Univ
                 Michigan, Ann Arbor, MI 48109 USA. Agrawal, Aditya,
                 NVIDIA Corp, Santa Clara, CA 95050 USA. Torrellas,
                 Josep, Univ Illinois, Champaign, IL 61801 USA.",
  author-email = "miramir@umich.edu adityaa@nvidia.com
                 torrella@illinois.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural techniques; checkpointing;
                 Checkpointing; checkpointing; Computer architecture;
                 Computer crashes; DRAM chips; hybrid DRAM-NVM memory
                 system; In-DRAM incremental checkpointing; in-memory
                 microcheckpointing; incremental checkpoints; low
                 voltage operation; low-cost data persistence; low-cost
                 incremental checkpointing; memory architecture;
                 Non-volatile memory; Nonvolatile memory; NVM-only
                 main-memory system; Random access memory; random-access
                 storage; reliability; rollback-recovery; software fault
                 tolerance; survive DRAM architecture; system recovery;
                 Transistors; unmodified DRAM main-memory system",
  keywords-plus = "PHASE-CHANGE MEMORY",
  number-of-cited-references = "21",
  ORCID-numbers = "Mirhosseini, Amirhossein/0000-0001-6501-6087",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Mirhosseini:2017:SPB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Pinto:2017:TTA,
  author =       "Sandro Pinto and Jorge Pereira and Tiago Gomes and
                 Mongkol Ekpanyapong and Adriano Tavares",
  title =        "Towards a {TrustZone}-Assisted Hypervisor for
                 Real-Time Embedded Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "158--161",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2617308",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Virtualization technology starts becoming more and
                 more widespread in the embedded space. The penalties
                 incurred by standard software-based virtualization is
                 pushing research towards hardware-assisted solutions.
                 Among the existing commercial off-the-shelf
                 technologies for secure virtualization, ARM TrustZone
                 is attracting particular attention. However, it is
                 often seen with some scepticism due to the dual-OS
                 limitation of existing state-of-the-art solutions. This
                 letter presents the implementation of a TrustZone-based
                 hypervisor for real-time embedded systems, which allows
                 multiple RTOS partitions on the same hardware platform.
                 The results demonstrate that virtualization overhead is
                 less than 2 percent for a 10 milliseconds
                 guest-switching rate, and the system remains
                 deterministic. This work goes beyond related work by
                 implementing a TrustZone-assisted solution that allows
                 the execution of an arbitrary number of guest OSes
                 while providing the foundation to drive next generation
                 of secure virtualization solutions for
                 resource-constrained embedded devices.",
  acknowledgement = ack-nhfb,
  affiliation =  "Pinto, S (Reprint Author), Univ Minho, Dept Ctr
                 Algoritmi, P-4704553 Braga, Portugal. Pinto, Sandro;
                 Pereira, Jorge; Gomes, Tiago; Tavares, Adriano, Univ
                 Minho, Dept Ctr Algoritmi, P-4704553 Braga, Portugal.
                 Ekpanyapong, Mongkol, Asian Inst Technol, Pathum Thani
                 12120, Thailand.",
  author-email = "sandro.pinto@algoritmi.uminho.pt
                 jorge.m.pereira@algoritmi.uminho.pt
                 tiago.m.gomes@algoritmi.uminho.pt mongkol@ait.ac.th
                 adriano.tavares@algoritmi.uminho.pt",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "COMPETE [POCI-01-0145-FEDER-007043]; FCT -
                 Fundacao para a Ciencia e Tecnologia
                 [SFRH/BD/91530/2012, UID/CEC/00319/2013]",
  funding-text = "This work has been supported by COMPETE:
                 POCI-01-0145-FEDER-007043 and FCT --- Fundacao para a
                 Ciencia e Tecnologia (grant SFRH/BD/91530/2012 and
                 UID/CEC/00319/2013).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ARM; ARM TrustZone; dual-OS limitation; embedded
                 space; embedded systems; Embedded systems; embedded
                 systems; hardware platform; hardware-assisted
                 solutions; monitor; Monitoring; multiple RTOS
                 partitions; operating systems (computers); Program
                 processors; real-time; real-time embedded systems;
                 Real-time systems; RODOS; secure virtualization
                 solutions; security of data; standard software; trusted
                 computing; TrustZone; TrustZone-assisted solution;
                 Virtual machine monitors; virtualisation;
                 Virtualization; virtualization overhead; virtualization
                 technology",
  number-of-cited-references = "12",
  ORCID-numbers = "Gomes, Tiago/0000-0002-4071-9015 Salgado Pinto,
                 Sandro Emanuel/0000-0003-4580-7484 Tavares,
                 Adriano/0000-0001-8316-6927",
  research-areas = "Computer Science",
  researcherid-numbers = "Gomes, Tiago/A-4751-2016 Salgado Pinto, Sandro
                 Emanuel/D-6725-2015 Tavares, Adriano/M-5257-2013",
  times-cited =  "3",
  unique-id =    "Pinto:2017:TTA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Carlson:2017:THL,
  author =       "Trevor E. Carlson and Kim-Anh Tran and Alexandra
                 Jimborean and Konstantinos Koukos and Magnus
                 Sj{\"a}lander and Stefanos Kaxiras",
  title =        "Transcending Hardware Limits with Software
                 Out-of-Order Processing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "162--165",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2672559",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Building high-performance, next-generation processors
                 require novel techniques to enable improved performance
                  given today's power- and energy-efficiency requirements.
                 Additionally, a widening gap between processor and
                 memory performance makes it even more difficult to
                 improve efficiency with conventional techniques. While
                 out-of-order architectures attempt to hide this memory
                 latency with dynamically reordered instructions, they
                 lack the energy efficiency seen in in-order processors.
                 Thus, our goal is to reorder the instruction stream to
                 avoid stalls and improve utilization for energy
                 efficiency and performance. To accomplish this goal, we
                 propose an enhanced stall-on-use in-order core that
                 improves energy efficiency (and therefore performance
                  in these power-limited designs) through
                 out-of-program-order execution. During long latency
                 loads, the Software Out-of-Order Processing (SWOOP)
                  core exposes additional memory- and instruction-level
                 parallelism to perform useful, non-speculative work.
                 The resulting instruction lookahead of the SWOOP core
                 reaches beyond the conventional fixed-sized processor
                 structures with the help of transparent hardware
                 register contexts. Our results show that SWOOP
                 demonstrates a 34 percent performance improvement on
                 average compared with an in-order, stall-on-use core,
                 with an energy reduction of 23 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Carlson, TE (Reprint Author), Uppsala Univ, S-75236
                 Uppsala, Sweden. Carlson, Trevor E.; Tran, Kim-Anh;
                 Jimborean, Alexandra; Koukos, Konstantinos; Sjalander,
                 Magnus; Kaxiras, Stefanos, Uppsala Univ, S-75236
                 Uppsala, Sweden. Sjalander, Magnus, Norwegian Univ Sci
                 \& Technol NTNU, N-7491 Trondheim, Norway.",
  author-email = "trevor.carlson@it.uu.se kim-anh.tran@it.uu.se
                 alexandra.jimborean@it.uu.se
                 konstantinos.koukos@it.uu.se
                 magnus.sjalander@idi.ntnu.no
                 stefanos.kaxiras@it.uu.se",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Compilation; Context awareness; decoupled
                 access-execute; dynamically reordered instructions;
                 energy; energy conservation; energy efficiency; Energy
                 management; energy reduction; energy-efficiency
                 requirements; enhanced stall-on-use; fixed-sized
                 processor structures; hardware limits; in-order core;
                 in-order processors; instruction stream;
                 instruction-level parallelism; memory level
                 parallelism; microprocessor chips; next-generation
                 processors; Out of order; out-of-program-order
                 execution; parallel architectures; power-limited
                 designes; Prefetching; resulting instruction lookahead;
                 software out-of-order processing; stall-on-use core;
                 SWOOP",
  number-of-cited-references = "9",
  ORCID-numbers = "Sjalander, Magnus/0000-0003-4232-6976 Jimborean,
                 Alexandra/0000-0001-8642-2447",
  research-areas = "Computer Science",
  researcherid-numbers = "Sjalander, Magnus/N-5995-2019",
  times-cited =  "0",
  unique-id =    "Carlson:2017:THL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Ahmadvand:2017:UDV,
  author =       "Hossein Ahmadvand and Maziar Goudarzi",
  title =        "Using Data Variety for Efficient Progressive Big Data
                 Processing in Warehouse-Scale Computers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "166--169",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2636293",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Warehouse Scale Computers (WSC) are often used for
                 various big data jobs where the big data under
                 processing comes from a variety of sources. We show
                 that different data portions, from the same or
                 different sources, have different significances in
                 determining the final outcome of the computation, and
                 hence, by prioritizing them and assigning more
                 resources to processing of more important data, the WSC
                 can be used more efficiently in terms of time as well
                 as cost. We provide a simple low-overhead mechanism to
                 quickly assess the significance of each data portion,
                 and show its effectiveness in finding the best ranking
                 of data portions. We continue by demonstrating how this
                 ranking is used in resource allocation to improve time
                 and cost by up to 24 and 9 percent respectively, and
                 also discuss other uses of this ranking information,
                 e.g., in faster progressive approximation of the final
                 outcome of big data job without processing entire data,
                 and in more effective use of renewable energies in
                 WSCs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ahmadvand, H (Reprint Author), Sharif Univ Technol,
                 Dept Comp Engn, Azadi Ave, Tehran 1136511155, Iran.
                 Ahmadvand, Hossein; Goudarzi, Maziar, Sharif Univ
                 Technol, Dept Comp Engn, Azadi Ave, Tehran 1136511155,
                 Iran.",
  author-email = "ahmadvand@ce.sharif.edu goudarzi@sharif.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Sharif University of Technology [G930826]",
  funding-text = "This research is supported by grant number G930826
                 from Sharif University of Technology. We are grateful
                 for their support.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Big data; Big Data; Big data;
                 Computers; data warehouses; Distributed databases;
                 efficiency; efficient progressive Big Data processing;
                 order of processing; resource allocation; Resource
                 management; sampling; warehouse-scale computers; WSC",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Ahmadvand:2017:UDV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhang:2017:WDP,
  author =       "Dan Zhang and Xiaoyu Ma and Derek Chiou",
  title =        "Worklist-Directed Prefetching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "16",
  number =       "2",
  pages =        "170--173",
  month =        jul # "\slash " # dec,
  year =         "2017",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2016.2627571",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Researchers have demonstrated the benefits of hardware
                 worklist accelerators, which offload scheduling and
                 load balancing operations in parallel graph
                 applications. However, many of these applications are
                 still heavily memory latency-bound due to the irregular
                 nature of graph data structure access patterns. We
                 utilize the fact that the accelerator has knowledge of
                 upcoming work items to accurately issue prefetch
                 requests, a technique we call worklist-directed
                 prefetching. A credit-based system to improve prefetch
                 timeliness and prevent cache thrashing is proposed. The
                 proposed prefetching scheme is simulated on a 64-core
                 CMP with a hardware worklist accelerator on several
                 graph algorithms and inputs. Enabling worklist-directed
                 prefetching into the L2 cache results in an average
                 speedup of 1.99, and up to 2.35 on Breadth-First
                 Search.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhang, D (Reprint Author), Univ Texas Austin, Dept
                 Elect \& Comp Engn, Austin, TX 78712 USA. Zhang, Dan;
                 Ma, Xiaoyu; Chiou, Derek, Univ Texas Austin, Dept Elect
                 \& Comp Engn, Austin, TX 78712 USA.",
  author-email = "dan.zhang@utexas.edu xma@utexas.edu
                 derek@ece.utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FR2AX",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerators; cache storage; data structures; graph
                 algorithms; graph data structure access patterns; graph
                 problems; graph theory; Hardware; hardware worklist
                 accelerator; load-balancing operations; microprocessor
                 chips; parallel graph applications; parallel
                 processors; Prefetching; prefetching researchers;
                 prefetching scheme; Processor scheduling; resource
                 allocation; scheduling; Software algorithms; storage
                 management",
  keywords-plus = "ARCHITECTURAL SUPPORT; ALGORITHM",
  number-of-cited-references = "23",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Zhang:2017:WDP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Scionti:2018:EMM,
  author =       "Alberto Scionti and Somnath Mazumdar and Stephane
                 Zuckerman",
  title =        "Enabling Massive Multi-Threading with Fast Hashing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2697863",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The next generation of high-performance computers is
                 expected to execute threads in orders of magnitude
                 higher than today's systems. Improper management of
                 such huge amount of threads can create resource
                 contention, leading to overall degraded system
                 performance. By leveraging more practical approaches to
                 distribute threads on the available resources,
                 execution models and manycore chips are expected to
                 overcome limitations of current systems. Here, we
                 present DELTA --- a Data-Enabled muLti-Threaded
                 Architecture, where a producer-consumer scheme is used
                 to execute threads via complete distributed thread
                 management mechanism. We consider a manycore tiled-chip
                 architecture where Network-on-Chip (NoC) routers are
                 extended to support our execution model. The proposed
                 extension is analysed, while simulation results confirm
                 that DELTA can manage a large number of simultaneous
                 threads, relying on a simple hardware structure.",
  acknowledgement = ack-nhfb,
  affiliation =  "Scionti, A (Reprint Author), ISMB, I-10138 Turin,
                 Italy. Scionti, Alberto, ISMB, I-10138 Turin, Italy.
                 Mazumdar, Somnath, Univ Siena, Siena, SI, Italy.
                 Zuckerman, Stephane, Michigan Technol Univ, Houghton,
                 MI 49931 USA.",
  author-email = "scionti@ismb.it mazumdar@dii.unisi.it
                 szuckerm@mtu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "complete distributed thread management mechanism;
                 Computational modeling; Computer architecture;
                 data-enabled multithreaded architecture; Dataflow;
                 degraded system performance; DELTA; execution model;
                 fast hashing; Hardware; hashing; high-performance
                 computers; Instruction sets; manycore chips; manycore
                  tiled-chip architecture; massive multithreading;
                 microprocessor chips; multi-threading; multiprocessing
                 systems; network-on-chip; network-on-chip routers;
                 Organizations; producer-consumer scheme; Programming;
                 resource contention; Scheduling; simultaneous threads;
                 thread-scheduling",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Scionti:2018:EMM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2018:IIC,
  author =       "Anonymous",
  title =        "2017 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 16",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "1--6",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2799560",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jeon:2018:HMP,
  author =       "Dong-Ik Jeon and Kyeong-Bin Park and Ki-Seok Chung",
  title =        "{HMC-MAC}: Processing-in Memory Architecture for
                 Multiply--Accumulate Operations with Hybrid Memory
                 Cube",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2700298",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Many studies focus on implementing processing-in
                 memory (PIM) on the logic die of the hybrid memory cube
                 (HMC) architecture. The multiply-accumulate (MAC)
                 operation is heavily used in digital signal processing
                 (DSP) systems. In this paper, a novel PIM architecture
                 called HMC-MAC that implements the MAC operation in the
                 HMC is proposed. The vault controllers of the
                 conventional HMC are working independently to maximize
                 the parallelism, and HMC-MAC is based on the
                 conventional HMC without modifying the architecture
                 much. Therefore, a large number of MAC operations can
                 be processed in parallel. In HMC-MAC, the MAC operation
                 can be carried out simultaneously with as much as 128
                 KB data. The correctness on HMC-MAC is verified by
                 simulations, and its performance is better than the
                 conventional CPU-based MAC operation when the MAC
                 operation is consecutively executed at least six
                  times.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chung, KS (Reprint Author), Hanyang Univ, Dept Elect
                 \& Comp Engn, Seoul 04763, South Korea. Jeon, Dong-Ik;
                 Park, Kyeong-Bin; Chung, Ki-Seok, Hanyang Univ, Dept
                 Elect \& Comp Engn, Seoul 04763, South Korea.",
  author-email = "estwingz@naver.com lay1523@naver.com
                 kchung@hanyang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                 National Research Foundation of Korea(NRF) --- Ministry
                 of Education [NRF-2015R1D1A1A09061079]",
  funding-text = "This research was supported by Basic Science Research
                 Program through the National Research Foundation of
                 Korea(NRF) funded by the Ministry of Education
                 (NRF-2015R1D1A1A09061079).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computers; CPU-based MAC operation; digital signal
                 processing; digital signal processing systems; DRAM
                 chips; DSP systems; Electronic mail; HMC-MAC; hybrid
                 memory cube architecture; logic circuits; logic die;
                 memory architecture; Memory architecture; Memory
                 management; memory size 128.0 KByte; Memory structures;
                 memory used as logic; multiple data stream
                 architectures; multiply-accumulate operation; parallel
                 processing; processing-in memory architecture; Random
                 access memory; Registers; vault controllers",
  number-of-cited-references = "11",
  ORCID-numbers = "Jeon, Dong-Ik/0000-0002-8572-4184",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Jeon:2018:HMP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{VandenSteen:2018:MSP,
  author =       "Sam {Van den Steen} and Lieven Eeckhout",
  title =        "Modeling Superscalar Processor Memory-Level
                 Parallelism",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2701370",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes an analytical model to predict
                 Memory-Level Parallelism (MLP) in a superscalar
                 processor. We profile the workload once and measure a
                 set of distributions to characterize the workload's
                 inherent memory behavior. We subsequently generate a
                 virtual instruction stream, over which we then process
                 an abstract MLP model to predict MLP for a particular
                 micro-architecture with a given ROB size, LLC size,
                 MSHR size and stride-based prefetcher. Experimental
                 evaluation reports an improvement in modeling error
                 from 16.9 percent for previous work to 3.6 percent on
                 average for the proposed model.",
  acknowledgement = ack-nhfb,
  affiliation =  "Van den Steen, S (Reprint Author), Univ Ghent, Ghent,
                 Belgium. Van den Steen, Sam; Eeckhout, Lieven, Univ
                 Ghent, Ghent, Belgium.",
  author-email = "sam.vandensteen@ugent.be lieven.eeckhout@ugent.be",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Agency for Innovation by Science and
                 Technology (IWT)",
  funding-text = "We thank the anonymous reviewers for their
                 constructive and insightful feedback. Sam Van den Steen
                 is supported through a doctoral fellowship by the
                 Agency for Innovation by Science and Technology
                 (IWT).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; Computational modeling; Computer
                 architecture; Hardware; LLC size; Load modeling; memory
                 architecture; memory level parallelism (MLP);
                 micro-architecture; MLP model; Modeling; MSHR size;
                 Predictive models; Prefetching; ROB size; superscalar
                 processor memory-level parallelism modeling; virtual
                 instruction stream",
  number-of-cited-references = "11",
  ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "VandenSteen:2018:MSP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Durkovic:2018:BNS,
  author =       "Srdjan Durkovic and Zoran Cica",
  title =        "{Birkhoff--von Neumann} Switch Based on Greedy
                 Scheduling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2707082",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "It is important to develop high performance packet
                 switches that are highly scalable. Among the popular
                 solutions are input queued (IQ) switches and load
                 balanced Birkhoff-von Neumann (LB-BvN) switches.
                 However, both solutions have their drawbacks. Switch
                 configuration pattern in IQ switches is random which
                 can limit the supported port speed. On the other hand,
                 LB-BvN switches require two switching stages which
                 increase the overall cost. Also, some LB-BvN solutions
                 suffer from the packet out of sequence problem. In this
                 paper, we propose a novel packet switch architecture
                 that combines the best properties of the IQ and LB-BvN
                 switches and eliminates their drawbacks.",
  acknowledgement = ack-nhfb,
  affiliation =  "Cica, Z (Reprint Author), Univ Belgrade, Sch Elect
                 Engn, Belgrade 11120, Serbia. Durkovic, Srdjan; Cica,
                 Zoran, Univ Belgrade, Sch Elect Engn, Belgrade 11120,
                 Serbia.",
  author-email = "srdjad6@gmail.com zoran.cica@etf.rs",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Architecture; data communications; Delays; greedy
                 scheduling; high performance packet; Internet; IP
                 networks; IQ switches; LB-BvN solutions; LB-BvN
                 switches; load balanced Birkhoff-von Neumann switches;
                 packet switch architecture; packet switching;
                 packet-switching networks; Ports (Computers); queueing
                 theory; Random access memory; resource allocation;
                 routers; Scheduling; switch configuration pattern;
                 Switches; switching stages; telecommunication
                 scheduling",
  keywords-plus = "2-STAGE SWITCHES; DESIGN; ALGORITHM",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Durkovic:2018:BNS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Pham:2018:TSM,
  author =       "Binh Pham and Derek Hower and Abhishek Bhattacharjee
                 and Trey Cain",
  title =        "{TLB} Shootdown Mitigation for Low-Power Many-Core
                 Servers with {L1} Virtual Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2712140",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Power efficiency has become one of the most important
                 design constraints for high-performance systems. In
                 this paper, we revisit the design of low-power
                 virtually-addressed caches. While virtually-addressed
                 caches enable significant power savings by obviating
                 the need for Translation Lookaside Buffer (TLB)
                 lookups, they suffer from several challenging design
                 issues that curtail their widespread commercial
                 adoption. We focus on one of these challenges---cache
                 flushes due to virtual page remappings. We use detailed
                 studies on an ARM many-core server to show that this
                 problem degrades performance by up to 25 percent for a
                 mix of multi-programmed and multi-threaded workloads.
                 Interestingly, we observe that many of these flushes
                 are spurious, and caused by an indiscriminate
                 invalidation broadcast on ARM architecture. In
                 response, we propose a low-overhead and readily
                 implementable hardware mechanism using bloom filters to
                 reduce spurious invalidations and mitigate their ill
                 effects.",
  acknowledgement = ack-nhfb,
  affiliation =  "Pham, B (Reprint Author), Rutgers State Univ, Dept
                 Comp Sci, Piscataway, NJ 08854 USA. Binh Pham;
                 Bhattacharjee, Abhishek, Rutgers State Univ, Dept Comp
                 Sci, Piscataway, NJ 08854 USA. Hower, Derek, Qualcomm
                 Technol Inc, Piscataway, NJ 08854 USA. Cain, Trey,
                 Qualcomm Datactr Technol Inc, Piscataway, NJ 08854
                 USA.",
  author-email = "binhpham@rutgers.edu dhower@qti.qualcomm.com
                 abhib@rutgers.edu tcain@qti.qualcomm.com",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ARM many-core server; Benchmark testing; bloom
                 filters; cache flushes; cache storage; Coherence;
                 Computer architecture; design constraints; Hardware;
                 high-performance systems; Indexes; L1 virtual caches;
                 low-overhead; low-power many-core servers; low-power
                 virtually-addressed caches; microprocessor chips;
                 multi-threading; multicores; multiprocessing systems;
                 multiprogrammed workloads; multiprogramming;
                 multithreaded workloads; multithreading; power
                 efficiency; power savings; Registers; Servers; TLB; TLB
                 shootdown mitigation; Virtual Cache; virtual memory;
                 virtual page remappings",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Pham:2018:TSM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yavits:2018:ASM,
  author =       "Leonid Yavits and Ran Ginosar",
  title =        "Accelerator for Sparse Machine Learning",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2714667",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Sparse matrix by vector multiplication (SpMV) plays a
                 pivotal role in machine learning and data mining. We
                 propose and investigate an SpMV accelerator,
                 specifically designed to accelerate the sparse matrix
                 by sparse vector multiplication (SpMSpV), and to be
                 integrated in a CPU core. We show that our accelerator
                 outperforms a similar solution by 70x while achieving
                 8x higher power efficiency, which yields an estimated
                 29x energy reduction for SpMSpV based applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yavits, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
                 Yavits, Leonid; Ginosar, Ran, Technion Israel Inst
                 Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.",
  author-email = "yavits@technion.ac.il ran@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; accelerator; Algorithm design and
                 analysis; CPU core; data mining; Indexes; learning
                 (artificial intelligence); matrix multiplication;
                 Memory management; microprocessor chips; power aware
                 computing; power efficiency; Random access memory;
                 regression analysis; sparse machine learning; sparse
                 matrices; Sparse matrices; sparse matrix; sparse matrix
                 by sparse vector multiplication; Sparse matrix
                 multiplication; sparse vector multiplication; SpMSpV
                 based applications; SpMV; SpMV accelerator; tree
                 searching; vectors",
  keywords-plus = "MATRIX-VECTOR MULTIPLICATION",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Yavits:2018:ASM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Christoforidis:2018:CTC,
  author =       "Eleftherios-Iordanis Christoforidis and Sotirios Xydis
                 and Dimitrios Soudris",
  title =        "{CF-TUNE}: Collaborative Filtering Auto-Tuning for
                 Energy Efficient Many-Core Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2716919",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Energy efficiency is considered today as a first class
                 design principle of modern many-core computing systems
                 in the effort to overcome the limited power envelope.
                 However, many-core processors are characterised by high
                 micro-architectural complexity, which is propagated up
                 to the application level affecting both performance and
                 energy consumption. In this paper, we present CF-TUNE,
                 an online and scalable auto-tuning framework for energy
                 aware applications mapping on emerging many-core
                 architectures. CF-TUNE enables the extraction of an
                 energy-efficient tuning configuration point with
                 minimal application characterisation on the whole
                 tuning configuration space. Instead of analyzing every
                 application against every tuning configuration, it
                 adopts a collaborative filtering technique that quickly
                 and with high accuracy configures the application's
                 tuning parameters by identifying similarities with
                 previously optimized applications. We evaluate
                 CF-TUNE's efficiency against a set of demanding and
                 diverse applications mapped on Intel Many Integrated
                 Core processor and we show that with minimal
                 characterization, e.g., only either two or four
                 evaluations, CF-TUNE recommends a tuning configuration
                 that performs at least at the 94 percent level of the
                 optimal one.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xydis, S (Reprint Author), Natl Tech Univ Athens, Sch
                 Elect \& Comp Engn, Zografos 15780, Greece.
                 Christoforidis, Eleftherios-Iordanis; Xydis, Sotirios;
                 Soudris, Dimitrios, Natl Tech Univ Athens, Sch Elect \&
                 Comp Engn, Zografos 15780, Greece.",
  author-email = "eleftherios.christoforidis@gmail.com
                 sxydis@microlab.ntua.gr dsoudris@microlab.ntua.gr",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application level; application tuning parameters;
                 Auto-tuning; CF-TUNE efficiency; Collaboration;
                 collaborative filtering auto-tuning; Computer
                 architecture; design space exploration; energy aware
                 application mapping; energy conservation; energy
                 consumption; energy efficient computing; energy
                 efficient many-core processors; energy-efficient tuning
                 configuration point; Instruction sets; Intel many
                 integrated core processor; Intel MIC; machine learning;
                 many-core architectures; manycore architectures;
                 microarchitectural complexity; microprocessor chips;
                 Microwave integrated circuits; minimal application
                 characterisation; modern many-core computing systems;
                 multiprocessing systems; online auto-tuning framework;
                 Optimization; power aware computing; power envelope;
                 scalable auto-tuning framework; Tuning; tuning
                 configuration space",
  number-of-cited-references = "15",
  ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
  times-cited =  "0",
  unique-id =    "Christoforidis:2018:CTC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Almatrood:2018:DGP,
  author =       "Amjad F. Almatrood and Harpreet Singh",
  title =        "Design of Generalized Pipeline Cellular Array in
                 Quantum-Dot Cellular Automata",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2719021",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cellular arrays have been the topic of interest in
                 computer arithmetic and architecture for the last four
                 decades. In this letter, an overall quantum-dot
                 cellular automata (QCA) design for a generalized
                 pipeline cellular array is presented. QCA is one of the
                 promising emerging nanotechnologies that are being
                 considered as possible alternatives to complementary
                 metal-oxide semiconductor technology due to the
                 physical limitations of CMOS. The QCA designs for
                 arithmetic cell and control cell used in the pipeline
                 array are discussed in detail. The equivalent majority
                 logic networks to these cells are generated using the
                 best existing majority logic synthesis method in order
                 to obtain the optimal majority networks which require
                 fewer QCA cells and clock zones compared to other
                 synthesis methods. The proposed array can perform all
                 the basic arithmetic operations such as squaring,
                 square rooting, multiplication, division, etc., which
                 could be quite valuable in considering future
                 large-scale QCA designs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Almatrood, AF (Reprint Author), Wayne State Univ, Dept
                 Elect \& Comp Engn, Detroit, MI 48202 USA. Almatrood,
                 Amjad F.; Singh, Harpreet, Wayne State Univ, Dept Elect
                 \& Comp Engn, Detroit, MI 48202 USA.",
  author-email = "amjad.almatrood@wayne.edu hsingh@eng.wayne.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "arithmetic cell; Arithmetic processor; cellular
                 arrays; cellular automata; clock zones; Clocks; clocks;
                 CMOS logic circuits; CMOS technology; complementary
                 metal-oxide semiconductor technology; computer
                 architecture; Computer architecture; computer
                 arithmetic; control cell; Delays; equivalent majority
                 logic networks; generalized pipeline cellular array
                 design; large-scale QCA designs; Logic arrays; logic
                 design; Logic gates; majority logic; majority logic
                 synthesis method; Microprocessors; nanoelectronics;
                 nanotechnologies; pipeline array; Pipelines;
                 quantum-dot cellular automata (QCA); quantum-dot
                 cellular automata design",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Almatrood:2018:DGP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zha:2018:CRC,
  author =       "Yue Zha and Jing Li",
  title =        "{CMA}: A Reconfigurable Complex Matching Accelerator
                 for Wire-Speed Network Intrusion Detection",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2719023",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The rapid growth in network bandwidth and the ever
                 more sophisticated network attack techniques pose
                 challenges to current network intrusion detection
                 systems (NIDS). While software-based solutions are
                 incapable of performing wire-speed network traffic
                 monitoring, many hardware-based pattern matching
                 solutions also suffer from capacity limitation and high
                 power consumption. To effectively address these
                 challenges, we propose a reconfigurable complex
                 matching accelerator (CMA) enabled by the emerging
                 nonvolatile memory technology (resistive random access
                 memory) to speed up intrusion detection systems with
                 better energy efficiency. Beyond common equality
                 matching in current NIDS, CMA can be configured to
                 provide a comprehensive set of arithmetic matching
                 functions (e.g., less than), resulting in improved
                 utilization and higher energy efficiency. We evaluate
                 CMA using real-world network security benchmarks. On
                 average, it achieves 84.9 percent area reduction, 97.3
                 percent energy consumption reduction, and 20 percent
                 improvement in searching speed compared to the
                 SRAM-based Ternary Content Addressable Memory (TCAM)
                 design in state-of-the-art NIDS. It also outperforms
                 emerging RRAM-based TCAM (2.5T1R) design in area,
                 energy and search delay, on the set of evaluated
                 workloads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zha, Y (Reprint Author), Univ Wisconsin, Elect \& Comp
                 Engn, Madison, WI 53706 USA. Zha, Yue; Li, Jing, Univ
                 Wisconsin, Elect \& Comp Engn, Madison, WI 53706 USA.",
  author-email = "yzha3@wisc.edu jli587@wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator; arithmetic matching functions; CMA;
                 Computer architecture; computer network security;
                 computer networks; content-addressable storage;
                 Coprocessors; emerging nonvolatile memory technology;
                 Encoding; energy consumption reduction; higher energy
                 efficiency; intrusion detection; Intrusion detection;
                 IP networks; network bandwidth; network intrusion
                 detection systems; Network security; NIDS; pattern
                 matching; pattern matching solutions; Ports
                 (Computers); random-access storage; real-world network
                 security benchmarks; reconfigurable complex matching
                 accelerator; ReRAM; resistive random access memory;
                 security of data; sophisticated network attack
                 techniques; SRAM chips; TCAM; telecommunication
                 traffic; ternary content addressable memory design;
                 wire-speed network intrusion detection; wire-speed
                 network traffic monitoring",
  keywords-plus = "PACKET CLASSIFICATION; MODEL",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Zha:2018:CRC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jung:2018:SMS,
  author =       "Myoungsoo Jung and Jie Zhang and Ahmed Abulila and
                 Miryeong Kwon and Narges Shahidi and John Shalf and Nam
                 Sung Kim and Mahmut Kandemir",
  title =        "{SimpleSSD}: Modeling Solid State Drives for Holistic
                 System Simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "37--41",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2750658",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Existing solid state drive (SSD) simulators
                 unfortunately lack hardware and/or software
                 architecture models. Consequently, they are far from
                 capturing the critical features of contemporary SSD
                 devices. More importantly, while the performance of
                 modern systems that adopt SSDs can vary based on their
                 numerous internal design parameters and storage-level
                 configurations, a full system simulation with
                 traditional SSD models often requires unreasonably long
                 runtimes and excessive computational resources. In this
                 work, we propose SimpleSSD, a high-fidelity simulator
                 that models all detailed characteristics of hardware
                 and software, while simplifying the nondescript
                 features of storage internals. In contrast to existing
                 SSD simulators, SimpleSSD can easily be integrated into
                 publicly-available full system simulators. In addition,
                 it can accommodate a complete storage stack and
                 evaluate the performance of SSDs along with diverse
                 memory technologies and microarchitectures. Thus, it
                 facilitates simulations that explore the full design
                 space at different levels of system abstraction.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jung, M (Reprint Author), Yonsei Univ, Comp
                 Architecture \& Memory Syst Lab, Seoul 03722, South
                 Korea. Jung, Myoungsoo; Zhang, Jie; Kwon, Miryeong,
                 Yonsei Univ, Comp Architecture \& Memory Syst Lab,
                 Seoul 03722, South Korea. Abulila, Ahmed; Kim, Nam
                 Sung, Univ Illinois, Champaign, IL 61820 USA. Shahidi,
                 Narges; Kandemir, Mahmut, Penn State Univ, State Coll,
                 PA 16801 USA. Shalf, John, Lawrence Berkeley Natl Lab,
                 Berkeley, CA 94720 USA.",
  author-email = "m.jung@yonsei.ac.kr jie@yonsei.ac.kr
                 abulila2@illinois.edu mkwon@camelab.org nxs314@psu.edu
                 jshalf@lbl.gov nskim@illinois.edu
                 kandemir@cse.psu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NRF [2016R1C1B2015312]; Mem-Ray grant
                 [2015-11-1731]; US National Science Foundation
                 [1640196, 1439021, 1439057, 1409095, 1626251, 1629915,
                 1629129, 1526750]; SRC/NRC NERC [2016-NE-2697-A];
                 [IITP-2017-2017-0-01015]; [NRF-2015M3C4A7065645]; [DOE
                 DE-AC02-05CH 11231]",
  funding-text = "This research is mainly supported by NRF
                 2016R1C1B2015312. This work is also supported in part
                 by IITP-2017-2017-0-01015, NRF-2015M3C4A7065645, DOE
                 DE-AC02-05CH 11231, and Mem-Ray grant (2015-11-1731).
                 Dr. Kim is supported in part by US National Science
                 Foundation 1640196 and SRC/NRC NERC 2016-NE-2697-A. Dr.
                 Kandemir is supported in part by US National Science
                 Foundation grants 1439021, 1439057, 1409095, 1626251,
                 1629915, 1629129 and 1526750.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "complete storage stack; computational modeling;
                  Computational modeling; computer architecture;
                  Computer architecture; contemporary SSD devices;
                  flash memories; Hardware; high-fidelity simulator;
                  internal design parameters; microprocessors;
                  Microprocessors; nondescript features; parallel
                  processing; Parallel processing; publicly-available
                  full system simulators; SimpleSSD; software;
                  Software; solid state drive simulators; SSD
                  simulators; storage-level configurations; system
                  abstraction; system simulation; systems simulation;
                  Systems simulation",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  researcherid-numbers = "Jung, Myoungsoo/F-4565-2019",
  times-cited =  "2",
  unique-id =    "Jung:2018:SMS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Chowdhury:2018:EMP,
  author =       "Zamshed Chowdhury and Jonathan D. Harms and S. Karen
                 Khatamifard and Masoud Zabihi and Yang Lv and Andrew P.
                 Lyle and Sachin S. Sapatnekar and Ulya R. Karpuzcu and
                 Jian-Ping Wang",
  title =        "Efficient In-Memory Processing Using Spintronics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "42--46",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2751042",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As the overhead of data retrieval becomes forbidding,
                 bringing processor logic to the memory where the data
                 reside becomes more energy-efficient. While traditional
                 CMOS structures are unsuited to the tight integration
                 of logic and memory, emerging spintronic technologies
                 show remarkable versatility. This paper introduces a
                 novel spintronics-based processing-in-memory (PIM)
                 framework called computational RAM (CRAM) to solve
                 data-intensive computing problems.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chowdhury, Z (Reprint Author), Univ Minnesota, Dept
                 Elect \& Comp Engn, Minneapolis, MN 55455 USA.
                 Chowdhury, Zamshed; Harms, Jonathan D.; Khatamifard, S.
                 Karen; Zabihi, Masoud; Lv, Yang; Lyle, Andrew P.;
                 Sapatnekar, Sachin S.; Karpuzcu, Ulya R.; Wang,
                 Jian-Ping, Univ Minnesota, Dept Elect \& Comp Engn,
                 Minneapolis, MN 55455 USA.",
  author-email = "chowh005@umn.edu harms074@umn.edu khatami@umn.edu
                 zabih003@umn.edu lvxxx057@umn.edu
                 czamshediqbal@gmail.com sachin@umn.edu ukarpuzc@umn.edu
                 jpwang@umn.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "DARPA Non-Volatile Logic program; NSF SPX
                 [1725420]; by C-SPIN, one of the six SRC STARnet
                 Centers; MARCO; DARPA",
  funding-text = "This work is supported by DARPA Non-Volatile Logic
                 program, NSF SPX grant no. 1725420, and by C-SPIN, one
                 of the six SRC STARnet Centers, sponsored by MARCO and
                 DARPA. Chowdhury and Harms equally contributed to this
                 work.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adders; computational RAM; CRAM; data retrieval;
                 data-intensive computing problems; Efficient In-Memory
                 Processing; energy-efficiency; Logic arrays; Logic
                 gates; Magnetic tunneling; magnetoelectronics; Memory
                 management; MRAM devices; MTJ; PIM framework;
                 processing-in-memory; processing-in-memory framework;
                 processor logic; Random access memory; spintronic
                 technologies; spintronics; STT-MRAM; traditional CMOS
                 structures",
  keywords-plus = "UNIVERSAL MEMORY; LOGIC",
  number-of-cited-references = "25",
  ORCID-numbers = "Sapatnekar, Sachin/0000-0002-5353-2364",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Chowdhury:2018:EMP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Ajdari:2018:SHB,
  author =       "Mohammadamin Ajdari and Pyeongsu Park and Dongup Kwon
                 and Joonsung Kim and Jangwoo Kim",
  title =        "A Scalable {HW}-Based Inline Deduplication for {SSD}
                 Arrays",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "47--50",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2753258",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "SSD arrays are becoming popular in modern storage
                 servers as a primary storage, and they aim to reduce
                 the high cost of the devices by performing inline
                 deduplications. Unfortunately, existing software-based
                 inline deduplications cannot achieve the devices'
                 maximum throughput due to their high CPU utilization
                 and power overhead. A recently proposed approach to
                 perform device-wide deduplications inside each SSD can
                 distribute the CPU overhead among multiple SSDs, but it
                 also suffers from severely decreasing deduplication
                 opportunities with the increasing number of SSDs
                 deployed per node. Therefore, we propose a node-wide
                 deduplication engine that relies on specialized
                 hardware to perform two key steps of deduplication;
                 data signature generation and table management. Our
                 FPGA-based prototype detects all duplicates, and
                 compared to software-based inline deduplication, it
                 reduces the overall CPU utilization and power
                  consumption by 93.6 and $\sim$20 percent
                 respectively for a slow baseline and more for faster
                 baselines.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, J (Reprint Author), Seoul Natl Univ, Dept Elect
                 \& Comp Engn, Seoul 08826, South Korea. Ajdari,
                 Mohammadamin, POSTECH, Dept Comp Sci \& Engn, Pohang
                 37673, South Korea. Park, Pyeongsu; Kwon, Dongup; Kim,
                 Joonsung; Kim, Jangwoo, Seoul Natl Univ, Dept Elect \&
                 Comp Engn, Seoul 08826, South Korea.",
  author-email = "majdari@postech.ac.kr pyeongsu@snu.ac.kr
                 dongup@snu.ac.kr joonsung90@snu.ac.kr
                 jangwoo@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                 National Research Foundation of Korea (NRF) ---
                 Ministry of Science, ICT \& Future Planning
                 [NRF-2015M3C4A7065647, NRF-2017R1A2B3011038]; Institute
                 for Information \& communications Technology Promotion
                 (IITP) grant --- Korea government (MSIT)
                 [R0190-15-2012]",
  funding-text = "This work was partly supported by Basic Science
                 Research Program through the National Research
                 Foundation of Korea (NRF) funded by the Ministry of
                 Science, ICT \& Future Planning (NRF-2015M3C4A7065647,
                 NRF-2017R1A2B3011038), and Institute for Information \&
                 communications Technology Promotion (IITP) grant funded
                 by the Korea government (MSIT) (No. R0190-15-2012).
                 Mohammadamin Ajdari and Pyeongsu Park contributed
                 equally to this work.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "CPU overhead; CPU utilization; data handling; data
                 integrity; deduplication; deduplication opportunities;
                 device-wide deduplications; Engines; field programmable
                 gate arrays; file servers; flash memories; FPGA;
                 FPGA-based prototype; Hardware; inline deduplication;
                 modern storage servers; node-wide deduplication engine;
                 Performance evaluation; power consumption; Power
                 demand; power overhead; primary storage; Random access
                 memory; Servers; software-based inline deduplications;
                 SSD; SSD arrays; storage management; Storage server;
                 Throughput",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Ajdari:2018:SHB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hoseinzadeh:2018:FBS,
  author =       "Morteza Hoseinzadeh",
  title =        "Flow-Based Simulation Methodology",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "51--54",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2756051",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper presents flow-based simulation, a new
                 methodology for evaluating novel and intricate computer
                 system designs. The main idea of flow-based simulation
                 is to keep the history of every simulated memory
                 element, instead of its latest value, to make it time
                 bonded so that sliding the time forward and backward
                 changes the state of the system accordingly. Having
                 this opportunity, new architectural designs can be
                 evaluated in terms of timing and energy by implementing
                 only a functional simulation. Due to serial execution,
                 the process of the design in a flow-based simulation is
                 traceable and easy to understand. As a result,
                 comparing with cycle-driven and event-driven
                 techniques, complicated algorithms can be evaluated
                 much easier. Flow-based simulation simplifies the
                 burden of the timing simulation, and consequently leads
                 to faster development and simulation time.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hoseinzadeh, M (Reprint Author), Univ Calif San Diego,
                 Dept Comp Sci \& Engn, La Jolla, CA 92093 USA.
                 Hoseinzadeh, Morteza, Univ Calif San Diego, Dept Comp
                 Sci \& Engn, La Jolla, CA 92093 USA.",
  author-email = "mhoseinzadeh@cs.ucsd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Computer architectural
                 simulator; Concurrent computing; cycle-driven
                 techniques; digital simulation; event-driven
                 techniques; flow-based simulation; flow-based
                 simulation methodology; functional simulation; History;
                 Integrated circuit modeling; Interference; intricate
                 computer system designs; simulated memory element;
                 simulation methodologies; Timing; timing simulation;
                 Tools",
  keywords-plus = "FULL-SYSTEM",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hoseinzadeh:2018:FBS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Eyerman:2018:MSC,
  author =       "Stijn Eyerman and Wim Heirman and Kristof {Du Bois}
                 and Ibrahim Hur",
  title =        "Multi-Stage {CPI} Stacks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "55--58",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2761751",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "CPI stacks are an intuitive way to visualize processor
                 core performance bottlenecks. However, they often do
                 not provide a full view on all bottlenecks, because
                 stall events can occur concurrently. Typically one of
                 the events is selected, which means information about
                 the non-chosen stall events is lost. Furthermore, we
                 show that there is no single correct CPI stack: stall
                 penalties can be hidden, can overlap or can cause
                 second-order effects, making total CPI more complex
                 than just a sum of components. Instead of showing a
                 single CPI stack, we propose to measure multiple CPI
                 stacks during program execution: a CPI stack at each
                 stage of the processor pipeline. This representation
                 reveals all performance bottlenecks and provides a more
                 complete view on the performance of an application.
                 Multi-stage CPI stacks are easy to collect, which means
                 that they can be included in a simulator with
                 negligible slowdown, and that they can be included in
                 the core hardware with limited overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Eyerman, S (Reprint Author), Intel Corp, Santa Clara,
                 CA 95054 USA. Eyerman, Stijn; Heirman, Wim; Du Bois,
                 Kristof; Hur, Ibrahim, Intel Corp, Santa Clara, CA
                 95054 USA.",
  author-email = "Stijn.Eyerman@intel.com Wim.Heirman@intel.com
                 Kristof.Du.Bois@intel.com Ibrahim.Hur@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "1/f noise; Additives; CPI stacks; Hardware;
                 microprocessor chips; multiple CPI stacks; multistage
                 CPI stacks; Performance analysis; performance counters;
                 performance evaluation; Performance gain; pipeline
                 processing; Pipelines; processor core performance
                 bottlenecks; processor pipeline; program execution;
                 Proposals; Radiation detectors; single correct CPI
                 stack; stall events; stall penalties; total CPI",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Eyerman:2018:MSC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhang:2018:LHC,
  author =       "Guowei Zhang and Daniel Sanchez",
  title =        "Leveraging Hardware Caches for Memoization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "59--63",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2762308",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memoization improves performance and saves energy by
                 caching and reusing the outputs of repetitive
                 computations. Prior work has proposed software and
                 hardware memoization techniques, but both have
                 significant drawbacks. Software memoization suffers
                 from high runtime overheads, and is thus limited to
                 long computations. Conventional hardware memoization
                 techniques achieve low overheads and can memoize short
                 functions, but they rely on large, special-purpose
                 memoization caches that waste significant area and
                 energy. We propose MCACHE, a hardware technique that
                 leverages data caches for memoization. MCACHE stores
                 memoization tables in memory, and allows them to share
                 cache capacity with normal program data. MCACHE
                 introduces ISA and pipeline extensions to accelerate
                 memoization operations, bridging the gap between
                 software and conventional hardware techniques.
                 Simulation results show that MCACHE improves
                  performance by up to $21\times$, outperforms software
                  memoization by up to $2.2\times$, and achieves similar or
                 superior performance over conventional hardware
                 techniques without any dedicated storage.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sanchez, D (Reprint Author), MIT CSAIL, Cambridge, MA
                 02139 USA. Zhang, Guowei; Sanchez, Daniel, MIT CSAIL,
                 Cambridge, MA 02139 USA.",
  author-email = "zhanggw@csail.mit.edu sanchez@csail.mit.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "C-FAR, one of six SRC STAR-net centers by
                 MARCO; C-FAR, one of six SRC STAR-net centers by DARPA;
                 NSF [CAREER-1452994]",
  funding-text = "This work was supported in part by C-FAR, one of six
                 SRC STAR-net centers by MARCO and DARPA, and by NSF
                 grant CAREER-1452994.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; Benchmark testing; cache capacity; cache
                 storage; caches; Computer architecture; data caches;
                 energy by caching; Hardware; hardware caches; Indexes;
                 MCACHE; memoization; memoization operations;
                 memoization tables; memory systems; power aware
                 computing; Registers; runtime overheads; Semantics;
                 Software; software memoization suffers; special-purpose
                 memoization caches",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Zhang:2018:LHC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Vakil-Ghahani:2018:CRP,
  author =       "Armin Vakil-Ghahani and Sara Mahdizadeh-Shahri and
                 Mohammad-Reza Lotfi-Namin and Mohammad Bakhshalipour
                 and Pejman Lotfi-Kamran and Hamid Sarbazi-Azad",
  title =        "Cache Replacement Policy Based on Expected Hit Count",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "64--67",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2762660",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory-intensive workloads operate on massive amounts
                 of data that cannot be captured by last-level caches
                 (LLCs) of modern processors. Consequently, processors
                 encounter frequent off-chip misses, and hence, lose
                 significant performance potential. One of the
                 components of a modern processor that has a prominent
                 influence on the off-chip miss traffic is LLC's
                 replacement policy. Existing processors employ a
                 variation of least recently used (LRU) policy to
                 determine the victim for replacement. Unfortunately,
                 there is a large gap between what LRU offers and that
                 of Belady's MIN, which is the optimal replacement
                 policy. Belady's MIN requires selecting a victim with
                 the longest reuse distance, and hence, is unfeasible
                 due to the need for knowing the future. In this work,
                 we observe that there exists a strong correlation
                 between the expected number of hits of a cache block
                 and the reciprocal of its reuse distance. Taking
                 advantage of this observation, we improve the
                 efficiency of last-level caches through a
                 low-cost-yet-effective replacement policy. We suggest a
                 hit-count based victim-selection procedure on top of
                 existing low-cost replacement policies to significantly
                 improve the quality of victim selection in last-level
                 caches without commensurate area overhead. Our proposal
                 offers 12.2 percent performance improvement over the
                 baseline LRU in a multi-core processor and outperforms
                 EVA, which is the state-of-the-art replacement
                 policy.",
  acknowledgement = ack-nhfb,
  affiliation =  "Bakhshalipour, M (Reprint Author), Sharif Univ
                 Technol, Dept Comp Engn, Tehran 1115511365, Iran.
                 Vakil-Ghahani, Armin; Mahdizadeh-Shahri, Sara;
                 Lotfi-Namin, Mohammad-Reza; Bakhshalipour, Mohammad;
                 Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp
                 Engn, Tehran 1115511365, Iran. Lotfi-Kamran, Pejman;
                 Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
                 Comp Sci, Tehran 1953833511, Iran.",
  author-email = "vakil@ce.sharif.edu smahdizadeh@ce.sharif.edu
                 mrlotfi@ce.sharif.edu bakhshalipour@ce.sharif.edu
                 plotfi@ipm.ir azad@sharif.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Belady's MIN; cache block; cache replacement policy;
                 cache storage; Correlation; expected hit count;
                 History; hit-count based victim-selection procedure;
                 last-level cache; last-level caches; longest reuse
                 distance; low-cost replacement policies;
                 low-cost-yet-effective replacement policy; Memory
                 system; memory-intensive workload; memory-intensive
                 workloads; Multicore processing; multicore processor;
                 multiprocessing systems; off-chip miss traffic;
                 off-chip misses; optimal replacement policy;
                 performance evaluation; performance improvement;
                 Prefetching; Proposals; Radiation detectors;
                 replacement policy; victim selection",
  keywords-plus = "PREDICTION",
  number-of-cited-references = "16",
  ORCID-numbers = "Vakil Ghahani, Seyed Armin/0000-0002-4365-8932",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Vakil-Ghahani:2018:CRP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hadjilambrou:2018:SCV,
  author =       "Zacharias Hadjilambrou and Shidhartha Das and Marco A.
                 Antoniades and Yiannakis Sazeides",
  title =        "Sensing {CPU} Voltage Noise Through Electromagnetic
                 Emanations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "68--71",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2766221",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This work proposes sensing CPU voltage noise through
                 wireless electromagnetic (EM) emanations from the CPU.
                 Compared to previous voltage monitoring methodologies,
                 this approach is not intrusive as it does not require
                 direct physical access to the monitored CPU. To prove
                 the effectiveness of this approach, we use EM signal
                 feedback to find the resonant frequency of the CPU
                 power delivery network, and to generate a CPU voltage
                 noise (dI/dt) virus. This study is performed on a
                 modern out-of-order CPU that supports on-chip fine
                 grain voltage monitoring. This on-chip voltage
                 monitoring capability is used to validate the proposed
                 EM methodology.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hadjilambrou, Z (Reprint Author), Univ Cyprus, CY-1678
                 Nicosia, Cyprus. Hadjilambrou, Zacharias; Antoniades,
                 Marco A.; Sazeides, Yiannakis, Univ Cyprus, CY-1678
                 Nicosia, Cyprus. Das, Shidhartha, ARM, Cambridge CB1
                 9NJ, England.",
  author-email = "zhadji01@cs.ucy.ac.cy Shidhartha.Das@arm.com
                 mantonia@ucy.ac.cy yanos@cs.ucy.ac.cy",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "European Union Horizon 2020 project
                 Uniserver [688540]; University of Cyprus",
  funding-text = "This work is partially supported by European Union
                 Horizon 2020 project Uniserver grant no. 688540 and the
                 University of Cyprus.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "circuit resonance; CPU power delivery network; CPU
                 voltage noise virus; electromagnetic emanations; EM
                 signal feedback; Frequency measurement; Genetic
                 algorithms; Hardware reliability; microprocessor chips;
                 Monitoring; on-chip fine grain voltage monitoring;
                 on-chip voltage monitoring capability; Resonant
                 frequency; RLC circuits; Stress; stress tests;
                 System-on-chip; voltage noise; voltage regulators;
                 wireless electromagnetic emanations",
  number-of-cited-references = "19",
  ORCID-numbers = "Antoniades, Marco/0000-0002-9699-2387",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Hadjilambrou:2018:SCV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Jung:2018:PCU,
  author =       "Daejin Jung and Sunjung Lee and Wonjong Rhee and Jung
                 Ho Ahn",
  title =        "Partitioning Compute Units in {CNN} Acceleration for
                 Statistical Memory Traffic Shaping",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "72--75",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2773055",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Convolutional Neural Networks (CNNs) have become the
                 default choice for processing visual information, and
                 the design complexity of CNNs has been steadily
                 increasing to improve accuracy. To cope with the
                 massive amount of computation needed for such complex
                 CNNs, the latest solutions utilize blocking of an image
                 over the available dimensions (e.g., horizontal,
                 vertical, channel, and kernel) and batching of multiple
                 input images to improve data reuse in the memory
                 hierarchy. While there has been a large collection of
                 works on maximizing data reuse, only a few studies have
                 focused on the memory bottleneck problem caused by
                 limited bandwidth. Bandwidth bottleneck can easily
                 occur in CNN acceleration as CNN layers have different
                 sizes with varying computation needs and as batching is
                 typically performed over each layer of CNN for an ideal
                 data reuse. In this case, the data transfer demand for
                 a layer can be relatively low or high compared to the
                 computation requirement of the layer, and therefore
                 temporal fluctuations in memory access can be induced
                 eventually causing bandwidth problems. In this paper,
                 we first show that there exists a high degree of
                 fluctuation in memory access to computation ratio
                 depending on CNN layers and functions in the layer
                 being processed by the compute units (cores), where the
                 compute units are tightly synchronized to maximize data
                 reuse. Then we propose a strategy of partitioning the
                 compute units where the cores within each partition
                 process a batch of input data in a synchronous manner
                 to maximize data reuse but different partitions run
                 asynchronously. Because the partitions stay
                 asynchronous and typically process different CNN layers
                 at any given moment, the memory access traffic sizes of
                 the partitions become statistically shuffled. Thus, the
                 partitioning of compute units and asynchronous use of
                 them make the total memory access traffic size be
                 smoothened over time, and the degree of partitioning
                 determines a tradeoff between data reuse efficiency and
                 memory bandwidth utilization efficiency. We call this
                 smoothing statistical memory traffic shaping, and we
                 show that it can lead to 8.0 percent of performance
                 gain on a commercial 64-core processor when running
                 ResNet-50.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rhee, W; Ahn, JH (Reprint Author), Seoul Natl Univ,
                 Dept Transdisciplinary Studies, Seoul 151742, South
                 Korea. Jung, Daejin; Lee, Sunjung; Rhee, Wonjong; Ahn,
                 Jung Ho, Seoul Natl Univ, Dept Transdisciplinary
                 Studies, Seoul 151742, South Korea.",
  author-email = "haijd@snu.ac.kr shiish@snu.ac.kr wrhee@snu.ac.kr
                 gajh@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Research Foundation of Korea grant
                 --- Korea government [NRF-2017R1A2B2005416,
                 NRF-2017R1E1A1A03070560]",
  funding-text = "This work was partially supported by the National
                 Research Foundation of Korea grant funded by the Korea
                 government (NRF-2017R1A2B2005416 and
                 NRF-2017R1E1A1A03070560).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; Bandwidth; bandwidth bottleneck;
                 bandwidth problems; CNN; CNN acceleration; CNN layers;
                  complex CNNs; compute units; computation requirement;
                 Computational modeling; Computer architecture;
                 Convolution; convolutional neural networks; data
                 transfer demand; horizontal channel; ideal data reuse;
                 image processing; Kernel; maximize data reuse; memory
                 access traffic sizes; memory bandwidth utilization
                 efficiency; memory bottleneck; memory bottleneck
                 problem; memory hierarchy; microprocessor chips;
                 multiprocessing systems; neural nets; Neural networks;
                 parallel processing; partitioning; partitioning compute
                 units; smoothing statistical memory traffic shaping;
                 traffic shaping; vertical channel",
  number-of-cited-references = "16",
  oa =           "Bronze",
  ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394 Rhee,
                 Wonjong/0000-0002-2590-8774",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Jung:2018:PCU",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{SanMiguel:2018:EMA,
  author =       "Joshua {San Miguel} and Karthik Ganesan and Mario Badr
                 and Natalie {Enright Jerger}",
  title =        "The {EH} Model: Analytical Exploration of
                 Energy-Harvesting Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "76--79",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2777834",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Energy-harvesting devices-which operate solely on
                 energy collected from their environment-have brought
                 forth a new paradigm of intermittent computing. These
                 devices succumb to frequent power outages that would
                 cause conventional systems to be stuck in a perpetual
                 loop of restarting computation and never making
                 progress. Ensuring forward progress in an intermittent
                 execution model is difficult and requires saving state
                 in non-volatile memory. In this work, we propose the EH
                 model to explore the trade-offs associated with backing
                 up data to maximize forward progress. In particular, we
                 focus on the relationship between energy and forward
                 progress and how they are impacted by backups/restores
                 to derive insights for programmers and architects.",
  acknowledgement = ack-nhfb,
  affiliation =  "San Miguel, J (Reprint Author), Univ Toronto, Edward S
                 Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S,
                 Canada. San Miguel, Joshua; Ganesan, Karthik; Badr,
                 Mario; Jerger, Natalie Enright, Univ Toronto, Edward S
                 Rogers Sr Dept Elect \& Comp Engn, Toronto, ON M5S,
                 Canada.",
  author-email = "joshua.sanmiguel@mail.utoronto.ca
                 karthik.ganesan@mail.utoronto.ca
                 mario.badr@mail.utoronto.ca enright@ece.utoronto.ca",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "analytical exploration; analytical model; Analytical
                 models; Computational modeling; Computer architecture;
                 conventional systems; EH model; energy harvesting;
                 Energy-harvesting; energy-harvesting architectures;
                 energy-harvesting devices; forward progress; frequent
                 power outages; intermittent computing; intermittent
                 execution model; Mathematical model; Nonvolatile
                 memory; nonvolatile memory; perpetual loop; power aware
                 computing; Power system reliability; random-access
                 storage",
  number-of-cited-references = "11",
  ORCID-numbers = "Ganesan, Karthik/0000-0002-2541-1549",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "SanMiguel:2018:EMA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2018:SPM,
  author =       "Jihun Kim and Joonsung Kim and Pyeongsu Park and Jong
                 Kim and Jangwoo Kim",
  title =        "{SSD} Performance Modeling Using Bottleneck Analysis",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "80--83",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2779122",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Solid-State Drives (SSDs) are widely deployed for high
                 throughput and low latency. However, the unpredictable
                 access latency of SSDs makes it difficult to satisfy
                 quality-of-service requirements and fully achieve the
                 performance potential. In fact, it has been a
                 fundamental challenge to accurately predict the access
                 latency of modern SSDs performing many non-disclosed,
                 device-specific intra-SSD optimizations. In this paper,
                 we propose SSDcheck, a novel SSD performance model
                 which accurately predicts the latency of future SSD
                 accesses. After first identifying write buffer (WB) and
                 garbage collection (GC) as the key components in
                 modeling the access latency, we develop diagnosis
                 snippets to identify the target SSDs critical intra-SSD
                 parameters (e.g., WB size). Finally, we construct the
                 SSDs access-latency model with the identified
                 parameters. Our system-level evaluations using five
                 commodity SSDs show that SSDcheck achieves up to 93
                 percent prediction accuracy. Our real-world prototype
                 applying an SSDcheck-aware system-level request
                 scheduling can significantly improve both throughput
                 and tail latency by up to 2.1x and 1.46x,
                 respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, J (Reprint Author), Seoul Natl Univ, Dept Elect
                 \& Comp Engn, Seoul 151742, South Korea. Kim, Jihun;
                 Kim, Jong, POSTECH, Dept Comp Sci \& Engn, Pohang
                 37673, Gyeongbuk, South Korea. Kim, Joonsung; Park,
                 Pyeongsu; Kim, Jangwoo, Seoul Natl Univ, Dept Elect \&
                 Comp Engn, Seoul 151742, South Korea.",
  author-email = "jihun735@postech.ac.kr jkim@postech.ac.kr
                 pyeongsu@snu.ac.kr joonsung90@snu.ac.kr
                 jangwoo@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                 National Research Foundation of Korea (NRF) ---
                 Ministry of Science, ICT \& Future Planning
                 [NRF-2015M3C4A7065647, NRF-2017R1A2B3011038]; Institute
                 for Information \& communications Technology Promotion
                 (IITP) grant --- Korea government (MSIT)
                 [R0190-15-2012]",
  funding-text = "This work was partly supported by Basic Science
                 Research Program through the National Research
                 Foundation of Korea (NRF) funded by the Ministry of
                 Science, ICT \& Future Planning (NRF-2015M3C4A7065647,
                 NRF-2017R1A2B3011038), and Institute for Information \&
                 communications Technology Promotion (IITP) grant funded
                 by the Korea government (MSIT) (No. R0190-15-2012).
                 Jihun Kim and Joonsung Kim are contributed equally to
                 this work.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bottleneck analysis; cache storage; commodity SSD;
                 critical intra-SSD parameters; device-specific intraSSD
                 optimizations; Engines; Feature extraction; flash
                 memories; future SSD accesses; garbage collection;
                 identified parameters; Interference; Monitoring;
                 Predictive models; quality-of-service requirements;
                 Resource management; scheduling; solid-state drives;
                 SSD access-latency model; SSD check-aware system-level
                 request scheduling; SSD performance model; SSD
                 performance modeling; storage management; Throughput;
                 unpredictable access latency",
  number-of-cited-references = "10",
  ORCID-numbers = "Kim, Jihun/0000-0001-8893-8447",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2018:SPM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Angstadt:2018:MOS,
  author =       "Kevin Angstadt and Jack Wadden and Vinh Dang and Ted
                 Xie and Dan Kramp and Westley Weimer and Mircea Stan
                 and Kevin Skadron",
  title =        "{MNCaRT}: an Open-Source, Multi-Architecture
                 Automata-Processing Research and Execution Ecosystem",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "84--87",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2780105",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present MNCaRT, a comprehensive software ecosystem
                 for the study and use of automata processing across
                 hardware platforms. Tool support includes manipulation
                 of automata, execution of complex machines, high-speed
                 processing of NFAs and DFAs, and compilation of regular
                 expressions. We provide engines to execute automata on
                 CPUs (with VASim and Intel Hyperscan), GPUs (with
                 custom DFA and NFA engines), and FPGAs (with an HDL
                 translator). We also introduce MNRL, an open-source,
                 general-purpose and extensible state machine
                 representation language developed to support MNCaRT.
                 The representation is flexible enough to support
                 traditional finite automata (NFAs, DFAs) while also
                 supporting more complex machines, such as those which
                 propagate multi-bit signals between processing
                 elements. We hope that our ecosystem and representation
                 language stimulates new efforts to develop efficient
                 and specialized automata processing applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Angstadt, K (Reprint Author), Univ Michigan, Comp Sci
                 \& Engn Div, Dept Elect Engn \& Comp Sci, Ann Arbor, MI
                 48109 USA. Angstadt, Kevin; Weimer, Westley, Univ
                 Michigan, Comp Sci \& Engn Div, Dept Elect Engn \& Comp
                 Sci, Ann Arbor, MI 48109 USA. Wadden, Jack; Dang, Vinh;
                 Xie, Ted; Kramp, Dan; Stan, Mircea; Skadron, Kevin,
                 Univ Virginia, Dept Comp Sci, Charlottesville, VA 22904
                 USA.",
  author-email = "angstadt@umich.edu wadden@virginia.edu
                 vqd8a@virginia.edu ted.xie@virginia.edu
                 dankramp@virginia.edu weimerw@umich.edu
                 mircea@virginia.edu skadron@virginia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [CCF-1116673, CCF-1629450, CCF-1619123, CNS-1619098];
                 AFRL [FA8750-15-2-0075]; Jefferson Scholars Foundation;
                 Achievement Rewards for College Scientists (ARCS)
                 Foundation; Xilinx; C-FAR, one of six centers of
                 STARnet; Semiconductor Research Corporation program -
                 MARCO; DARPA",
  funding-text = "This work was supported in part by grants from the US
                 National Science Foundation (CCF-1116673, CCF-1629450,
                 CCF-1619123, CNS-1619098), AFRL (FA8750-15-2-0075),
                 Jefferson Scholars Foundation, Achievement Rewards for
                 College Scientists (ARCS) Foundation, a grant from
                 Xilinx, and support from C-FAR, one of six centers of
                 STARnet, a Semiconductor Research Corporation program
                 sponsored by MARCO and DARPA. Any opinions, findings
                 and conclusions or recommendations expressed in this
                 material are those of the authors and do not
                 necessarily reflect the views of AFRL.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator architectures; Automata; Benchmark
                 testing; complex machines; comprehensive software
                 ecosystem; DFA; Ecosystems; efficient automata
                 processing applications; Engines; extensible state
                 machine representation language; Field programmable
                 gate arrays; field programmable gate arrays; finite
                 automata; finite state machines; formal languages;
                 hardware platforms; high-speed processing; Intel
                  Hyperscan; MNCaRT; NFA engines; open source software;
                  Open source software;
                 open-source-multiarchitecture automata-processing
                 research; software tools; specialized automata
                 processing applications; Tools; traditional finite
                 automata",
  number-of-cited-references = "21",
  ORCID-numbers = "Angstadt, Kevin/0000-0002-0104-5257",
  research-areas = "Computer Science",
  researcherid-numbers = "Stan, Mircea/L-6219-2019",
  times-cited =  "2",
  unique-id =    "Angstadt:2018:MOS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zheng:2018:EPE,
  author =       "Hao Zheng and Ahmed Louri",
  title =        "{EZ-Pass}: an Energy \& Performance-Efficient
                 Power-Gating Router Architecture for Scalable {NoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "88--91",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2783918",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "With technology scaling into nanometer regime, static
                 power is becoming the dominant factor in the overall
                 power consumption of Network-on-Chips (NoCs). Static
                 power can be reduced by powering off routers during
                 consecutive idle time through power-gating techniques.
                 However, power-gating techniques suffer from a large
                 wake-up latency to wake up the powered-off routers.
                 Recent research aims to improve the wake-up latency
                 penalty by hiding it through early wake-up techniques.
                 However, these techniques do not exploit the full
                 advantage of power-gating due to the early wake-up.
                 Consequently, they do not achieve significant power
                 savings. In this paper, we propose an architecture
                 called Easy Pass (EZ-Pass) router that remedies the
                 large wake-up latency overheads while providing
                 significant static power savings. The proposed
                 architecture takes advantage of idle resources in the
                 network interface to transmit packets without waking up
                 the router. Additionally, the technique hides the
                 wake-up latency by continuing to provide packet
                 transmission during the wake-up phase. We use full
                 system simulation to evaluate our EZ-Pass router on a
                 64-core NoC with a mesh topology using PARSEC benchmark
                 suites. Our results show that the proposed router
                 reduces static power by up to 31 percent and overall
                 network latency by up to 32 percent as compared to
                 early-wakeup optimized power-gating techniques.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zheng, H (Reprint Author), George Washington Univ,
                 Dept Elect \& Comp Engn, Washington, DC 20052 USA.
                 Zheng, Hao; Louri, Ahmed, George Washington Univ, Dept
                 Elect \& Comp Engn, Washington, DC 20052 USA.",
  author-email = "haozheng@gwu.edu louri@gwu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; early-wakeup optimized
                 power-gating techniques; easy pass router; energy
                 conservation; energy-efficient; energy-efficient
                 power-gating router architecture; EZ-Pass router;
                 Latches; mesh topology; network interface; network
                  routing; network-on-chip; network-on-chips;
                  Nickel; NoC; PARSEC benchmark suites;
                 performance-efficient power-gating router architecture;
                 Ports (Computers); power consumption; Power-gating;
                 Routing; Routing protocols; scalable NoCs; static power
                 savings; Switches; wake-up latency overheads; wake-up
                 latency penalty; wake-up phase",
  keywords-plus = "ON-CHIP",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Zheng:2018:EPE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Delshadtehrani:2018:NPM,
  author =       "Leila Delshadtehrani and Schuyler Eldridge and
                 Sadullah Canakci and Manuel Egele and Ajay Joshi",
  title =        "{Nile}: a Programmable Monitoring Coprocessor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "92--95",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2784416",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Researchers widely employ hardware performance
                 counters (HPCs) as well as debugging and profiling
                 tools in processors for monitoring different events
                 such as cache hits, cache misses, and branch prediction
                 statistics during the execution of programs. The
                 collected information can be used for power,
                 performance, and thermal management of the system as
                 well as detecting anomalies or malicious behavior in
                 the software. However, monitoring new or complex events
                 using HPCs and existing tools is a challenging task
                 because HPCs only provide a fixed pool of raw events to
                 monitor. To address this challenge, we propose the
                 implementation of a programmable hardware monitor in a
                 complete system framework including the hardware
                 monitor architecture and its interface with an in-order
                 single-issue RISC-V processor as well as an operating
                 system. As a proof of concept, we demonstrate how to
                 programmatically implement a shadow stack using our
                 hardware monitor and how the programmed shadow stack
                 detects stack buffer overflow attacks. Our hardware
                 monitor design incurs a 26 percent power overhead and a
                 15 percent area overhead over an unmodified RISC-V
                 processor. Our programmed shadow stack has less than 3
                 percent performance overhead in the worst case.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delshadtehrani, L (Reprint Author), Boston Univ, Dept
                 Elect \& Comp Engn, Boston, MA 02215 USA.
                 Delshadtehrani, Leila; Eldridge, Schuyler; Canakci,
                 Sadullah; Egele, Manuel; Joshi, Ajay, Boston Univ, Dept
                 Elect \& Comp Engn, Boston, MA 02215 USA.",
  author-email = "delshad@bu.edu schuye@bu.edu scanakci@bu.edu
                 megele@bu.edu joshi@bu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-1533663]",
  funding-text = "We thank Prof. Jonathan Appavoo for providing
                 invaluable help in designing the OS support and the
                 software interface for Nile. This work was supported in
                 part by NSF grant CCF-1533663.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "branch prediction statistics; cache hits; cache
                 misses; cache storage; complete system framework;
                 complex events; coprocessors; Coprocessors; debugging;
                 fixed pool; Hardware; Hardware coprocessor; hardware
                 monitor architecture; hardware monitor design; hardware
                 performance counters; HPCs; Linux; malicious behavior;
                 Monitoring; Nile; operating system; operating systems
                 (computers); Pattern matching; performance evaluation;
                 performance overhead; power overhead; profiling tools;
                 Program processors; programmable hardware; programmable
                 hardware monitor; programmable monitoring coprocessor;
                 programmed shadow stack; raw events; reduced
                 instruction set computing; Rockets; security; shadow
                 stack; single-issue RISC-V processor; stack buffer
                 overflow attack; stack buffer overflow attacks; thermal
                 management; unmodified RISC-V processor",
  number-of-cited-references = "17",
  ORCID-numbers = "Joshi, Ajay/0000-0002-3256-9942",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Delshadtehrani:2018:NPM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lee:2018:TTW,
  author =       "Eojin Lee and Sukhan Lee and G. Edward Suh and Jung Ho
                 Ahn",
  title =        "{TWiCe}: Time Window Counter Based Row Refresh to
                 Prevent Row-Hammering",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "1",
  pages =        "96--99",
  month =        jan # "\slash " # jun,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2017.2787674",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Computer systems using DRAM are exposed to
                 row-hammering attacks, which can flip data in a DRAM
                 row without directly accessing a row but by frequently
                 activating its adjacent ones. There have been a number
                 of proposals to prevent row-hammering, but they either
                 incur large area/performance overhead or provide
                 probabilistic protection. In this paper, we propose a
                 new row-hammering mitigation mechanism named Time
                 Window Counter based row refresh (TWiCe) which prevents
                 row-hammering by using a small number of counters
                 without performance overhead. We first make a key
                 observation that the number of rows that can cause
                 flipping their adjacent ones (aggressor candidates) is
                 limited by the maximum values of row activation
                 frequency and DRAM cell retention time. TWiCe exploits
                 this limit to reduce the required number of counter
                 entries by counting only actually activated DRAM rows
                 and periodically invalidating the entries that are not
                 activated frequently enough to be an aggressor. We
                 calculate the maximum number of required counter
                 entries per DRAM bank, with which row-hammering
                 prevention is guaranteed. We further improve energy
                 efficiency by adopting a pseudo-associative cache
                 design to TWiCe. Our analysis shows that TWiCe incurs
                 no performance overhead on normal DRAM operations and
                 less than 0.7 percent area and energy overheads over
                 contemporary DRAM devices.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lee, E; Ahn, JH (Reprint Author), Seoul Natl Univ,
                 Seoul 151742, South Korea. Lee, Eojin; Lee, Sukhan;
                 Ahn, Jung Ho, Seoul Natl Univ, Seoul 151742, South
                 Korea. Suh, G. Edward, Cornell Univ, Ithaca, NY 14850
                 USA.",
  author-email = "yohoyo@snu.ac.kr infy1026@snu.ac.kr
                 suh@csl.cornell.edu gajh@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "FZ6EO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NRF of Korea [NRF-2017R1A2B2005416]; R\&D
                 program of MOTIE/KEIT [10077609]; IDEC (EDA tool)",
  funding-text = "This work was partially supported by the NRF of Korea
                 grant (NRF-2017R1A2B2005416), by the R\&D program of
                 MOTIE/KEIT (10077609), and by IDEC (EDA tool).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; Computer architecture; DRAM; DRAM cell
                 retention time; DRAM chips; DRAM row; energy
                 efficiency; Microprocessors; Monitoring; performance
                 overhead; Probabilistic logic; pseudoassociative cache
                 design; Random access memory; refresh; reliability; row
                 activation frequency; row-hammering; row-hammering
                 attacks; row-hammering mitigation mechanism;
                 row-hammering prevention; time window counter based row
                 refresh; Time-frequency analysis; TWiCe",
  keywords-plus = "MEMORY",
  number-of-cited-references = "15",
  ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394 Suh,
                 Edward/0000-0001-6409-9888",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Lee:2018:TTW",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Rakshit:2018:LLO,
  author =       "Joydeep Rakshit and Kartik Mohanram",
  title =        "{LEO}: Low Overhead Encryption {ORAM} for Non-Volatile
                 Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "100--104",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2795621",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Data confidentiality attacks utilizing memory access
                 patterns threaten exposure of data in modern main
                 memories. Oblivious RAM (ORAM) is an effective
                 cryptographic primitive developed to thwart
                 access-pattern-based attacks in DRAM-based systems.
                 However, in emerging non-volatile memory (NVM) systems,
                 the increased writes due to encryption of multiple data
                 blocks on every Path ORAM (state-of-the-art efficient
                 ORAM) access impose significant energy, lifetime, and
                 performance overheads. LEO (Low overhead Encryption
                 ORAM) is an efficient Path ORAM encryption architecture
                 that addresses the high write overheads of ORAM
                 integration in NVMs, while providing security
                 equivalent to the baseline Path ORAM. LEO reduces NVM
                 cell writes by securely decreasing the number of block
                 encryptions during the write phase of a Path ORAM
                 access. LEO uses a secure, two-level counter mode
                 encryption framework that opportunistically eliminates
                 re-encryption of unmodified blocks, reducing NVM
                 writes. Our evaluations show that on average, LEO
                 decreases NVM energy by 60 percent, improves lifetime
                 by 1.51 x, and increases performance by 9 percent over
                 the baseline Path ORAM.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rakshit, J (Reprint Author), Univ Pittsburgh, Dept
                 Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Rakshit,
                 Joydeep; Mohanram, Kartik, Univ Pittsburgh, Dept Elect
                 \& Comp Engn, Pittsburgh, PA 15260 USA.",
  author-email = "joydeep.rakshit@pitt.edu kmram@pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "baseline path ORAM; block encryptions; Computer
                 architecture; cryptography; data confidentiality
                 attacks; DRAM chips; efficient path ORAM encryption
                 architecture; emerging nonvolatile memory systems;
                 Encryption; LEO; low-overhead encryption ORAM; memory
                 access patterns; memory security; multiple data blocks;
                 non-volatile memory; nonvolatile memories; Nonvolatile
                 memory; NVM; Oblivious RAM; ORAM integration; path ORAM
                 access; Random access memory; random-access storage;
                 System-on-chip; two-level counter mode encryption
                 framework",
  number-of-cited-references = "21",
  ORCID-numbers = "Rakshit, Joydeep/0000-0002-3670-4814",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Rakshit:2018:LLO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Do:2018:CRL,
  author =       "Sang Wook Stephen Do and Michel Dubois",
  title =        "Core Reliability: Leveraging Hardware Transactional
                 Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2791433",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Modern microprocessors are more vulnerable to
                 transient faults or soft errors than ever before due to
                 design trends mandating low supply voltage and reduced
                 noise margins, shrinking feature sizes and increased
                 transistor density for fast, low-power circuits. As
                 industry now supports Hardware Transactional Memory
                 (HTM), the features of HTM can be leveraged to add core
                 resiliency to transient errors. In this paper, we
                 propose a novel microarchitecture for transient error
                 detection and recovery based on time redundancy and
                 backward error recovery leveraging HTM's existing
                 features especially its rollback mechanism. We provide
                 implementation details for single-core reliability,
                 minimizing additions to existing HTM supports. We
                 evaluate the performance overheads of the single core
                 with the reliability feature by comparing it to the
                 base machine without the reliability feature. Finally
                 we show how single-core reliability can be extended to
                 multi-core reliability.",
  acknowledgement = ack-nhfb,
  affiliation =  "Do, SWS (Reprint Author), Univ Southern Calif, Dept
                 Elect Engn, EEB200, Elect Engn Bldg, Los Angeles, CA
                 90089 USA. Do, Sang Wook Stephen, Univ Southern Calif,
                 Dept Elect Engn, EEB200, Elect Engn Bldg, Los Angeles,
                 CA 90089 USA. Dubois, Michel, Univ Southern Calif, Dept
                 Elect Engn, EEB228, Elect Engn Bldg, Los Angeles, CA
                 90089 USA.",
  author-email = "sdo@usc.edu dubois@usc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-0954211]",
  funding-text = "The authors wish to thank Daniel Wong at UC Riverside
                 for advice on setting up the SPEC 2006 benchmark suite.
                 This material is based upon work supported by the
                 National Science Foundation under Grant CCF-0954211.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "backward error recovery; computer system organization;
                 core resiliency; design trends; Electrical engineering;
                 error detection; feature sizes; Fingerprint
                 recognition; Hardware; hardware transactional memory;
                 Hardware Transactional Memory; hardware transactional
                 memory; HTM; integrated circuit design; integrated
                 circuit reliability; low supply voltage; low-power
                 circuits; low-power electronics; memory architecture;
                 microprocessor chips; modern microprocessors; Multicore
                 processing; multicore reliability; noise margins;
                 performance and reliability; Registers; Reliability;
                 rollback mechanism; single-core reliability; soft
                 errors; time redundancy; Transient analysis; transient
                 error detection; transient error recovery; transient
                 faults; transistor density",
  keywords-plus = "TRANSIENT-FAULT RECOVERY; MULTIPROCESSORS;
                 CONSISTENCY; SUPPORT",
  number-of-cited-references = "30",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Do:2018:CRL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kaliorakis:2018:SAM,
  author =       "Manolis Kaliorakis and Athanasios Chatzidimitriou and
                 George Papadimitriou and Dimitris Gizopoulos",
  title =        "Statistical Analysis of Multicore {CPUs} Operation in
                 Scaled Voltage Conditions",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "109--112",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2798604",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Designers try to reduce the voltage margins of CPU
                 chips to gain energy without sacrificing reliable
                 operation. Statistical analysis methods are appealing
                 to predict the safe operational margins at the system
                 level as they do not induce area overheads and they can
                 be applied during manufacturing or after the chips'
                 release to the market. In this study, we present a
                 comprehensive statistical analysis of the behavior of
                 ARMv8 64-bit cores that are part of the enterprise
                 8-core X-Gene 2 micro-server family when they operate
                 in scaled voltage conditions. Our prediction schemes
                 that use real hardware counters as input are based on
                 linear regression models with several feature selection
                 techniques that aim to predict the safe voltage margins
                 of any given workload when the cores operate in scaled
                 conditions. Our findings show that our model is able to
                 accurately predict safe voltage margins that provide up
                 to 20.28\% power savings.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kaliorakis, M (Reprint Author), Univ Athens, Comp
                 Architecture Lab, Athens, Greece. Kaliorakis, Manolis;
                 Chatzidimitriou, Athanasios; Papadimitriou, George;
                 Gizopoulos, Dimitris, Univ Athens, Comp Architecture
                 Lab, Athens, Greece.",
  author-email = "manoliskal@di.uoa.gr achatz@di.uoa.gr
                 georgepap@di.uoa.gr dgizop@di.uoa.gr",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "H2020 Programme of the European Union
                 through the UniServer Project [688540]",
  funding-text = "This work is funded by the H2020 Programme of the
                 European Union through the UniServer Project (Grant
                 Agreement 688540).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "area overheads; ARMv8 cores; comprehensive statistical
                 analysis; Computational modeling; Computer crashes; CPU
                 chips; design margins; Energy-efficient computing;
                 enterprise 8-core X-Gene 2 microserver family; feature
                 selection; feature selection techniques; Hardware;
                 hardware counters; hardware reliability; Linear
                 regression; linear regression models; microprocessor
                 chips; multicore CPUs operation; multiprocessing
                 systems; power aware computing; power savings;
                 prediction schemes; Predictive models; regression
                 analysis; safe operational margins; safe voltage
                 margins; scaled voltage conditions; statistical
                 methods; system level; voltage margins; Voltage
                 measurement; word length 64 bit",
  keywords-plus = "NOISE",
  number-of-cited-references = "10",
  ORCID-numbers = "Gizopoulos, Dimitris/0000-0002-1613-9061
                 Chatzidimitriou, Athanasios/0000-0001-8161-7165",
  research-areas = "Computer Science",
  researcherid-numbers = "Gizopoulos, Dimitris/U-2731-2018",
  times-cited =  "2",
  unique-id =    "Kaliorakis:2018:SAM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Khoram:2018:AAA,
  author =       "Soroosh Khoram and Yue Zha and Jing Li",
  title =        "An Alternative Analytical Approach to Associative
                 Processing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "113--116",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2789424",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Associative Processing (AP) is a promising alternative
                 to the Von Neumann model as it addresses the memory
                 wall problem through its inherent in-memory
                 computations. However, because of the countless design
                 parameter choices, comparisons between implementations
                 of two so radically different models are challenging
                 for simulation-based methods. To tackle these
                 challenges, we develop an alternative analytical
                 approach based on a new concept called
                 architecturally-determined complexity. Using this
                 method, we asymptotically evaluate the
                 runtime/storage/energy bounds of the two models, i.e.,
                 AP and Von Neumann. We further apply the method to gain
                 more insights into the performance bottlenecks of
                 traditional AP and develop a new machine model named
                 Two Dimensional AP to address these limitations.
                 Finally, we experimentally validate our analytical
                 method and confirm that the simulation results match
                 our theoretical projections.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khoram, S (Reprint Author), Univ Wisconsin, Dept Elect
                 \& Comp Engn, 1415 Johnson Dr, Madison, WI 53706 USA.
                 Khoram, Soroosh; Zha, Yue; Li, Jing, Univ Wisconsin,
                 Dept Elect \& Comp Engn, 1415 Johnson Dr, Madison, WI
                 53706 USA.",
  author-email = "khoram@wisc.edu yzha.3@wisc.edu jli@ece.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "alternative analytical approach; analysis of
                 algorithms and problem complexity; analytical method;
                 Analytical models; architecturally-determined
                 complexity; associative processing; Associative
                 Processing; Associative processors; Complexity theory;
                 Computational modeling; Computer architecture;
                 content-addressable storage; countless design parameter
                 choices; in-memory computations; machine model; memory
                 wall problem; modeling techniques; models of
                 computation; Parallel processing; Runtime;
                 runtime-storage-energy bounds; simulation-based
                 methods; traditional AP; two dimensional AP; Two
                 dimensional displays; Von Neumann model",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Khoram:2018:AAA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Khatamifard:2018:MSD,
  author =       "S. Karen Khatamifard and M. Hassan Najafi and Ali
                 Ghoreyshi and Ulya R. Karpuzcu and David J. Lilja",
  title =        "On Memory System Design for Stochastic Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "117--121",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2804926",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Growing uncertainty in design parameters (and
                 therefore, in design functionality) renders stochastic
                 computing particularly promising, which represents and
                 processes data as quantized probabilities. However, due
                 to the difference in data representation, integrating
                 conventional memory (designed and optimized for
                 non-stochastic computing) in stochastic computing
                 systems inevitably incurs a significant data conversion
                 overhead. Barely any stochastic computing proposal
                 to-date covers the memory impact. In this paper, as the
                 first study of its kind to the best of our knowledge,
                 we rethink the memory system design for stochastic
                 computing. The result is a seamless stochastic system,
                 StochMem, which features analog memory to trade the
                 energy and area overhead of data conversion for
                 computation accuracy. In this manner StochMem can
                 reduce the energy (area) overhead by up-to 52.8\%
                 (93.7\%) at the cost of at most 0.7\% loss in
                 computation accuracy.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khatamifard, SK (Reprint Author), Univ Minnesota,
                 Minneapolis, MN 55455 USA. Khatamifard, S. Karen;
                 Najafi, M. Hassan; Ghoreyshi, Ali; Karpuzcu, Ulya R.;
                 Lilja, David J., Univ Minnesota, Minneapolis, MN 55455
                 USA.",
  author-email = "khatami@umn.edu najaf011@umn.edu ghore002@umn.edu
                 ukarpuzc@umn.edu lilja@umn.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [CCF-1408123, XPS-CCA-1438286]",
  funding-text = "This work was supported in part by US National Science
                 Foundation grant no. CCF-1408123 and XPS-CCA-1438286.
                 Any opinions, findings and conclusions or
                 recommendations expressed in this material are those of
                 the authors and do not necessarily reflect the views of
                 the National Science Foundation.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "analog memory; Analog memory;
                 computation accuracy; conventional memory; Data
                 conversion; data representation; design functionality;
                 design parameters; energy-efficient design; Image
                 processing; Image sensors; memory architecture; memory
                 impact; memory system design; near-sensor processing;
                 probability; seamless stochastic system; Sensors;
                 significant data conversion overhead; Stochastic
                 computing; stochastic computing proposal to-date;
                 stochastic computing systems; stochastic processes;
                 Stochastic systems; System analysis and design",
  keywords-plus = "COMPUTATION",
  number-of-cited-references = "16",
  ORCID-numbers = "Najafi, M. Hassan/0000-0002-4655-6229 Lilja,
                 David/0000-0003-3785-8206",
  research-areas = "Computer Science",
  researcherid-numbers = "Najafi, M. Hassan/I-2952-2019",
  times-cited =  "1",
  unique-id =    "Khatamifard:2018:MSD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Mouris:2018:TSB,
  author =       "Dimitris Mouris and Nektarios Georgios Tsoutsos and
                 Michail Maniatakos",
  title =        "{TERMinator} Suite: Benchmarking Privacy-Preserving
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "122--125",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2812814",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Security and privacy are fundamental objectives
                 characterizing contemporary cloud computing. Despite
                 the wide adoption of encryption for protecting data in
                 transit and at rest, data in use remains unencrypted
                 inside cloud processors and memories, as computation is
                 not applicable on encrypted values. This limitation
                 introduces security risks, as unencrypted values can be
                 leaked through side-channels or hardware Trojans. To
                 address this problem, encrypted architectures have
                 recently been proposed, which leverage homomorphic
                 encryption to natively process encrypted data using
                 datapaths of thousands of bits. In this case,
                 additional security protections are traded for higher
                 performance penalties, which drives the need for more
                 efficient architectures. In this work, we develop
                 benchmarks specifically tailored to homomorphic
                 computers, to enable comparisons across different
                 architectures. Our benchmark suite, dubbed TERMinator,
                 is unique as it avoids ``termination problems'' that
                 prohibit making control-flow decisions and evaluating
                 early termination conditions based on encrypted data,
                 as these can leak information. Contrary to generic
                 suites that ignore the fundamental challenges of
                 encrypted computation, our algorithms are tailored to
                 the security primitives of the target encrypted
                 architecture, such as the existence of branching
                 oracles. In our experiments, we compiled our benchmarks
                 for the Cryptoleq architecture and evaluated their
                 performance for a range of security parameters.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tsoutsos, NG (Reprint Author), NYU, New York, NY 10003
                 USA. Mouris, Dimitris, Univ Athens, GR-10679 Athens,
                 Greece. Tsoutsos, Nektarios Georgios; Maniatakos,
                 Michail, NYU, New York, NY 10003 USA.",
  author-email = "jimouris@di.uoa.gr nektarios.tsoutsos@nyu.edu
                 michail.maniatakos@nyu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NYU Abu Dhabi Global Ph.D. Student
                 Fellowship program",
  funding-text = "This work was partially sponsored by the NYU Abu Dhabi
                 Global Ph.D. Student Fellowship program. D. Mouris
                 thanks Orestis Polychroniou for the fruitful
                 discussions.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Benchmarks; Cloud computing; cloud
                 computing; cloud processors; Computer architecture;
                 control-flow decisions; cryptography; Cryptoleq
                 architecture; data privacy; dubbed TERMinator;
                 encrypted architectures; encrypted computation;
                 encrypted data; encrypted values; Encryption; hardware
                 Trojans; higher performance penalties; homomorphic
                 computers; homomorphic encryption; leakage prevention;
                 performance evaluation; privacy-preserving architecture
                 benchmarking; Program processors; security parameters;
                 security protections; security risks; target encrypted
                 architecture; termination problem; TERMinator suite;
                 unencrypted values",
  number-of-cited-references = "14",
  ORCID-numbers = "Maniatakos, Michail/0000-0001-6899-0651",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Mouris:2018:TSB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Choukse:2018:CEM,
  author =       "Esha Choukse and Mattan Erez and Alaa Alameldeen",
  title =        "{CompressPoints}: an Evaluation Methodology for
                 Compressed Memory Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "126--129",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2821163",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Current memory technology has hit a wall trying to
                 scale to meet the increasing demands of modern client
                 and datacenter systems. Data compression is a promising
                 solution to this problem. Several compressed memory
                 systems have been proposed in the past years [1], [2],
                 [3], [4]. Unfortunately, a reasonable methodology to
                 evaluate these systems is missing. In this paper, we
                 identify the challenges for evaluating main memory
                 compression. We propose an effective methodology to
                 evaluate a compressed memory system by proposing
                 mechanisms to: (i) incorporate correct virtual address
                 translation, (ii) choose a region in the application
                 that is representative of the compression ratio, in
                 addition to regular metrics like IPC and cache hit
                 rates, and (iii) choose a representative region for
                 multi-core workloads, bringing down the correlation
                 error from 12.8 to 3.8 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Choukse, E (Reprint Author), Univ Texas Austin,
                 Austin, TX 78712 USA. Choukse, Esha; Erez, Mattan, Univ
                 Texas Austin, Austin, TX 78712 USA. Alameldeen, Alaa,
                 Intel Labs, Santa Clara, CA 95054 USA.",
  author-email = "esha.choukse@utexas.edu mattan.erez@utexas.edu
                 alaa.r.alameldeen@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; cache storage; compressed memory;
                 compressed memory system; Compression; compression
                 ratio; Computational modeling; computer centres;
                 Correlation; current memory technology; data
                 compression; datacenter systems; DRAM; evaluation;
                 evaluation methodology; Hardware; Linux; main memory
                 compression; Measurement; memory; memory architecture;
                 Memory management; methodology; modern client;
                 multi-core; multicore workloads; representative
                 regions; storage management; translation; workloads",
  number-of-cited-references = "8",
  ORCID-numbers = "Choukse, Esha/0000-0003-0371-5522",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Choukse:2018:CEM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2018:ZRV,
  author =       "Seikwon Kim and Wonsang Kwak and Changdae Kim and
                 Jaehyuk Huh",
  title =        "{Zebra} Refresh: Value Transformation for Zero-Aware
                 {DRAM} Refresh Reduction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "130--133",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2822808",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Refresh operations consume growing portions of DRAM
                 power with increasing DRAM capacity. To reduce the
                 power consumption of such refresh operations, this
                 paper proposes a novel value-aware refresh reduction
                 technique exploiting the abundance of zero values in
                 the memory contents. The proposed Zebra refresh
                 architecture transforms the value and mapping of DRAM
                 data to increase consecutive zero values, and skips a
                 refresh operation for a row containing zero values
                 entirely. Zebra converts memory blocks to base and
                 delta values, inspired by a prior compression
                 technique. Once values are converted, bits are
                 transposed to place consecutive zeros matching the
                 refresh granularity. The experimental results show
                 Zebra refresh can reduce DRAM refresh operations by 43
                 percent on average for a set of benchmark
                 applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Huh, J (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Sch Comp, Daejeon 34141, South Korea. Kim,
                 Seikwon; Kwak, Wonsang; Kim, Changdae; Huh, Jaehyuk,
                 Korea Adv Inst Sci \& Technol, Sch Comp, Daejeon 34141,
                 South Korea. Kim, Seikwon, Samsung Elect Co Ltd,
                 Samsung Res, Suwon 443803, Gyeonggi Do, South Korea.",
  author-email = "seikwon@calab.kaist.ac.kr wskwak@calab.kaist.ac.kr
                 cdkim@calab.kaist.ac.kr jhuh@calab.kaist.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Research Foundation of Korea
                 [NRF-2016R1A2B4013352]; Institute for Information \&
                 communications Technology Promotion [IITP-2017-000466];
                 Ministry of Science and ICT, Korea",
  funding-text = "This work is supported by the National Research
                 Foundation of Korea (NRF-2016R1A2B4013352) and by the
                 Institute for Information \& communications Technology
                 Promotion (IITP-2017-000466). Both grants are funded by
                 the Ministry of Science and ICT, Korea.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; data compression; data content
                 conversion; data reduction; DRAM chips; DRAM data; DRAM
                 energy; DRAM power; DRAM refresh; DRAM refresh
                 operations; memory contents; Memory management;
                 Microprocessors; power aware computing; power
                 consumption; Power demand; Random access memory;
                 refresh granularity; Transforms; value transformation;
                 value-aware refresh reduction; Zebra refresh
                 architecture; zero values; zero-aware DRAM refresh
                 reduction",
  keywords-plus = "ENERGY",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2018:ZRV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kwon:2018:CMC,
  author =       "Youngeun Kwon and Minsoo Rhu",
  title =        "A Case for Memory-Centric {HPC} System Architecture
                 for Training Deep Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "134--138",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2823302",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As the models and the datasets to train deep learning
                 (DL) models scale, system architects are faced with new
                 challenges, one of which is the memory capacity
                 bottleneck, where the limited physical memory inside
                 the accelerator device constrains the algorithm that
                 can be studied. We propose a memory-centric deep
                 learning system that can transparently expand the
                 memory capacity accessible to the accelerators while
                 also providing fast inter-device communication for
                 parallel training. Our proposal aggregates a pool of
                 memory modules locally within the device-side
                 interconnect, which are decoupled from the host
                 interface and function as a vehicle for transparent
                 memory capacity expansion. Compared to conventional
                  systems, our proposal achieves an average 2.1x
                 speedup on eight DL applications and increases the
                 system-wide memory capacity to tens of TBs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rhu, M (Reprint Author), Pohang Univ Sci \& Technol,
                 Pohang 790784, Gyeongsangbuk Do, South Korea. Kwon,
                 Youngeun; Rhu, Minsoo, Pohang Univ Sci \& Technol,
                 Pohang 790784, Gyeongsangbuk Do, South Korea.",
  author-email = "kyeg9404@gmail.com minsoo.rhu@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Samsung Research Funding Center of Samsung
                 Electronics [SRFC-TB1703-03]",
  funding-text = "This work was supported by Samsung Research Funding
                 Center of Samsung Electronics under Project Number
                 SRFC-TB1703-03.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer architecture; conventional
                 systems; deep learning; deep learning models scale;
                 device-side interconnect; fast inter-device
                 communication; Graphics processing units; hardware
                 acceleration; learning (artificial intelligence);
                 Machine learning; memory architecture; memory capacity
                 bottleneck; memory modules; memory-centric deep
                 learning system; memory-centric HPC system
                 architecture; neural nets; neural network; parallel
                 processing; parallel training; Performance evaluation;
                 shared memory systems; storage management; system
                 architects; system architecture; system-wide memory
                 capacity; Systems architecture; Training; training deep
                 neural networks; transparent memory capacity expansion;
                 Virtualization",
  keywords-plus = "DESIGN",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  researcherid-numbers = "Rhu, Minsoo/O-6167-2018",
  times-cited =  "0",
  unique-id =    "Kwon:2018:CMC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Ipek:2018:BLL,
  author =       "Engin Ipek and Florian Longnos and Shihai Xiao and Wei
                 Yang",
  title =        "Bit-Level Load Balancing: a New Technique for
                 Improving the Write Throughput of Deeply Scaled
                 {STT-MRAM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "139--142",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2819979",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Emerging non-volatile memories (NVMs) have drawn
                 significant attention as potential DRAM replacements.
                 STT-MRAM is one of the most promising NVMs due to its
                 relatively low write energy, high speed, and high
                 endurance. However, STT-MRAM suffers from its own
                 scaling problems. As the size of the access transistor
                 is decreased to reduce the cell area, the magnitude of
                 the switching current that is supplied to the storage
                 element decreases. The reduced switching current
                 significantly lengthens the switching time, which makes
                 write throughput a significant performance bottleneck
                 for a memory system constructed from dense STT-MRAM
                 cells. We introduce bit-level load balancing, a new
                 technique that mitigates the performance overhead of
                 limited write throughput in high-density, STT-MRAM
                 based main memories. Bit-level load balancing takes
                 advantage of the observation that many of the bits
                 within a row of STT-MRAM remain unchanged when
                 performing a write. The key idea is to architect the
                 memory system such that different columns of different
                 rows can be simultaneously written to an STT-MRAM
                 subarray. By interleaving in time the bit updates from
                 multiple writes, bit level load balancing improves
                 average system performance by 19 percent, and comes
                 within 6 percent of the performance of a DRAM based
                 system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ipek, E (Reprint Author), Univ Rochester, Dept Comp
                 Sci, CSB Room 422, Rochester, NY 14627 USA. Ipek, E
                 (Reprint Author), Univ Rochester, Dept Elect \& Comp
                 Engn, CSB Room 422, Rochester, NY 14627 USA. Ipek,
                 Engin, Univ Rochester, Dept Comp Sci, CSB Room 422,
                 Rochester, NY 14627 USA. Ipek, Engin, Univ Rochester,
                 Dept Elect \& Comp Engn, CSB Room 422, Rochester, NY
                 14627 USA. Longnos, Florian; Xiao, Shihai; Yang, Wei,
                 Huawei Technol Co Ltd, Shenzhen 115371, Guangdong,
                 Peoples R China.",
  author-email = "ipek@cs.rochester.edu florian.longnos@huawei.com
                 xiaoshihai@huawei.com william.yangwei@huawei.com",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bit level load balancing; bit-level load balancing;
                 Computer architecture; deeply scaled STT-MRAM; dense
                 STT-MRAM cells; DRAM chips; Load management; memory
                 system; memory systems; Microprocessors; MRAM devices;
                 non-volatile memories; nonvolatile memories; NVMs;
                 performance bottleneck; Random access memory; resource
                 allocation; STT-MRAM; STT-MRAM based main memories;
                 STT-MRAM subarray; Switches; Throughput; Transistors;
                 write throughput",
  keywords-plus = "PERFORMANCE; DESIGN; ENERGY",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Ipek:2018:BLL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Iliakis:2018:DMS,
  author =       "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
                 Soudris",
  title =        "Decoupled {MapReduce} for Shared-Memory Multi-Core
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "143--146",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2827929",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Modern multi-core processors exhibit high integration
                 densities, e.g., up to several tens of cores. Multiple
                 programming frameworks have emerged to facilitate the
                 development of highly parallel applications. The
                 MapReduce programming model, after having demonstrated
                 its usability in the area of distributed computing
                 systems, has been adapted to the needs of shared-memory
                 multi-processors showing promising results in
                 comparison with conventional multi-threaded libraries,
                 e.g., pthreads. In this paper we enhance the
                 traditional MapReduce architecture by decoupling the
                 map and combine phases in order to boost parallel
                 execution. We show that combiners' memory intensive
                 features limit the system's degree of parallelism, thus
                 resulting in sub-optimal hardware utilization, leaving
                 space for further performance improvements. The
                 proposed decoupled MapReduce architecture is evaluated
                 into a NUMA server platform, showing that the adoption
                 of the De-MapR runtime enables more efficient hardware
                 utilization and competent run-time improvements. We
                 demonstrate that the proposed solution achieves
                 execution speedups of up to 2.46x compared to a
                 state-of-the-art, shared-memory MapReduce library.",
  acknowledgement = ack-nhfb,
  affiliation =  "Iliakis, K (Reprint Author), Natl Tech Univ Athens,
                 Zografos 15780, Greece. Iliakis, Konstantinos; Xydis,
                 Sotirios; Soudris, Dimitrios, Natl Tech Univ Athens,
                 Zografos 15780, Greece.",
  author-email = "konstantinos.iliakis@cern.ch sxydis@microlab.ntua.gr
                 dsoudris@microlab.ntua.gr",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Containers; decoupled MapReduce
                 architecture; distributed computing systems; hardware
                 utilization; highly parallel applications; Instruction
                 sets; Libraries; MapReduce; MapReduce programming
                 model; modern multicore processors; multi-cores;
                 multiple programming frameworks; parallel
                 architectures; parallel execution; Parallel processing;
                 parallel programming; Runtime; runtime systems; shared
                 memory systems; shared-memory MapReduce library;
                 shared-memory multicore architectures; shared-memory
                 multiprocessors; sub-optimal hardware utilization; Task
                 analysis",
  number-of-cited-references = "13",
  ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
  times-cited =  "0",
  unique-id =    "Iliakis:2018:DMS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Li:2018:BSB,
  author =       "Zhaoshi Li and Leibo Liu and Yangdong Deng and Shouyi
                 Yin and Shaojun Wei",
  title =        "Breaking the Synchronization Bottleneck with
                 Reconfigurable Transactional Execution",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "147--150",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2828402",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The advent of FPGA-based hybrid architecture offers
                 the opportunity of customizing memory subsystems to
                 enhance the overall system performance. However, it is
                 not straightforward to design efficient FPGA circuits
                 for emerging FPGAs applications such as in-memory
                 database and graph analytics, which heavily depend on
                 concurrent data structures (CDS'). Highly dynamic
                 behaviors of CDS' have to be orchestrated by
                 synchronization primitives for correct execution. These
                 primitives induce overwhelming memory traffic for
                 synchronizations on FPGAs. This paper proposes a novel
                 method for systematically exploring and exploiting
                 memory-level parallelism (MLP) of CDS by transactional
                 execution on FPGAs. Inspired by the idea that semantics
                 of transactions can be implemented in a more efficient
                 and scalable manner on FPGAs than on CPUs, we propose a
                 transaction-based reconfigurable runtime system for
                 capturing MLP of CDS'. Experiments on linked-list and
                 skip-list show our approach achieves 5.18x and 1.55x
                 throughput improvement on average than lock-based FPGA
                 implementations and optimized CDS algorithms on a
                 state-of-the-art multi-core CPU respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, LB (Reprint Author), Tsinghua Univ, Natl Lab
                 Informat Sci \& Technol, Beijing 100084, Peoples R
                 China. Li, Zhaoshi; Liu, Leibo; Deng, Yangdong; Yin,
                 Shouyi; Wei, Shaojun, Tsinghua Univ, Natl Lab Informat
                 Sci \& Technol, Beijing 100084, Peoples R China.",
  author-email = "li-zs12@mail.tsinghua.edu.cn liulb@tsinghua.edu.cn
                 dengyd@tsinghua.edu.cn yinsy@tsinghua.edu.cn
                 wsj@tsinghua.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Natural Science Foundation of
                 China [61672317]; National Science Technology Major
                 Project [2016ZX01012101]",
  funding-text = "This work was supported in part by National Natural
                 Science Foundation of China (No. 61672317) and National
                 Science Technology Major Project (No.
                 2016ZX01012101).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "concurrent data structures; data structures; Data
                 structures; data structures; efficient FPGA circuits;
                 field programmable gate arrays; Field programmable gate
                 arrays; FPGA-based hybrid architecture; graph
                 analytics; heterogeneous systems; highly dynamic
                 behaviors; in-memory database; Instruction sets; memory
                 subsystems; memory traffic; memory-level parallelism;
                 MLP; multicore CPU; optimized CDS algorithms; parallel
                 architectures; Programming; Reconfigurable hardware;
                 reconfigurable transactional execution; Semantics;
                 synchronisation; Synchronization; synchronization
                 bottleneck; synchronization primitives; system
                 performance enhancement; Throughput; transaction-based
                 reconfigurable runtime system",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Li:2018:BSB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Ipek:2018:VWC,
  author =       "Engin Ipek and Florian Longnos and Shihai Xiao and Wei
                 Yang",
  title =        "Vertical Writes: Closing the Throughput Gap between
                 Deeply Scaled {STT-MRAM} and {DRAM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "151--154",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2820027",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "STT-MRAM is a second generation MRAM technology that
                 addresses many of the scaling problems of earlier
                 generation magnetic RAMs, and is a promising candidate
                 to replace DRAM due to its high operational speed,
                 scalable energy characteristics, and high write
                 endurance. However, making the density of STT-MRAM
                 competitive with that of DRAM while maintaining
                 DRAM-like write throughput has proven challenging.
                 Reducing the area of an STT-MRAM cell requires
                 decreasing the width of the cell access transistor,
                 which lowers the magnitude of the switching current
                 supplied to the storage element during writes, and
                 significantly hampers the switching speed.
                 Consequently, write throughput constitutes a
                 fundamental performance bottleneck for memory systems
                 built from deeply scaled, dense STT-MRAM cells. This
                 paper introduces vertical writes, a new technique that
                 improves the write throughput of memory systems built
                 from high-density STT-MRAM. Vertical writes exploit the
                 observation that once the switching voltage has been
                 applied across the bit lines and source lines in an
                 STT-MRAM array, it is possible to initiate the write
                 operation for additional cells that are attached to the
                 same column by simply turning on the corresponding word
                 lines. By leveraging the ability to write a 0 or a 1 to
                 multiple cells at once, vertical writes improve average
                 system performance by 21 percent, and enable an
                 STT-MRAM based system to come within 5 percent of the
                 performance of a DRAM based system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ipek, E (Reprint Author), Univ Rochester, Rochester,
                 NY 14627 USA. Ipek, Engin, Univ Rochester, Rochester,
                 NY 14627 USA. Longnos, Florian; Xiao, Shihai; Yang,
                 Wei, Huawei Technol Co Ltd, Shenzhen 518129, Guangdong,
                 Peoples R China.",
  author-email = "ipek@cs.rochester.edu florian.longnos@huawei.com
                 xiaoshihai@huawei.com william.yangwei@huawei.com",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cell access transistor; Computer architecture;
                 Decoding; deeply scaled STT-MRAM cells; dense STT-MRAM
                 cells; DRAM based system; DRAM chips; DRAM-like write
                 throughput; earlier generation magnetic RAMs;
                 generation MRAM technology; high operational speed;
                 high write endurance; high-density STT-MRAM; magnetic
                 tunnelling; Memory systems; memory systems; Memory
                 systems; Microprocessors; MRAM devices; non-volatile
                 memories; Random access memory; random-access storage;
                 scalable energy characteristics; STT-MRAM; STT-MRAM
                 array; STT-MRAM based system; Switches; switching
                 current; switching speed; Throughput; throughput gap;
                 write operation; Writing",
  keywords-plus = "PERFORMANCE; DESIGN",
  number-of-cited-references = "23",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Ipek:2018:VWC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Gan:2018:AIC,
  author =       "Yu Gan and Christina Delimitrou",
  title =        "The Architectural Implications of Cloud
                 Microservices",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "155--158",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2839189",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cloud services have recently undergone a shift from
                 monolithic applications to microservices, with hundreds
                 or thousands of loosely-coupled microservices
                 comprising the end-to-end application. Microservices
                 present both opportunities and challenges when
                 optimizing for quality of service (QoS) and cloud
                 utilization. In this paper we explore the implications
                 cloud microservices have on system bottlenecks, and
                 datacenter server design. We first present and
                 characterize an end-to-end application built using tens
                 of popular open-source microservices that implements a
                 movie renting and streaming service, and is modular and
                 extensible. We then use the end-to-end service to study
                 the scalability and performance bottlenecks of
                 microservices, and highlight implications they have on
                 the design of datacenter hardware. Specifically, we
                 revisit the long-standing debate of brawny versus wimpy
                 cores in the context of microservices, we quantify the
                 I-cache pressure they introduce, and measure the time
                 spent in computation versus communication between
                 microservices over RPCs. As more cloud applications
                 switch to this new programming model, it is
                 increasingly important to revisit the assumptions we
                 have previously used to build and manage cloud
                 systems.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delimitrou, C (Reprint Author), Cornell Univ, Ithaca,
                 NY 14850 USA. Gan, Yu; Delimitrou, Christina, Cornell
                 Univ, Ithaca, NY 14850 USA.",
  author-email = "lyg397@cornell.edu delimitrou@cornell.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application studies resulting in better
                 multiple-processor systems; architectural implications;
                 Cloud computing; cloud computing; cloud microservices;
                 cloud utilization; computer centres; datacenter server
                 design; distributed applications; Electric breakdown;
                 end-to-end service; Motion pictures; movie renting;
                 Open source software; open-source microservices; power
                 aware computing; QoS; quality of service; Quality of
                 service; quality of service; Servers; streaming
                 service; Super (very large) computers; Videos",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Gan:2018:AIC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Shwartz:2018:DMI,
  author =       "Ofir Shwartz and Yitzhak Birk",
  title =        "Distributed Memory Integrity Trees",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "159--162",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2822705",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Ensuring the correct execution of a program running on
                 untrusted computing platforms, wherein the OS,
                 hypervisor, and all off-CPU-chip hardware, including
                 memory, are untrusted, (also) requires protecting the
                 integrity of the memory content against replay attacks.
                 This requires dedicated tracking structures and in-chip
                 state storage. For this purpose, integrity trees are
                 used in various forms, varying in complexity, size, and
                 performance; yet, existing integrity trees do not
                 address distributed, shared-memory computations, for
                 which one must also ensure the integrity of the
                 coherence state of the memory. Observing that a block
                 not residing at a given node merely needs to be known
                 by that node as such, we present the novel Distributed
                 Integrity Tree (DIT) method, and show that it can be
                 used effectively to extend existing integrity trees to
                 parallel and distributed environments. Using DIT, we
                 constructed a Distributed Merkle Tree, a Distributed
                 Bonsai Merkle Tree, and a distributed Intel SGX's
                 Memory Encryption Engine integrity mechanism. All these
                 extensions entail negligible overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Shwartz, O (Reprint Author), Technion, Elect Engn
                 Dept, IL-3200003 Haifa, Israel. Shwartz, Ofir; Birk,
                 Yitzhak, Technion, Elect Engn Dept, IL-3200003 Haifa,
                 Israel.",
  author-email = "ofirshw@tx.technion.ac.il birk@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Hasso Plattner Institute",
  funding-text = "This work was supported in part by the Hasso Plattner
                 Institute.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "coherence state; computer security; correct execution;
                 cryptography; data integrity; Data transfer;
                 distributed Bonsai Merkle tree; Distributed computing;
                 Distributed databases; distributed environment;
                 distributed integrity tree method; distributed Intel
                 SGX's Memory Encryption Engine integrity mechanism;
                 distributed memory integrity; Encryption; hypervisor;
                 in-chip state storage; integrity tree; memory content;
                 Memory management; Metadata; off-CPU-chip hardware;
                 operating systems (computers); parallel environment;
                 parallel processing; shared memory; shared memory
                 systems; shared-memory computations; trees
                 (mathematics); trusted computing; untrusted computing
                 platforms",
  keywords-plus = "PERFORMANCE",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Shwartz:2018:DMI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yun:2018:RPP,
  author =       "Ji-Tae Yun and Su-Kyung Yoon and Jeong-Geun Kim and
                 Bernd Burgstaller and Shin-Dug Kim",
  title =        "Regression Prefetcher with Preprocessing for
                 {DRAM--PCM} Hybrid Main Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "163--166",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2841835",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This research is to design an effective hybrid main
                 memory structure for graph processing applications,
                 because it is quite expensive to use only high-speed
                 DRAM for such applications. Thus, we propose a DRAM-PCM
                 hybrid main memory structure to reduce the cost and
                 energy consumption and design regression prefetch
                 scheme to cope with irregular access patterns in large
                 graph processing workloads. In addition, the prefetch
                 includes preprocessing algorithm to maximize
                 prefetching performance. Our experimental evaluation
                 shows a performance improvement of 36 percent over a
                 conventional DRAM model, 15 percent over existing
                 prefetch models such as GHB/PC, SMS, and AMPM, and 6
                 percent over the latest model.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, SD (Reprint Author), Yonsei Univ, Dept Comp Sci,
                 Seoul 03722, South Korea. Yun, Ji-Tae; Yoon, Su-Kyung;
                 Kim, Jeong-Geun; Burgstaller, Bernd; Kim, Shin-Dug,
                 Yonsei Univ, Dept Comp Sci, Seoul 03722, South Korea.",
  author-email = "jty11@yonsei.ac.kr sk.yoon@yonsei.ac.kr
                 junggeun@yonsei.ac.kr bburg@yonsei.ac.kr
                 sdkim@yonsei.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Next Generation Information Computing
                 Development Program through the National Research
                 Foundation of Korea (NRF) --- Ministry of Science, ICT
                 \& Future Planning [NRF-2015M3C4A7065522]; Samsung
                 Electronics; Yonsei University",
  funding-text = "This research was partially supported by the Next
                 Generation Information Computing Development Program
                 through the National Research Foundation of Korea (NRF)
                 funded by the Ministry of Science, ICT \& Future
                 Planning (NRF-2015M3C4A7065522) and by an
                 Industry-Academy joint research program between Samsung
                 Electronics and Yonsei University.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "buffer management; conventional DRAM model; cost
                 reduction; design regression prefetch scheme; DRAM
                 chips; effective hybrid main memory structure; energy
                 consumption reduction; Engines; graph processing
                 applications; graph theory; high-speed DRAM; irregular
                 access patterns; large graph processing workloads; Load
                 modeling; machine learning; main memory; Memory
                 management; PCM; Phase change materials; phase change
                 memories; prefetch models; Prefetching; prefetching
                 performance; preprocessing algorithm; Random access
                 memory; storage management; Training data",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Yun:2018:RPP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhang:2018:RFA,
  author =       "Jiangwei Zhang and Donald {Kline, Jr.} and Long Fang
                 and Rami Melhem and Alex K. Jones",
  title =        "{RETROFIT}: Fault-Aware Wear Leveling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "167--170",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2840137",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Phase-change memory (PCM) and resistive memory (RRAM)
                 are promising alternatives to traditional memory
                 technologies. However, both PCM and RRAM suffer from
                 limited write endurance and due to process variation
                 from scaling, increasing number of early cell failures
                 continue to put pressure on wear-leveling and fault
                 tolerance techniques. In this paper, we propose
                 RETROFIT, which leverages the spare ``gap'' row used as
                 temporary storage in wear leveling to also be used
                 strategically to guard against early cell wear out.
                 RETROFIT is compatible with error correction schemes
                 targeted at mitigating stuck-at faults and provides
                 benefits when single or multiple spare rows are
                 available. RETROFIT enhances lifetime by as much as 107
                 percent over traditional gap-based wear leveling and 8
                 percent over perfectly uniform wear leveling with a
                 similar overhead. Furthermore, RETROFIT scales better
                 than wear-leveling combined with error correction as
                 process variation increases.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhang, JW (Reprint Author), Natl Univ Def Technol,
                 Changsha 410073, Hunan, Peoples R China. Zhang, JW
                 (Reprint Author), Univ Pittsburgh, ECE Dept,
                 Pittsburgh, PA 15261 USA. Zhang, Jiangwei; Fang, Long,
                 Natl Univ Def Technol, Changsha 410073, Hunan, Peoples
                 R China. Zhang, Jiangwei; Fang, Long, Univ Pittsburgh,
                 ECE Dept, Pittsburgh, PA 15261 USA. Melhem, Rami, Univ
                 Pittsburgh, CS Dept, Pittsburgh, PA 15260 USA.",
  author-email = "jiz148@pitt.edu dek61@pitt.edu lfang@nudt.edu.cn
                 melhem@cs.pitt.edu akjones@pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Aging; and fault-tolerance; Computer architecture;
                 early cell failures; early cell wear; Emerging
                 memories; error correction; Error correction; Error
                 correction codes; fault tolerance; fault tolerance
                 techniques; fault-aware wear leveling; fault-tolerance;
                 multiple spare rows; PCM; perfectly uniform wear
                 leveling; Phase change materials; phase change
                 memories; process variation; Random access memory;
                 random-access storage; Registers; reliability;
                 resistive memory; RETROFIT scales; RRAM; single rows;
                 spare gap row; traditional memory technologies; wear;
                 wear-leveling",
  number-of-cited-references = "15",
  ORCID-numbers = "Kline, Jr, Donald/0000-0002-4414-1513",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Zhang:2018:RFA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kulkarni:2018:LAI,
  author =       "Neeraj Kulkarni and Feng Qi and Christina Delimitrou",
  title =        "Leveraging Approximation to Improve Datacenter
                 Resource Efficiency",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "171--174",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2845841",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cloud multi-tenancy is typically constrained to a
                 single interactive service colocated with one or more
                 batch, low-priority services, whose performance can be
                 sacrificed. Approximate computing applications offer
                 the opportunity to enable tighter colocation among
                 multiple applications whose performance is important.
                 We present Pliant, a lightweight cloud runtime that
                 leverages the ability of approximate computing
                 applications to tolerate some loss in output quality to
                 boost the utilization of shared servers. During periods
                 of high contention, Pliant employs incremental and
                 interference-aware approximation to reduce interference
                 in shared resources. We evaluate Pliant across
                 different approximate applications, and show that it
                 preserves QoS for all co-scheduled workloads, while
                 incurring at most a 5 percent loss in output quality.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delimitrou, C (Reprint Author), Cornell Univ, Ithaca,
                 NY 14850 USA. Kulkarni, Neeraj; Qi, Feng; Delimitrou,
                 Christina, Cornell Univ, Ithaca, NY 14850 USA.",
  author-email = "nsk49@cornell.edu fq26@cornell.edu
                 delimitrou@cornell.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Approximate computing; approximate computing
                 applications; cloud computing; Cloud computing; cloud
                 multitenancy; co-scheduled workloads; computer centres;
                 datacenter resource efficiency; Interference;
                 interference-aware approximation; lightweight cloud
                 runtime; low-priority services; Monitoring; Pliant;
                 QoS; quality of service; Quality of service; Runtime;
                 scheduling; scheduling and task partitioning; shared
                 resources; single interactive service; Super (very
                 large) computers; support for dynamic compilation;
                 Switches",
  keywords-plus = "ACCURACY-AWARE OPTIMIZATION; PROGRAMS",
  number-of-cited-references = "20",
  ORCID-numbers = "Qi, Feng/0000-0002-0759-5268 Kulkarni,
                 Neeraj/0000-0003-0768-0187",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kulkarni:2018:LAI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{AlBarakat:2018:MFM,
  author =       "Laith M. AlBarakat and Paul V. Gratz and Daniel A.
                 Jim{\'e}nez",
  title =        "{MTB-Fetch}: Multithreading Aware Hardware Prefetching
                 for Chip Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "175--178",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2847345",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To fully exploit the scaling performance in Chip
                 Multiprocessors, applications must be divided into
                 semi-independent processes that can run concurrently on
                 multiple cores within a system. One major class of such
                 applications, shared-memory, multi-threaded
                 applications, requires programmers insert thread
                 synchronization primitives (i.e., locks, barriers, and
                 condition variables) in their critical sections to
                 synchronize data access between processes. For this
                 class of applications, scaling performance requires
                 balanced per-thread workloads with little time spent in
                 critical sections. In practice, however, threads often
                 waste significant time waiting to acquire
                 locks/barriers in their critical sections, leading to
                 thread imbalance and poor performance scaling.
                 Moreover, critical sections often stall data
                 prefetchers that mitigate the effects of long critical
                 section stalls by ensuring data is preloaded in the
                 core caches when the critical section is complete. In
                 this paper we examine a pure hardware technique to
                 enable safe data prefetching beyond synchronization
                 points in CMPs. We show that successful prefetching
                 beyond synchronization points requires overcoming two
                 significant challenges in existing prefetching
                 techniques. First, we find that typical data
                 prefetchers are designed to trigger prefetches based on
                  current misses. This approach works well for
                 traditional, continuously executing, single-threaded
                 applications. However, when a thread stalls on a
                 synchronization point, it typically does not produce
                 any new memory references to trigger a prefetcher.
                 Second, even in the event that a prefetch were to be
                 correctly directed to read beyond a synchronization
                 point, it will likely prefetch shared data from another
                 core before this data has been written. While this
                 prefetch would be considered ``accurate'' it is highly
                 undesirable, because such a prefetch would lead to
                 three extra ``ping-pong'' movements back and forth
                 between private caches in the producing and consuming
                 cores, incurring more latency and energy overhead than
                 without prefetching. We develop a new data prefetcher,
                  Multi-Thread B-Fetch (MTB-Fetch), built as an extension
                  to a previous single-threaded data prefetcher. MTB-Fetch
                 addresses both issues in prefetching for shared memory
                 multi-threaded workloads. MTB-Fetch achieves a speedup
                 of 9.3 percent for multi-threaded applications with
                 little additional hardware.",
  acknowledgement = ack-nhfb,
  affiliation =  "AlBarakat, LM (Reprint Author), Texas A\&M Univ, Dept
                 Elect \& Comp Engn, College Stn, TX 77843 USA.
                 AlBarakat, Laith M.; Gratz, Paul, V, Texas A\&M Univ,
                 Dept Elect \& Comp Engn, College Stn, TX 77843 USA.
                 Jimenez, Daniel A., Texas A\&M Univ, Dept Comp Sci \&
                 Engn, College Stn, TX 77843 USA.",
  author-email = "lalbarakat@tamu.edu pgratz@tamu.edu
                 djimenez@cse.tamu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation
                 [I/UCRC-1439722, CCF-1649242, CCF-1216604/1332598];
                 Intel Corp.",
  funding-text = "We thank the National Science Foundation, which
                 partially supported this work through grants
                 I/UCRC-1439722, CCF-1649242 and CCF-1216604/1332598 and
                 Intel Corp. for their generous support.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; Chip multiprocessor; Chip
                 Multiprocessors; CMPs; core caches; data access
                 synchronization; energy overhead; Hardware; hardware
                 prefetching; long critical section stalls;
                 microprocessor chips; MTB-Fetch; multi-threading;
                 Multicore processing; multiple cores; multithread
                 B-fetch; multithreading aware hardware prefetching;
                 per-thread workloads; poor performance scaling;
                 Prefetching; prefetching techniques; private caches;
                 pure hardware technique; Scalability; scaling
                 performance; semiindependent processes; shared memory;
                 shared memory multithreaded workloads; shared memory
                 systems; single-threaded applications; single-threaded
                 data prefetcher; storage management; synchronisation;
                 Synchronization; synchronization point; thread
                 imbalance; thread synchronization primitives; typical
                 data prefetchers",
  keywords-plus = "PROCESSORS",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "AlBarakat:2018:MFM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Vijayaraghavan:2018:MBA,
  author =       "Thiruvengadam Vijayaraghavan and Amit Rajesh and
                 Karthikeyan Sankaralingam",
  title =        "{MPU--BWM}: Accelerating Sequence Alignment",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "179--182",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2849064",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "DNA sequencing and assembly spans life-altering
                 applications like disease diagnosis to answering
                 questions about our ancestry. Sequencing involves
                 state-of-the-art machines generating nucleic acid
                  sequences (AGCT) from wet samples like blood or saliva,
                 followed by aligning these sequences against known
                 reference sequences. Due to the rapid advancement in
                 sequence generation machines relative to Moore's law,
                 the second step (alignment) has now become the
                 bottleneck. Today's state-of-the-art technology for
                 alignment runs software like BWA-MEM on a cluster of
                 high performance general purpose machines that cannot
                 keep up with the rapid rate of data generated by each
                 new generation of sequencer machines. Recent proposals
                 from academia that claim orders of magnitude alignment
                 speedup come at a cost of significant disruption to the
                 hardware and software currently in use in the industry.
                 In this work, we propose MPU-BWM, a hardware-software
                 solution that achieves orders of magnitude speedup (57
                 x over single core x86) on the state-of-the-art BWA-MEM
                 algorithm, with non-intrusive integration to existing
                 processing clusters and with minimal modifications to
                 the BWA-MEM software.",
  acknowledgement = ack-nhfb,
  affiliation =  "Vijayaraghavan, T (Reprint Author), SimpleMachines
                 Inc, Madison, WI 53719 USA. Vijayaraghavan,
                 Thiruvengadam; Sankaralingam, Karthikeyan,
                 SimpleMachines Inc, Madison, WI 53719 USA. Rajesh,
                 Amit, James Madison Mem High Sch, Madison, WI 53717
                 USA. Sankaralingam, Karthikeyan, Univ Wisconsin,
                 Madison, WI 53706 USA.",
  author-email = "vijay@simplemachinesinc.com amitrajesh200@gmail.com
                 karu@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "assembly spans life-altering applications;
                 bioinformatics; bioinformatics (genome or protein)
                 databases; BWA-MEM software; disease diagnosis;
                 diseases; DNA; DNA sequencing; Engines; genomics;
                 Hardware; hardware-software solution; Heterogeneous
                 (hybrid) systems; high performance general purpose
                 machines; magnitude alignment speedup; Moore's law;
                 MPU-BWM; nucleic acid sequences; parallel
                 architectures; parallel processing; Pipelines; Program
                 processors; reference sequences; Rockets; sequence
                 alignment; sequence generation machines; sequencer
                 machines; sequences; Sequential analysis; sequential
                 machines",
  number-of-cited-references = "15",
  ORCID-numbers = "Rajesh, Amit/0000-0003-1679-5517",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Vijayaraghavan:2018:MBA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{DePestel:2018:RRP,
  author =       "Sander {De Pestel} and Sam {Van den Steen} and Shoaib
                 Akram and Lieven Eeckhout",
  title =        "{RPPM}: Rapid Performance Prediction of Multithreaded
                 Applications on Multicore Hardware",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "183--186",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2849983",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper proposes RPPM which, based on a
                 microarchitecture-independent profile of a
                 multithreaded application, predicts its performance on
                 a previously unseen multicore platform. RPPM breaks up
                 multithreaded program execution into epochs based on
                 synchronization primitives, and then predicts per-epoch
                 active execution times for each thread and
                 synchronization overhead to arrive at a prediction for
                 overall application performance. RPPM predicts
                 performance within 12 percent on average (27 percent
                 max error) compared to cycle-level simulation. We
                 present a case study to illustrate that RPPM can be
                 used for making accurate multicore design trade-offs
                 early in the design cycle.",
  acknowledgement = ack-nhfb,
  affiliation =  "De Pestel, S (Reprint Author), Univ Ghent, B-9000
                 Ghent, Belgium. De Pestel, Sander; Van den Steen, Sam;
                 Akram, Shoaib; Eeckhout, Lieven, Univ Ghent, B-9000
                 Ghent, Belgium.",
  author-email = "sander.depestel@ugent.be sam.vandensteen@ugent.be
                 shoaib.akram@ugent.be lieven.eeckhout@ugent.be",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Agency for Innovation by Science and
                 Technology in Flanders (IWT); European Research Council
                 (ERC) [741097]",
  funding-text = "Sander De Pestel is supported through a doctoral
                 fellowship by the Agency for Innovation by Science and
                 Technology in Flanders (IWT). Additional support is
                 provided through the European Research Council (ERC)
                 Advanced Grant agreement no. 741097.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accurate multicore design trade-offs; active execution
                 times; Computational modeling; Instruction sets;
                 Mathematical model; micro-architecture;
                 Microarchitecture; microarchitecture-independent
                 profile; microprocessor chips; Modeling;
                 multi-threaded; multi-threading; multicore hardware;
                 Multicore processing; multiprocessing systems;
                 multithreaded application; multithreaded program
                 execution; performance; Predictive models; rapid
                 performance prediction; RPPM; Synchronization;
                 synchronization overhead; synchronization primitives;
                 unseen multicore platform",
  number-of-cited-references = "12",
  ORCID-numbers = "Van den Steen, Sam/0000-0003-3630-2214",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "DePestel:2018:RRP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zhao:2018:KOA,
  author =       "Wenyi Zhao and Quan Chen and Minyi Guo",
  title =        "{KSM}: Online Application-Level Performance Slowdown
                 Prediction for Spatial Multitasking {GPGPU}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "187--191",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2851207",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Colocating multiple applications on the same spatial
                 multitasking GPGPU improves the system-wide throughput.
                 However, the colocated applications are slowed down
                 differently due to the contention on streaming
                 multiprocessors (SMs), L2 cache and global memory
                 bandwidth. The ability to precisely predict application
                 slowdowns online is useful in many scenarios, e.g.,
                 ensuring fair pricing in multi-tenant Cloud systems.
                 Prior work on predicting application slowdown is either
                 inaccurate, due to the ignoring of contention on SMs,
                 or inefficient, due to the expensive sequential
                 profiling of concurrent applications via runtime
                 environment switching. To solve the above problem, we
                 propose KSM that enables precise and efficient
                  application-level slowdown prediction without a priori
                 application knowledge. KSM is proposed based on the
                 observation that hardware event statistics caused by
                 the colocated applications are strongly correlated with
                 their slowdowns. In more detail, KSM builds a slowdown
                 model based on the hardware event statistics using
                 machine learning techniques offline. At runtime, KSM
                 collects the hardware event statistics, and predicts
                 the slowdowns of all the colocated applications based
                 on the model. Our experimental results show that KSM
                 has negligible runtime overhead and precisely predicts
                 the application-level slowdowns with the prediction
                 error smaller than 9.7 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhao, WY (Reprint Author), Shanghai Jiao Tong Univ,
                 Dept Comp Sci \& Engn, Shanghai 200240, Peoples R
                 China. Zhao, Wenyi; Chen, Quan; Guo, Minyi, Shanghai
                 Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200240,
                 Peoples R China.",
  author-email = "wenyizhao@sjtu.edu.cn chen-quan@cs.sjtu.edu.cn
                 guo-my@cs.sjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "GP4TI",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Basic Research 973 Program of
                 China [2015CB352403]; National Natural Science
                 Foundation of China (NSFC) [61602301, 61632017]",
  funding-text = "This work is partially sponsored by the National Basic
                 Research 973 Program of China (No. 2015CB352403), the
                 National Natural Science Foundation of China (NSFC)
                 (61602301, 61632017). Quan Chen and Minyi Guo are
                 co-corresponding authors of this paper.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application-level slowdowns; Bandwidth; cloud
                 computing; colocated applications; computer centres;
                 concurrent applications; Graphics processing units;
                 graphics processing units; Hardware; hardware event
                 statistics; interference; Interference; interference;
                 Kernel; KSM; learning (artificial intelligence);
                 machine learning technique; multiprocessing systems;
                 multitenant cloud systems; online application-level
                 performance slowdown prediction; precise
                 application-level slowdown prediction; priori
                 application knowledge; Resource management;
                 scalability; Slowdown prediction; SM; spatial
                 multitasking GPGPU; spatial multitasking GPGPUs;
                 system-wide throughput; Training",
  number-of-cited-references = "13",
  ORCID-numbers = "Zhao, Wenyi/0000-0001-7308-9542 Chen,
                 Quan/0000-0001-5832-0347",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Zhao:2018:KOA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Swami:2018:AAS,
  author =       "Shivam Swami and Kartik Mohanram",
  title =        "{ARSENAL}: Architecture for Secure Non-Volatile
                 Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "192--196",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2863281",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Whereas data persistence in non-volatile memories
                 (NVMs) enables instant data recovery (IDR) in the face
                 of power/system failures, it also exposes NVMs to data
                 confidentiality and integrity attacks. Counter mode
                 encryption and Merkle Tree authentication are
                 established measures to thwart data confidentiality and
                 integrity attacks, respectively, in NVMs. However,
                 these security mechanisms require high overhead atomic
                 security meta-data updates on every write-back in order
                 to support IDR in NVMs. This increases memory traffic
                 and negatively impacts system performance and memory
                 lifetime. Architecture for Secure Non-Volatile Memories
                 (ARSENAL) is an IDR-preserving, low cost, high
                 performance security solution that protects NVM systems
                 against data confidentiality and integrity attacks.
                 ARSENAL synergistically integrates (i) Smart Writes for
                 Faster Transactions (SWIFT), a novel technique to
                 reduce the performance overhead of atomic security
                 meta-data updates on every write-back, with (ii)
                 Terminal BMT Updates (TBU), a novel
                 BMT-consistency-preserving technique, to facilitate IDR
                 in the face of power/system failures. Our evaluations
                 show that on average, ARSENAL improves system
                 performance (measured in IPC) by 2.26x (4x), reduces
                 memory traffic overhead by 1.47x (1.88x), and improves
                 memory lifetime by 2x (3.5x) in comparison to
                 conventional IDR-preserving 64-bit (128-bit)
                 encryption+authentication.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mohanram, K (Reprint Author), Univ Pittsburgh, Dept
                 Elect \& Comp Engn, Pittsburgh, PA 15260 USA. Swami,
                 Shivam; Mohanram, Kartik, Univ Pittsburgh, Dept Elect
                 \& Comp Engn, Pittsburgh, PA 15260 USA.",
  author-email = "shs173@pitt.edu kmram@pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "GT5EV",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-1217738]",
  funding-text = "This research was supported by NSF Award CCF-1217738.
                 We also thank the editor and the reviewers for their
                 constructive comments that have helped us elaborate and
                 improve the content of the paper.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecture for nonvolatile memories; ARSENAL;
                 authentication; Authentication; authentication;
                 cryptography; data confidentiality; data integrity;
                 data integrity attacks; data persistence; encryption;
                 Encryption; failure analysis; hardware security; high
                 overhead atomic security meta-data updates; high
                 performance security solution; IDR; IDR-preserving
                 encryption-authentication; instant data recovery;
                 integrated circuit reliability; memory architecture;
                 memory lifetime; Memory management; memory traffic
                 overhead; Non-volatile memories; Nonvolatile memory;
                 NVMs; power failures; Random access memory;
                 random-access storage; security mechanisms; smart
                 writes for faster transactions; SWIFT; system failures;
                 system performance; terminal BMT updates",
  keywords-plus = "ENCRYPTION; PERFORMANCE",
  number-of-cited-references = "28",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Swami:2018:AAS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Basak:2018:ECC,
  author =       "Abanti Basak and Xing Hu and Shuangchen Li and Sang
                 Min Oh and Yuan Xie",
  title =        "Exploring Core and Cache Hierarchy Bottlenecks in
                 Graph Processing Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "197--200",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2864964",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Graph processing is an important analysis technique
                 for a wide range of big data problems. The ability to
                 explicitly represent relationships between entities
                 gives graph analytics significant performance advantage
                 over traditional relational databases. In this paper,
                 we perform an in-depth data-aware characterization of
                 graph processing workloads on a simulated multi-core
                 architecture, find bottlenecks in the core and the
                 cache hierarchy that are not highlighted by previous
                 characterization work, and analyze the behavior of the
                 specific application data type causing the
                 corresponding bottleneck. We find that load-load
                 dependency chains involving different application data
                 types form the primary bottleneck in achieving a high
                 memory-level parallelism in graph processing workloads.
                 We also observe that the private L2 cache has a
                  negligible contribution to performance, whereas the
                 shared L3 cache has higher performance sensitivity. In
                 addition, we present a study on the effectiveness of
                 several replacement policies. Finally, we study the
                 relationship between different graph algorithms and the
                 access volumes to the different data types. Overall, we
                 provide useful insights and guidelines toward
                 developing a more optimized CPU-based architecture for
                 high performance graph processing.",
  acknowledgement = ack-nhfb,
  affiliation =  "Basak, A (Reprint Author), Univ Calif Santa Barbara,
                 Santa Barbara, CA 93106 USA. Basak, Abanti; Hu, Xing;
                 Li, Shuangchen; Oh, Sang Min; Xie, Yuan, Univ Calif
                 Santa Barbara, Santa Barbara, CA 93106 USA.",
  author-email = "abasak@umail.ucsb.edu xinghu.cs@gmail.com
                 shuangchenli@ece.ucsb.edu sangminoh@umail.ucsb.edu
                 yuanxie@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "GT5EV",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [1730309/1719160/1500848]; CRISP, one of six centers in
                 JUMP, a Semiconductor Research Corporation program -
                 DARPA",
  funding-text = "This work was supported in part by US National Science
                 Foundation 1730309/1719160/1500848 and by CRISP, one of
                 six centers in JUMP, a Semiconductor Research
                 Corporation program sponsored by DARPA.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application data type; Arrays; Benchmark testing; Big
                 Data; big data problems; Cache Hierarchy; cache
                 hierarchy bottlenecks; cache storage; CPU-based
                 architecture; graph algorithms; graph analytics; Graph
                 Processing; graph processing workloads; graph theory;
                 Guidelines; high performance graph processing; in-depth
                 data-aware characterization; Layout; load-load
                 dependency chains; mathematics computing; Memory-Level
                 Parallelism; memory-level parallelism; microprocessor
                 chips; multicore architecture; multiprocessing systems;
                 parallel architectures; performance evaluation;
                 performance sensitivity; private L2 cache; Random
                 access memory; Sensitivity; shared L3 cache",
  number-of-cited-references = "13",
  ORCID-numbers = "Oh, Sang Min/0000-0001-7119-6934",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Basak:2018:ECC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Khatamifard:2018:NCC,
  author =       "S. Karen Khatamifard and Longfei Wang and Selcuk
                 K{\"o}se and Ulya R. Karpuzcu",
  title =        "A New Class of Covert Channels Exploiting Power
                 Management Vulnerabilities",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "201--204",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2860006",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Effective runtime power management requires hardware
                 activity to be tracked at a very fine granularity in
                 both space and time in order to meet diverse workload
                 performance requirements within a tight power budget.
                 As the available instantaneous power budget itself
                 represents a shared resource, this practically
                 translates into finding the optimal allocation of the
                 power budget among active tasks of execution. Covert
                 communication over a previously unexplored class of
                 channels thereby becomes possible, which forms the
                 focus of this paper.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khatamifard, SK (Reprint Author), Univ Minnesota,
                 Minneapolis, MN 55455 USA. Khatamifard, S. Karen;
                 Karpuzcu, Ulya R., Univ Minnesota, Minneapolis, MN
                 55455 USA. Wang, Longfei; Kose, Selcuk, Univ S Florida,
                 Tampa, FL 33620 USA.",
  author-email = "khatami@umn.edu longfei@mail.usf.edu
                 ukarpuzc@umn.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF CAREER Award [CCF-1350451]; NSF/SRC
                 [CNS-1715286]; Cisco Systems Research Award",
  funding-text = "This work is supported in part by the NSF CAREER Award
                 under Grant CCF-1350451, in part by the NSF/SRC Award
                 under Grant CNS-1715286, and in part by the Cisco
                 Systems Research Award.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "covert channels; covert communication; diverse
                 workload performance requirements; fine granularity;
                 Hardware; hardware activity; instantaneous power
                 budget; Monitoring; optimal allocation; power aware
                 computing; Power demand; Power management
                 vulnerabilities; power management vulnerabilities;
                 Power system management; runtime power management;
                 security of data; Software; System-on-chip; tight power
                 budget; Voltage control",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Khatamifard:2018:NCC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kondguli:2018:BUS,
  author =       "Sushant Kondguli and Michael Huang",
  title =        "{Bootstrapping}: Using {SMT} Hardware to Improve
                 Single-Thread Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "205--208",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2859945",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Decoupled look-ahead (DLA) architectures have been
                 shown to be an effective way to improve single-thread
                 performance. However, a default implementation requires
                 an additional core. While an SMT flavor is possible, a
                 naive implementation is inefficient and thus slow. In
                 this paper, we propose an optimized implementation
                 called Bootstrapping that makes DLA just as effective
                 on a single (SMT) core as using two cores. While fusing
                 two cores can improve single-thread performance by
                 1.23x, Bootstrapping provides a speedup of 1.51x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kondguli, S (Reprint Author), Univ Rochester, Dept
                 Elect \& Comp Engn, Rochester, NY 14627 USA. Kondguli,
                 Sushant; Huang, Michael, Univ Rochester, Dept Elect \&
                 Comp Engn, Rochester, NY 14627 USA.",
  author-email = "sushant.kondguli@rochester.edu
                 michael.huang@rochester.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [1514433, 1533842]",
  funding-text = "This work is supported in part by NSF under grants
                 1514433 and 1533842.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bootstrapping; Computer architecture; Context;
                 Decoupled look-ahead (DLA) architectures; decoupled
                 look-ahead architectures; DLA architecture;
                 multi-threading; multiprocessing systems; optimisation;
                 optimized implementation; Prefetching; Resource
                 management; simultaneous multi-threading (SMT); single
                 core; single thread performance; single-thread
                 performance; Skeleton; SMT hardware; Substrates",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Kondguli:2018:BUS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kline:2018:CAR,
  author =       "Donald {Kline, Jr.} and Rami Melhem and Alex K.
                 Jones",
  title =        "Counter Advance for Reliable Encryption in Phase
                 Change Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "209--212",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2861012",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The use of hardware encryption and new memory
                 technologies such as phase change memory (PCM) are
                 gaining popularity in a variety of server applications
                 such as cloud systems. While PCM provides energy and
                 density advantages over conventional DRAM memory, it
                 faces endurance challenges. Such challenges are
                 exacerbated when employing memory encryption as the
                 stored data is essentially randomized, losing data
                 locality and reducing or eliminating the effectiveness
                 of energy and endurance aware encoding techniques. This
                 results in increasing dynamic energy consumption and
                 accelerated wear out. In this paper we propose counter
                 advance, a technique to leverage the process of
                 encryption to improve reliability and lifetime while
                 maintaining low-energy and low-latency operation.
                 Counter advance is compatible with standard
                 error-correction codes (ECC) and error correction
                 pointers (ECP), the standard for mitigating endurance
                 faults in PCM. Counter advance achieves the same fault
                 tolerance using three ECP pointers for a 10(-4) cell
                 failure rate compared to the leading approach to
                 consider energy savings and reliability for encrypted
                 PCM (SECRET) using five ECP pointers. At a failure rate
                 of 10(-2), counter advance can achieve an uncorrectable
                 bit error rate (UBER) of 10(-1), compared to < 10(-4)
                 for SECRET using six ECP pointers. This leads to a
                 lifetime improvement of 3.8x while maintaining
                 comparable energy consumption and access latency.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kline, D (Reprint Author), Univ Pittsburgh, Dept Elect
                 \& Comp Engn, Pittsburgh, PA 15260 USA. Kline, Donald,
                 Jr.; Jones, Alex K., Univ Pittsburgh, Dept Elect \&
                 Comp Engn, Pittsburgh, PA 15260 USA. Melhem, Rami, Univ
                 Pittsburgh, Dept Comp Sci, Pittsburgh, PA 15260 USA.",
  author-email = "dek61@pitt.edu melhem@cs.pitt.edu akjones@pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [1747452]; IUCRC Program of the
                 National Science Foundation [CNS-1738783]; SHREC",
  funding-text = "This work was supported by NSF Graduate Research
                 Fellowship award number 1747452, and SHREC industry and
                 agency members and by the IUCRC Program of the National
                 Science Foundation (Grant No. CNS-1738783).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "and error correction; Computer architecture; Emerging
                 memories; Encryption; error correction; Error
                 correction; Memory management; Microprocessors; Phase
                 change materials; reliability; stuck-at faults",
  number-of-cited-references = "16",
  oa =           "Bronze",
  ORCID-numbers = "Kline, Jr, Donald/0000-0002-4414-1513",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kline:2018:CAR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Sahoo:2018:RRD,
  author =       "Debiprasanna Sahoo and Swaraj Sha and Manoranjan
                 Satpathy and Madhu Mutyam",
  title =        "{ReDRAM}: a Reconfigurable {DRAM} Cache for {GPGPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "213--216",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2865552",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware-based DRAM cache techniques for GPGPUs
                 propose to use GPU DRAM as a cache of the host (system)
                 memory. However, these approaches do not exploit the
                 opportunity of allocating store-before-load data (data
                 that is written before being read by GPU cores) on GPU
                 DRAM that would save multiple CPU-GPU transactions. In
                 this context, we propose ReDRAM, a novel memory
                 allocation strategy for GPGPUs which re-configures GPU
                 DRAM cache as a heterogeneous unit. It allows
                 allocation of store-before-load data directly onto GPU
                 DRAM and also utilizes it as a cache of the host
                 memory. Our simulation results using a modified version
                 of GPGPU-Sim show that ReDRAM can improve performance
                 for applications that use store-before-load data by
                 57.6 percent (avg.) and 4.85x (max.) when compared to
                 the existing proposals on state-of-the-art GPU DRAM
                 caches.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sahoo, D (Reprint Author), Indian Inst Technol
                 Bhubaneswar, Bhubaneswar 751013, Odisha, India. Sahoo,
                 Debiprasanna; Sha, Swaraj; Satpathy, Manoranjan, Indian
                 Inst Technol Bhubaneswar, Bhubaneswar 751013, Odisha,
                 India. Mutyam, Madhu, Indian Inst Technol Madras,
                 Madras 600036, Tamil Nadu, India.",
  author-email = "debiprasanna.sahoo@gmail.com ss24@iitbbs.ac.in
                 manoranjan@iitbbs.ac.in madhu@cse.iitm.ac.in",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arrays; cache storage; CPU-GPU communication; DRAM
                 cache; DRAM chips; efficiency 57.6 percent; GPGPU;
                 GPGPU-Sim; GPGPUs; GPU cores; GPU DRAM cache; graphics
                 processing units; Graphics processing units; Hardware;
                 hardware-based DRAM cache techniques; heterogeneous
                 unit; host memory; memory allocation strategy; Memory
                 management; multiple CPU-GPU transactions; Random
                 access memory; reconfigurable DRAM cache; ReDRAM;
                 resource allocation; Resource management;
                 store-before-load; store-before-load data allocation;
                 tagless",
  number-of-cited-references = "16",
  ORCID-numbers = "Mutyam, Madhu/0000-0003-1638-4195 Sahoo,
                 Debiprasanna/0000-0003-1438-0617",
  research-areas = "Computer Science",
  researcherid-numbers = "Mutyam, Madhu/B-1717-2012",
  times-cited =  "0",
  unique-id =    "Sahoo:2018:RRD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Mashimo:2018:VMS,
  author =       "Susumu Mashimo and Ryota Shioya and Koji Inoue",
  title =        "{VMOR}: Microarchitectural Support for Operand Access
                 in an Interpreter",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "217--220",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2866243",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Dynamic scripting languages become very popular for
                 high productivity. However, many of these languages
                 have significant runtime overheads because they employ
                 interpreter-based virtual machines. One of the major
                 overheads for the interpreter is derived from operand
                 accesses, which significantly increase memory accesses.
                 We propose VMOR, microarchitectural support for the
                 operand accesses in the interpreter. VMOR remaps
                 operand values into floating-point physical registers,
                 which are rarely used in the interpreter, and thus,
                 VMOR effectively reduces the memory accesses.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mashimo, S (Reprint Author), Kyushu Univ, Fukuoka,
                 Fukuoka 8190395, Japan. Mashimo, Susumu; Inoue, Koji,
                 Kyushu Univ, Fukuoka, Fukuoka 8190395, Japan. Shioya,
                 Ryota, Nagoya Univ, Nagoya, Aichi 4648601, Japan.",
  author-email = "susumu.mashimo@cpc.ait.kyushu-u.ac.jp
                 shioya@nuee.nagoya-u.ac.jp inoue@ait.kyushu-u.ac.jp",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "JSPS KAKENHI [JP17J10388]",
  funding-text = "This work was supported by JSPS KAKENHI Grant Number
                 JP17J10388.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "authoring languages; Cryptography; Dynamic scripting
                 language; dynamic scripting languages; floating-point
                 physical registers; Hardware; high productivity;
                 interpreter; interpreter-based virtual machines; memory
                 accesses; microarchitectural support;
                 Microarchitecture; operand access; operand values;
                 Pipelines; Productivity; program interpreters;
                 Proposals; Registers; virtual machines; VMOR",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Mashimo:2018:VMS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Min:2018:SCD,
  author =       "Seungwon Min and Mohammad Alian and Wen-Mei Hwu and
                 Nam Sung Kim",
  title =        "Semi-Coherent {DMA}: an Alternative {I/O} Coherency
                 Management for Embedded Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "221--224",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2866568",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Many modern embedded CPUs adopt Non-Coherent DMA
                 (NC-DMA) over Coherent DMA (C-DMA) because of
                 simplicity. An NC-DMA design, however, requires a CPU
                 device driver to explicitly invalidate or flush a wide
                 range of cache space. When an I/O DMA device writes
                 data to a main memory region, the CPU needs to
                 invalidate the cache space corresponding to the same
                 memory region twice: (1) to prevent dirty cache lines
                 from overwriting the DMA data and (2) to remove any
                 cache lines prefetched before the DMA is done. In this
                 work, we first show that such explicit invalidations
                 consume 31 percent of CPU cycles, limiting the data
                 transfer throughput of a high-speed network interface
                 card (NIC) when receiving network packets. Second, we
                 propose a Semi-Coherent DMA (SC-DMA) architecture for
                 improving the efficiency of NC-DMA with a slight
                 modification to the hardware. Specifically, our SC-DMA
                 records the DMA region and prohibits any data that is
                 prefetched from the region from entering the cache,
                 reducing nearly 50 percent of the unnecessary
                 invalidations. Lastly, we identify several software
                 optimizations that can substantially reduce excessive
                 cache invalidations prevalent in NIC drivers. Our
                 evaluation with NVIDIA Jetson TX2 shows that our
                 proposed SC-DMA design with the NIC driver
                 optimizations can improve the NIC data transfer
                 throughput by up to 53.3 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, NS (Reprint Author), Univ Illinois, Elect \& Comp
                 Engn, Urbana, IL 61820 USA. Min, Seungwon; Alian,
                 Mohammad; Hwu, Wen-Mei; Kim, Nam Sung, Univ Illinois,
                 Elect \& Comp Engn, Urbana, IL 61820 USA.",
  author-email = "min16@illinois.edu malian2@illinois.edu
                 w-hwu@illinois.edu nskim@illinois.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "SRC/JUMP Applications Driving Architectures
                 (ADA) Research Center; IBM-ILLI-NOIS Center for
                 Cognitive Computing Systems Research (C3SR)",
  funding-text = "This work is supported in part by grants from SRC/JUMP
                 Applications Driving Architectures (ADA) Research
                 Center and IBM-ILLI-NOIS Center for Cognitive Computing
                 Systems Research (C3SR).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; C-DMA; Cache; cache lines; cache space;
                 cache storage; coherency management; coherent DMA;
                 Computer architecture; CPU cycles; CPU device driver;
                 Data transfer; device drivers; Device drivers; DMA
                 data; DMA device; DMA region; embedded CPUs; embedded
                 processor; embedded systems; Embedded systems; embedded
                 systems; Ethernet; excessive cache invalidations;
                 Hardware; high-speed network interface card; Internet
                 of Things; main memory region; microprocessor chips;
                 multiprocessing systems; NC-DMA design; NIC data
                 transfer throughput; noncoherent DMA; Prefetching;
                 SC-DMA design; SC-DMA records; semicoherent DMA
                 architecture",
  number-of-cited-references = "16",
  ORCID-numbers = "Min, Seung Won/0000-0001-7195-7182",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Min:2018:SCD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Nematollahi:2018:NSD,
  author =       "Negin Nematollahi and Mohammad Sadrosadati and Hajar
                 Falahati and Marzieh Barkhordar and Hamid
                 Sarbazi-Azad",
  title =        "{Neda}: Supporting Direct Inter-Core Neighbor Data
                 Exchange in {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "225--229",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2873679",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Image processing applications employ various filters
                 for several purposes, such as enhancing the images and
                 extracting the features. Recent studies show that
                 filters in image processing applications take a
                 substantial amount of the execution time, and it is
                 crucial to boost their performance to improve the
                 overall performance of the image processing
                 applications. Image processing filters require a
                 significant amount of data sharing among threads which
                 are in charge of filtering neighbor pixels. Graphics
                 Processing Units (GPUs) attempt to satisfy the demand
                 of data sharing by providing the scratch-pad memory,
                 shuffle instructions, and on-chip caches. However, we
                 observe that these mechanisms are insufficient to
                 provide a fast and energy-efficient neighbor data
                 sharing for the image processing filters. In this
                 paper, we propose a new hardware/software co-design
                 mechanism for GPUs, to effectively provide a fast and
                 energy-efficient register-level neighbor data sharing
                 for the image filters. We propose a neighbor data
                 exchange mechanism, called Neda, that adds a register
                 to each streaming processor (SP) which can be accessed
                 by its neighboring SPs. Our experimental results show
                 that Neda improves the performance and energy
                 consumption by 12.4 and 13.5 percent, on average,
                 respectively, compared to the NVIDIA SDK implementation
                 of image processing filters. Moreover, Neda's
                 performance is within 9.3 percent of the ideal GPU with
                 zero latency neighbor data exchange capability.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nematollahi, N (Reprint Author), Sharif Univ Technol,
                 Dept Comp Engn, Tehran 111559517, Iran. Nematollahi,
                 Negin; Sadrosadati, Mohammad; Barkhordar, Marzieh;
                 Sarbazi-Azad, Hamid, Sharif Univ Technol, Dept Comp
                 Engn, Tehran 111559517, Iran. Falahati, Hajar;
                 Sarbazi-Azad, Hamid, Inst Res Fundamental Sci, Comp Sci
                 Sch, Tehran 193955531, Iran.",
  author-email = "negin.mahani@gmail.com m.sadr89@gmail.com
                 hfalahati@ipm.ir marzieh.barkhordar@gmail.com
                 azad@sharif.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; direct inter-core neighbor data
                 exchange mechanism; efficiency 13.5 percent; efficiency
                 9.3 percent; electronic data interchange; energy
                 consumption; energy-efficient neighbor data sharing;
                 energy-efficient register-level neighbor data sharing;
                 fast energy-efficient neighbor data; feature
                 extraction; GPUs; Graphics processing units; graphics
                 processing units; hardware-software co-design
                 mechanism; hardware-software codesign; image
                 enhancement; image filtering; image filters; Image
                 processing; image processing applications; image
                 processing filters; Instruction sets; inter-core
                 communication; Microsoft Windows; Neda; neighbor data
                 exchange; NVIDIA SDK implementation; on-chip caches;
                 Registers; scratch-pad memory; shuffle instructions;
                 spatial image processing filters; streaming processor;
                 Two dimensional displays; zero latency neighbor data
                 exchange capability",
  keywords-plus = "MEAN FILTERS; IMAGE; DOMAIN",
  number-of-cited-references = "40",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Nematollahi:2018:NSD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Omar:2018:MRI,
  author =       "Hamza Omar and Halit Dogan and Brian Kahne and Omer
                 Khan",
  title =        "Multicore Resource Isolation for Deterministic,
                 Resilient and Secure Concurrent Execution of
                 Safety-Critical Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "230--234",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2874216",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Multicores increasingly deploy spatial execution of
                 safety-critical applications that demand a
                 deterministic, resilient, and secure environment to
                 meet the safety standards. However, multicores
                 aggressively share hardware resources that leads to
                 non-deterministic performance due to destructive
                 interference from concurrent applications. Resource
                 sharing not only hinders efficient resilient execution,
                 but also introduces security vulnerabilities due to
                 information leakage on side-channels. This work
                 proposes a novel multicore framework that constructs
                 isolated clusters of cores for each concurrent
                 application. It guarantees concurrent applications with
                 deterministic performance, as well as an efficient
                 execution environment for resiliency and security.
                 Moreover, the framework allows dynamic re-sizing of
                 cluster sizes for load balanced execution of concurrent
                 applications. However, it leads to diminished isolation
                 between clusters, which opens various
                 performance-resilience and performance-security
                 tradeoffs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khan, O (Reprint Author), Univ Connecticut, Dept Elect
                 \& Comp Engn, Storrs, CT 06269 USA. Omar, Hamza; Dogan,
                 Halit; Khan, Omer, Univ Connecticut, Dept Elect \& Comp
                 Engn, Storrs, CT 06269 USA. Kahne, Brian, NXP Semicond
                 Inc, Automot Microcontrollers \& Processors, Austin, TX
                 78735 USA.",
  author-email = "hamza.omar@uconn.edu halit.dogan@uconn.edu
                 brian.kahne@nxp.com omer.khan@uconn.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HA2CO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-1550470,
                 CNS-1718481]",
  funding-text = "This research was partially supported by the National
                 Science Foundation under Grants No. CCF-1550470 and
                 CNS-1718481.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "concurrency control; concurrent application;
                 deterministic performance; Hardware; hardware resource
                 sharing; hardware resources; Interference; load
                 balanced execution; Multicore; multicore framework;
                 Multicore processing; multicore resource isolation;
                 multicores; multiprocessing systems; nondeterministic
                 performance; Program processors; resilience;
                 Resilience; resilience; resource allocation;
                 safety-critical applications; safety-critical systems;
                 secure environment; security; Security; security;
                 security of data; security vulnerabilities;
                 side-channels; spatial execution; System-on-chip",
  number-of-cited-references = "20",
  ORCID-numbers = "Khan, Omer/0000-0001-6293-7403",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Omar:2018:MRI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Zokaee:2018:APM,
  author =       "Farzaneh Zokaee and Hamid R. Zarandi and Lei Jiang",
  title =        "{AligneR}: a Process-in-Memory Architecture for Short
                 Read Alignment in {ReRAMs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "235--238",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2854700",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Genomics is the key to enable the personal
                 customization of medical care. How to fast and
                 energy-efficiently analyze the huge amounts of genomic
                 sequence data generated by next generation sequencing
                 technologies has become one of the most significant
                 challenges facing genomics today. Existing hardware
                 platforms achieve low genome sequencing throughput with
                 significant hardware and power overhead. In this paper,
                 we propose AligneR, a ReRAM-based process-in-memory
                 architecture, to accelerate the bottleneck of genome
                 sequencing, i.e., short read alignment. Compared to
                 state-of-the-art accelerators, AligneR improves the
                 short read alignment throughput per Watt per mm(2) by
                 13x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zokaee, F (Reprint Author), Indiana Univ, Bloomington,
                 IN 47405 USA. Zokaee, Farzaneh; Jiang, Lei, Indiana
                 Univ, Bloomington, IN 47405 USA. Zokaee, Farzaneh;
                 Zarandi, Hamid R., Amirkabir Univ Technol, Tehran
                 158754413, Iran.",
  author-email = "f\_zokaee@aut.ac.ir h\_zarandi@aut.ac.ir
                 jiang60@iu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HE6YC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bioinformatics; Computer architecture; FM-index;
                 Genome sequencing; Genomics; Memory management;
                 Microprocessors; process-in-memory; Random access
                 memory; ReRAM; Sequential analysis; short read
                 alignment; Throughput",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Zokaee:2018:APM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lou:2018:BSB,
  author =       "Qian Lou and Lei Jiang",
  title =        "{BRAWL}: a Spintronics-Based Portable
                 Basecalling-in-Memory Architecture for Nanopore Genome
                 Sequencing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "239--242",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2882384",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Nanopore sequencing is one of the most promising
                 genome sequencing technologies because of its ability
                 to produce ultra long reads and provide portability.
                 Basecalling, the most time-consuming step in the whole
                 flow of Nanopore genome sequencing, translates analog
                 signals to digital DNA symbols. The state-of-the-art
                 basecaller relies on a complex neural network
                 consisting of convolutional, long short-term memory and
                 fully-connected layers, and a CTC decoder. Existing
                 neural network portable accelerators achieve low
                 basecalling throughput per Watt when processing such
                 neural network inferences. In this paper, we propose
                 BRAWL, a portable Basecalling-in-memory architecture,
                 to translate RAW electrical signals to digital DNA
                 symbols in SOT-MRAMs for Nanopore portable sequencers.
                 Compared to state-of-the-art accelerators, BRAWL
                 improves basecalling throughput per Watt by 3.88x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jiang, L (Reprint Author), Indiana Univ, Bloomington,
                 IN 47405 USA. Lou, Qian; Jiang, Lei, Indiana Univ,
                 Bloomington, IN 47405 USA.",
  author-email = "louqian@iu.edu jiang60@iu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HE6YC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Artificial neural networks; basecaller;
                 Bioinformatics; Computer architecture; DNA; genome
                 sequencing; Genomics; Microprocessors; Oxford nanopore
                 technology; process-in-memory; Sequential analysis;
                 SOT-MRAM",
  keywords-plus = "PERFORMANCE; ENERGY",
  number-of-cited-references = "26",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Lou:2018:BSB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
  xxpages =      "241--244",
}

@Article{Min:2018:AAB,
  author =       "Donghyun Min and Donggyu Park and Jinwoo Ahn and Ryan
                 Walker and Junghee Lee and Sungyong Park and Youngjae
                 Kim",
  title =        "{Amoeba}: an Autonomous Backup and Recovery {SSD} for
                 Ransomware Attack Defense",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "243--246",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2883431",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Ransomware is one of growing concerns in enterprise
                 and government organizations, because it may cause
                 financial damages or loss of important data. Although
                 there are techniques to detect and prevent ransomware,
                 an evolved ransomware may evade them because they are
                 based on monitoring known behaviors. Ransomware can be
                 mitigated if backup copies of data are retained in a
                 safe place. However, existing backup solutions may be
                 under ransomware's control and an intelligent
                 ransomware may destroy backup copies too. They also
                 incur overhead to storage space, performance and
                 network traffic (in case of remote backup). In this
                 paper, we propose an SSD system that supports automated
                 backup, called Amoeba. In particular, Amoeba is armed
                 with a hardware accelerator that can detect the
                 infection of pages by ransomware attacks at high speed
                 and a fine-grained backup control mechanism to minimize
                 space overhead for original data backup. For
                 evaluation, we extended the Microsoft SSD simulator to
                 implement Amoeba and evaluated it using the realistic
                 block-level traces, which are collected while running
                 the actual ransomware. According to our experiments,
                 Amoeba has negligible overhead and outperforms in
                 performance and space efficiency over the
                 state-of-the-art SSD, FlashGuard, which supports data
                 backup within the device.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, Y (Reprint Author), Sogang Univ, Seoul 04107,
                 South Korea. Min, Donghyun; Park, Donggyu; Ahn, Jinwoo;
                 Park, Sungyong; Kim, Youngjae, Sogang Univ, Seoul
                 04107, South Korea. Walker, Ryan; Lee, Junghee, Univ
                 Texas San Antonio, San Antonio, TX 78249 USA.",
  author-email = "mdh38112@sogang.ac.kr dgpark@sogang.ac.kr
                 jinu37@sogang.ac.kr ryan.walker@utsa.edu
                 junghee.lee@utsa.edu parksy@sogang.ac.kr
                 youkim@sogang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "HE6YC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Research Foundation of Korea (NRF)
                 --- Korea Government (MSIT) [NRF-2018R1A1A1A05079398]",
  funding-text = "This work was supported by the National Research
                 Foundation of Korea (NRF) grant funded by the Korea
                 Government (MSIT) (No. NRF-2018R1A1A1A05079398).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Aerospace electronics; Amoeba; autonomous backup SSD;
                 autonomous recovery SSD; back-up procedures;
                 cryptography; Cryptography; data backup; Entropy;
                 FlashGuard; intelligent ransomware; invasive software;
                 Microsoft SSD simulator; Performance evaluation;
                 Ransomware; ransomware attack; ransomware attack
                 defense; Solid-state drive (SSD); SSD system; storage
                 management; storage security",
  number-of-cited-references = "12",
  ORCID-numbers = "Park, Sungyong/0000-0002-0309-1820 Min,
                 Donghyun/0000-0002-6043-9264",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Min:2018:AAB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
  xxpages =      "245--248",
}

@Article{Kim:2018:HBP,
  author =       "Chinam Kim and Hyukjun Lee",
  title =        "A High-Bandwidth {PCM}-Based Memory System for Highly
                 Available {IP} Routing Table Lookup",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "17",
  number =       "2",
  pages =        "246--249",
  month =        jul # "\slash " # dec,
  year =         "2018",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2883461",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Achieving higher availability is an unending challenge
                 in router architecture, as process technology scales
                 down and more random logic/memory errors must be
                 tolerated. However, meeting extremely high targets that
                 require only few seconds of yearly downtime puts even
                 more pressure on the design of already complex router
                 architecture. In this paper, we explore the case of
                 storing the routing table in non-volatile memory, to
                 drastically reduce the router downtime and achieve
                 higher availability-without degrading lookup
                 performance. We propose a new MLC PCM architecture,
                 featuring decoupled node access and logically managed
                 duplicate bank groups, that fetches the right amount of
                 information from the most available bank. Performance
                 evaluation shows that we achieve an average of 9.9
                 percent bandwidth improvement over the DRAM baseline
                 system, and an 83.9 percent improvement over the PCM
                 baseline.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lee, H (Reprint Author), Sogang Univ, Dept Comp Sci \&
                 Engn, Seoul 04107, South Korea. Kim, Chinam; Lee,
                 Hyukjun, Sogang Univ, Dept Comp Sci \& Engn, Seoul
                 04107, South Korea.",
  author-email = "chinamkim@sogang.ac.kr hyukjunl@sogang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "HE6YC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Samsung Electronics",
  funding-text = "This research is funded by Samsung Electronics. The
                 corresponding author is Hyukjun Lee.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; complex router architecture; decoupled node
                 access; DRAM baseline system; DRAM chips; duplicate
                 bank groups; high-bandwidth PCM-based memory system;
                 highly available IP routing table lookup; IP networks;
                 IP routing table lookup; MLC PCM architecture; Network
                 architecture; nonvolatile memory; PCM baseline; Phase
                 change materials; phase change memories; Phase change
                 memory; process technology; processing-in-memory;
                 Random access memory; random logic errors; random
                 memory errors; router downtime reduction; Routing;
                 table lookup; Table lookup; telecommunication network
                 routing",
  number-of-cited-references = "13",
  ORCID-numbers = "Kim, Chinam/0000-0002-7984-2643",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2018:HBP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
  xxpages =      "247--250",
}

@Article{Kim:2019:IGM,
  author =       "Jiho Kim and Jehee Cha and Jason Jong Kyu Park and
                 Dongsuk Jeon and Yongjun Park",
  title =        "Improving {GPU} Multitasking Efficiency Using Dynamic
                 Resource Sharing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "1--5",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2889042",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As GPUs have become essential components for embedded
                 computing systems, a shared GPU with multiple CPU cores
                 needs to efficiently support concurrent execution of
                 multiple different applications. Spatial multitasking,
                 which assigns a different amount of streaming
                 multiprocessors (SMs) to multiple applications, is one
                 of the most common solutions for this. However, this is
                 not a panacea for maximizing total resource
                 utilization. It is because an SM consists of many
                 different sub-resources such as caches, execution units
                 and scheduling units, and the requirements of the
                 sub-resources per kernel are not well matched to their
                 fixed sizes inside an SM. To solve the resource
                 requirement mismatch problem, this paper proposes a GPU
                 Weaver, a dynamic sub-resource management system of
                 multitasking GPUs. GPU Weaver can maximize sub-resource
                 utilization through a shared resource controller (SRC)
                 that is added between neighboring SMs. The SRC
                 dynamically identifies idle sub-resources of an SM and
                 allows them to be used by the neighboring SM when
                 possible. Experiments show that the combination of
                 multiple sub-resource borrowing techniques enhances the
                 total throughput by up to 26 and 9.5 percent on average
                 over the baseline spatial multitasking GPU.",
  acknowledgement = ack-nhfb,
  affiliation =  "Park, Y (Reprint Author), Hanyang Univ, Seoul 04763,
                 South Korea. Kim, Jiho; Cha, Jehee, Hongik Univ, Seoul
                 04066, South Korea. Park, Jason Jong Kyu, Univ
                 Michigan, Ann Arbor, MI 48109 USA. Jeon, Dongsuk, Seoul
                 Natl Univ, Seoul 151742, South Korea. Park, Yongjun,
                 Hanyang Univ, Seoul 04763, South Korea.",
  author-email = "jihokimhi@gmail.com carjehee@gmail.com
                 jasonjk@umich.edu djeon1@snu.ac.kr
                 yongjunpark@hanyang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "HI0TZ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Research Foundation of Korea (NRF)
                 --- Korea government (MSIP) [NRF-2015R1C1A1A01053844,
                 NRF-2016R1C1B2016072]; ICT R\&D program of MSIP/IITP
                 [2017-0-00142]; R\&D program of MOTIE/KEIT [10077609]",
  funding-text = "This work was supported in part by the National
                 Research Foundation of Korea (NRF) grant funded by the
                 Korea government (MSIP) (NO. NRF-2015R1C1A1A01053844,
                 NO. NRF-2016R1C1B2016072), ICT R\&D program of
                 MSIP/IITP (No. 2017-0-00142), and the R\&D program of
                 MOTIE/KEIT (No. 10077609).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; dynamic resource sharing;
                 dynamic sub-resource management system; embedded
                 computing systems; embedded systems; GPU multitasking
                 efficiency; GPU Weaver; GPUs; graphics processing
                 units; Graphics processing units; Instruction sets;
                 Kernel; Micromechanical devices; multi-programmed;
                 multiple CPU cores; multiple sub-resource borrowing
                 techniques; multiprogramming; Multitasking;
                 multitasking GPUs; resource allocation; Resource
                 management; resource requirement mismatch problem;
                 resource sharing; scheduling; scheduling units; shared
                 GPU; shared resource controller; spatial multitasking;
                 SRC; streaming multiprocessors; sub-resource
                 utilization; total resource utilization; Weaving",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2019:IGM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2019:IIC,
  author =       "Anonymous",
  title =        "2018 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 17",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "1--8",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2901240",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the 2018 subject/author index for this
                 publication.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Xu:2019:PFD,
  author =       "Sheng Xu and Xiaoming Chen and Ying Wang and Yinhe Han
                 and Xuehai Qian and Xiaowei Li",
  title =        "{PIMSim}: a Flexible and Detailed Processing-in-Memory
                 Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "6--9",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2885752",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "With the advent of big data applications and new
                 process technologies, Process-in-Memory (PIM) attracts
                 much attention in memory research as the architecture
                 studies gradually shift from processors to
                 heterogeneous aspects. How to achieve reliable and
                 efficient PIM architecture modeling becomes
                 increasingly urgent for the researchers, who want to
                 experiment on critical issues from detailed
                 implementations of their proposed PIM designs. This
                 paper proposes PIMSim, a full-system and
                 highly-configurable PIM simulator to facilitate
                 circuit-, architecture- and system-level researches.
                 PIMSim enables architectural simulation of PIM and
                 implements three simulation modes to provide a wide
                 range of speed/accuracy tradeoffs. It offers detailed
                 performance and energy models to simulate PIM-enabled
                 instructions, compiler, in-memory processing logic,
                 various memory devices, and PIM coherence. PIMSim is
                 open source and available at
                 https://github.com/vineodd/PIMSim.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xu, S (Reprint Author), Chinese Acad Sci, Inst Comp
                 Technol, Beijing, Peoples R China. Xu, Sheng; Chen,
                 Xiaoming; Wang, Ying; Han, Yinhe; Li, Xiaowei, Chinese
                 Acad Sci, Inst Comp Technol, Beijing, Peoples R China.
                 Xu, Sheng; Li, Xiaowei, Univ Chinese Acad Sci, Beijing
                 101408, Peoples R China. Qian, Xuehai, Univ Southern
                 Calif, Los Angeles, CA 90007 USA.",
  author-email = "xusheng02@ict.ac.cn chenxiaoming@ict.ac.cn
                 wangying2009@ict.ac.cn yinhes@ict.ac.cn
                 xuehai.qian@usc.edu lxw@ict.ac.cn",
  da =           "2019-06-20",
  doc-delivery-number = "HI0TZ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Natural Science Foundation of
                 China (NSFC) [61522406, 61834006, 61521092]; Beijing
                 Municipal Science \& Technology Commission
                 [Z171100000117019, Z181100008918006]; Strategic
                 Priority Research Program of the Chinese Academy of
                 Sciences [XDPB12]; Innovative Project of Institute of
                 Computing Technology, CAS [5120186140]",
  funding-text = "This work was supported in part by National Natural
                 Science Foundation of China (NSFC) under grants
                 61522406, 61834006, and 61521092, Beijing Municipal
                 Science \& Technology Commission (Z171100000117019,
                 Z181100008918006), Strategic Priority Research Program
                 of the Chinese Academy of Sciences (XDPB12), and an
                 Innovative Project of Institute of Computing
                 Technology, CAS, under Grant 5120186140.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural simulation; Big Data; big data
                 applications; Coherence; Computational modeling;
                 Computer architecture; Data models; energy models;
                 heterogeneous aspects; heterogeneous computing;
                 in-memory processing logic; Kernel; memory
                 architecture; memory devices; memory research; memory
                 system; performance evaluation; PIM coherence; PIM
                 designs; PIM simulator; PIM-enabled instructions;
                 PIMSim; Process-in-Memory; Processing-in-memory;
                 processing-in-memory simulator; Program processors;
                 reliable PIM architecture modeling; simulation modes;
                 simulator; system-level researches; Tools",
  number-of-cited-references = "22",
  ORCID-numbers = "Wang, Ying/0000-0001-5172-4736",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Xu:2019:PFD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Shomron:2019:SCV,
  author =       "Gil Shomron and Uri Weiser",
  title =        "Spatial Correlation and Value Prediction in
                 Convolutional Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "10--13",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2018.2890236",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Convolutional neural networks (CNNs) are a widely used
                 form of deep neural networks, introducing
                 state-of-the-art results for different problems such as
                 image classification, computer vision tasks, and speech
                 recognition. However, CNNs are compute intensive,
                 requiring billions of multiply-accumulate (MAC)
                 operations per input. To reduce the number of MACs in
                 CNNs, we propose a value prediction method that
                 exploits the spatial correlation of zero-valued
                 activations within the CNN output feature maps, thereby
                 saving convolution operations. Our method reduces the
                 number of MAC operations by 30.4 percent, averaged on
                 three modern CNNs for ImageNet, with top-1 accuracy
                 degradation of 1.7 percent, and top-5 accuracy
                 degradation of 1.1 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Shomron, G (Reprint Author), Technion Israel Inst
                 Technol, IL-3200003 Haifa, Israel. Shomron, Gil;
                 Weiser, Uri, Technion Israel Inst Technol, IL-3200003
                 Haifa, Israel.",
  author-email = "gilsho@tx.technion.ac.il
                 uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "HI0TZ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "CNNs; computer vision; computer vision tasks;
                 Convolution; convolutional neural nets; convolutional
                 neural network; convolutional neural networks;
                 Correlation; Deep neural networks; deep neural
                 networks; Degradation; image classification; ImageNet;
                 learning (artificial intelligence); MAC operations;
                 Microsoft Windows; multiply-accumulate operations;
                 Neural networks; Predictive models; spatial
                 correlation; speech recognition; value prediction;
                 value prediction method; zero-valued activations",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Shomron:2019:SCV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Gupta:2019:DQL,
  author =       "Ujjwal Gupta and Sumit K. Mandal and Manqing Mao and
                 Chaitali Chakrabarti and Umit Y. Ogras",
  title =        "A Deep {Q}-Learning Approach for Dynamic Management of
                 Heterogeneous Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "14--17",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2892151",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Heterogeneous multiprocessor system-on-chips (SoCs)
                 provide a wide range of parameters that can be managed
                 dynamically. For example, one can control the type
                 (big/little), number and frequency of active cores in
                 state-of-the-art mobile processors at runtime. These
                 runtime choices lead to more than 10$ \times $ range in
                 execution time, 5$ \times $ range in power consumption,
                 and 50$ \times $ range in performance per watt.
                 Therefore, it is crucial to make optimum power
                 management decisions as a function of dynamically
                 varying workloads at runtime. This paper presents a
                 reinforcement learning approach for dynamically
                 controlling the number and frequency of active big and
                 little cores in mobile processors. We propose an
                 efficient deep Q-learning methodology to optimize the
                 performance per watt (PPW). Experiments using Odroid
                 XU3 mobile platform show that the PPW achieved by the
                 proposed approach is within 1 percent of the optimal
                 value obtained by an oracle.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mandal, SK (Reprint Author), Arizona State Univ, Sch
                 Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta,
                 Ujjwal; Mandal, Sumit K.; Mao, Manqing; Chakrabarti,
                 Chaitali; Ogras, Umit Y., Arizona State Univ, Sch Elect
                 Comp \& Energy Engn, Tempe, AZ 85281 USA.",
  author-email = "ujjwal@asu.edu skmandal@asu.edu mmao7@asu.edu
                 chaitali@asu.edu umit@asu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HL5MF",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CNS-1526562]; Semiconductor Research
                 Corp. [2721.001]",
  funding-text = "This work was supported by NSF grant CNS-1526562 and
                 Semiconductor Research Corp. task 2721.001.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "active cores; deep Q-learning approach; Deep
                 reinforcement learning; dynamic management; execution
                 time; Frequency control; Heterogeneous multi-cores;
                 heterogeneous processors; Instruments; learning
                 (artificial intelligence); Memory management; mobile
                 computing; mobile processors; multiprocessing systems;
                 multiprocessor system-on-chips; Odroid XU3 mobile
                 platform show; optimum power management decisions;
                 power aware computing; power consumption; Power demand;
                 Power management; Power system management; PPW;
                 reinforcement learning approach; Runtime; SoCs;
                 system-on-chip; Training",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Gupta:2019:DQL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Rogers:2019:SLB,
  author =       "Samuel Rogers and Joshua Slycord and Ronak Raheja and
                 Hamed Tabkhi",
  title =        "Scalable {LLVM}-Based Accelerator Modeling in gem5",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "18--21",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2893932",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/python.bib",
  abstract =     "This article proposes a scalable integrated system
                 architecture modeling for hardware accelerators based on
                 the gem5 simulation framework. The core of the proposed
                 modeling is an LLVM-based simulation engine for modeling
                 any customized data-path with respect to inherent
                 data/instruction-level parallelism (derived by
                 algorithms) and available compute units (defined by the
                 user). The simulation framework also offers a
                 general-purpose communication interface that allows a
                 scalable and flexible connection into the gem5
                 ecosystem through the Python API of gem5, enabling
                 modifications to the system hierarchy without the need
                 to rebuild the underlying simulator. Our simulation framework
                 currently supports full-system simulation (both
                 bare-metal and a full Linux kernel) for ARM-based
                 systems, with future plans to add support for RISC-V.
                 The LLVM-based modeling and modular integration to gem5
                 allow long-term simulation expansion and sustainable
                 design modeling for emerging applications with demands
                 for acceleration.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rogers, S (Reprint Author), Univ North Carolina, Dept
                 Elect \& Comp Engn, Charlotte, NC 28223 USA. Rogers,
                 Samuel; Slycord, Joshua; Raheja, Ronak; Tabkhi, Hamed,
                 Univ North Carolina, Dept Elect \& Comp Engn, Charlotte,
                 NC 28223 USA.",
  author-email = "sroger48@uncc.edu jslycord@uncc.edu rraheja@uncc.edu
                 htabkhiv@uncc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HL5MF",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application program interfaces; ARM-based systems;
                 Computational modeling; Computer architecture
                 simulation; customized data-path; Engines; field
                 programmable gate arrays; flexible connection;
                 full-system simulation; gem5 ecosystem; gem5 simulation
                 framework; general-purpose communication interface;
                 Hardware; hardware accelerator; hardware accelerators;
                 heterogeneous systems; inherent data; instruction-level
                 parallelism; Linux; LLVM-based modeling; LLVM-based
                 simulation engine; logic design; long-term simulation
                 expansion; microprocessor chips; multiprocessing
                 systems; parallel architectures; parallel programming;
                 program compilers; reduced instruction set computing;
                 Registers; RISC-V; Runtime; scalable connection;
                 scalable integrated system architecture modeling;
                 scalable LLVM-based accelerator modeling; Space
                 exploration; sustainable design modeling;
                 Synchronization; system hierarchy",
  number-of-cited-references = "11",
  ORCID-numbers = "Slycord, Joshua/0000-0002-0569-4094 Rogers,
                 Samuel/0000-0002-9697-2933",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Rogers:2019:SLB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Akin:2019:CAP,
  author =       "Berkin Akin and Alaa R. Alameldeen",
  title =        "A Case For Asymmetric Processing in Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "22--25",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2894800",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "By sidestepping the limitations at the memory
                 interface, processing-in-memory (PIM) unlocks
                 internally available memory bandwidth to the compute
                 units on the memory side. This abundant bandwidth is
                 conventionally utilized by highly-parallel
                 throughput-oriented many-core style PIM architectures
                 via offloading bandwidth-bound parallel tasks. However,
                 it can be difficult to fully isolate these PIM-suitable
                 tasks, and an offloaded program may include
                 compute-bound sequential phases. These PIM-averse
                 phases constitute a critical performance bottleneck for
                 conventional many-core style PIM architectures. In this
                 paper, we propose an analytical model for PIM execution
                 that considers a program's bandwidth demand as well as
                 its parallelism. Based on the proposed model, we make a
                 case for an asymmetric PIM architecture that can
                 mitigate the performance bottlenecks for PIM-averse
                 phases while keeping the performance upside for
                 PIM-suitable phases.",
  acknowledgement = ack-nhfb,
  affiliation =  "Akin, B (Reprint Author), Intel Labs, Hillsboro, OR
                 97124 USA. Akin, Berkin; Alameldeen, Alaa R., Intel
                 Labs, Hillsboro, OR 97124 USA.",
  author-email = "berkin.akin@intel.com alaa.r.alameldeen@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "HL5MF",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; analytical performance model;
                 asymmetric multicore; asymmetric PIM architecture;
                 asymmetric processing; Bandwidth; bandwidth-bound
                 parallel tasks; Computational modeling; compute-bound
                 sequential phases; critical performance bottleneck;
                 memory bandwidth; memory interface; microprocessor
                 chips; Multicore processing; multiprocessing systems;
                 parallel processing; performance evaluation; PIM
                 execution; PIM-averse phases; PIM-suitable tasks;
                 Processing in memory; processing-in-memory; Silicon;
                 Task analysis; throughput-oriented many-core style
                 PIM",
  keywords-plus = "AMDAHLS LAW",
  number-of-cited-references = "9",
  ORCID-numbers = "Akin, Berkin/0000-0001-6908-5581",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Akin:2019:CAP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Tovletoglou:2019:SIH,
  author =       "Konstantinos Tovletoglou and Lev Mukhanov and
                 Dimitrios S. Nikolopoulos and Georgios Karakonstantis",
  title =        "{Shimmer}: Implementing a Heterogeneous-Reliability
                 {DRAM} Framework on a Commodity Server",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "26--29",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2893189",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we present the implementation of a
                 heterogeneous-reliability DRAM framework, Shimmer, on a
                 commodity server with a fully fledged OS. Shimmer
                 enables splitting of DRAM into multiple domains with
                 varying reliability and allocation of data depending on
                 their criticality. Compared to existing studies which
                 use simulators, we consider practical restrictions
                 stemming from the real hardware and investigate methods
                 to overcome them. In particular, we reveal that the
                 implementation of the heterogeneous-reliability memory
                 framework requires disabling of the hardware memory
                 interleaving, which results in a significant
                 degradation of the system performance. To overcome the
                 induced performance loss, we develop a software-based
                 interleaving. We evaluate the performance, power and
                 energy of the server using 35 benchmarks across three
                 memory configurations: the baseline configuration; with
                 disabled hardware memory interleaving and Shimmer with
                 software-based memory interleaving. Our results show
                 that Shimmer introduces a minor 6\% performance
                 overhead, while reducing the average DRAM power by
                 19.9\% when memory operates under relaxed refresh rate
                 and lowered memory supply voltage. As one of our main
                 contributions we demonstrate that a
                 heterogeneous-reliability framework based on Shimmer
                 can be realized on a commodity server and save 9.1\% of
                 the total processor and memory energy.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tovletoglou, K (Reprint Author), Queens Univ Belfast,
                 Belfast BT7 1NN, Antrim, North Ireland. Tovletoglou,
                 Konstantinos; Mukhanov, Lev; Nikolopoulos, Dimitrios
                 S.; Karakonstantis, Georgios, Queens Univ Belfast,
                 Belfast BT7 1NN, Antrim, North Ireland.",
  author-email = "ktovletoglou01@qub.ac.uk l.mukhanov@qub.ac.uk
                 d.nikolopoulos@qub.ac.uk g.karakonstantis@qub.ac.uk",
  da =           "2019-06-20",
  doc-delivery-number = "HL5WL",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "European Union [688540]",
  funding-text = "This work is funded by the H2020 Programme of the
                 European Union under grant no. 688540 (the UniServer
                 Project).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "average DRAM power; Bandwidth; commodity server;
                 critical data; disabled hardware memory interleaving;
                 DRAM; DRAM chips; energy saving; Hardware;
                 heterogeneous-reliability DRAM framework;
                 heterogeneous-reliability memory;
                 heterogeneous-reliability memory framework; induced
                 performance loss; integrated circuit reliability;
                 interleaved storage; lowered memory supply voltage;
                 memory configurations; memory interleaving; Memory
                 management; Power efficiency; Random access memory;
                 reliability; Reliability; Resource management; Servers;
                 Shimmer;
                 software-based interleaving; software-based memory
                 interleaving",
  number-of-cited-references = "17",
  ORCID-numbers = "Nikolopoulos, Dimitrios/0000-0003-0217-8307
                 Tovletoglou, Konstantinos/0000-0002-1513-3143",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Tovletoglou:2019:SIH",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kumar:2019:HRA,
  author =       "Chanchal Kumar and Sidharth Singh and Gregory T.
                 Byrd",
  title =        "Hybrid Remote Access Protocol",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "30--33",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2896116",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The invalidation-based cache coherence protocols used
                 in current CMPs result in inefficient utilization of
                 cache hierarchy in the presence of heavy sharing, since
                 a significant percentage of shared cached data is
                 invalidated soon after it is brought into the private
                 cache. This work presents an analysis of a shared
                 memory cache coherence protocol; based on novel
                 insights from the analysis, we advocate direct remote
                 reads/writes at the shared last-level cache for heavily
                 contended data. Evaluation of our proposed protocol
                 with the Splash2x kernels shows 17 percent geometric
                 mean speedup over traditional MESI coherence and 8.5
                 percent better performance than prior remote-access
                 proposals.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kumar, C (Reprint Author), North Carolina State Univ,
                 Raleigh, NC 27695 USA. Kumar, Chanchal; Byrd, Gregory
                 T., North Carolina State Univ, Raleigh, NC 27695 USA.
                 Singh, Sidharth, North Carolina State Univ, Apple Inc,
                 Raleigh, NC 27695 USA.",
  author-email = "ckumar2@ncsu.edu sssingh4@ncsu.edu gbyrd@ncsu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HL5WL",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Access protocols; Benchmark testing; cache hierarchy;
                 cache storage; CMPs; Coherence; direct remote reads;
                 direct remote writes; geometric mean speedup; Hardware;
                 hybrid remote access protocol; invalidation-based cache
                 coherence protocols; Kernel; memory hierarchy; MESI
                 coherence; microprocessor chips; multi-core/single-chip
                 multiprocessors; Parallel architectures; private cache;
                 Proposals; protocols; shared cached data; shared
                 last-level cache; shared memory cache coherence
                 protocol; shared memory systems; Splash2x kernels",
  number-of-cited-references = "10",
  ORCID-numbers = "Byrd, Gregory/0000-0003-3647-8738",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kumar:2019:HRA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wang:2019:DDD,
  author =       "Yicheng Wang and Yang Liu and Peiyun Wu and Zhao
                 Zhang",
  title =        "Detect {DRAM} Disturbance Error by Using Disturbance
                 Bin Counters",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "34--37",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2897299",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/hash.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "DRAM disturbance errors are increasingly a concern to
                 computer system reliability and security. There have
                 been a number of designs to detect and prevent them;
                 however, there lacks any design that guarantees 100
                 percent detection (no false negative) with a small and
                 fixed hardware cost. This paper presents such a design
                 based on a novel idea called disturbance bin counter
                 (DBC). Each DBC is a complex counter that maintains an
                 upper bound of disturbances for a bin of DRAM rows.
                 Their access is not in the critical path of processor
                 execution and thus incurs no performance overhead. The
                 design is optimized at the circuit level to minimize
                 the storage requirement. Our simulation results using
                 multi-core SPEC CPU2006 workloads show that no false
                 positive occurs with a 1,024-entry DBC table, which
                 requires only 4.5 KB storage. The design can be
                 incorporated into a memory controller to guarantee the
                 detection of DRAM disturbance errors or row hammering
                 by malicious programs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, YC (Reprint Author), Univ Illinois, Chicago, IL
                 60607 USA. Wang, Yicheng; Liu, Yang; Wu, Peiyun; Zhang,
                 Zhao, Univ Illinois, Chicago, IL 60607 USA.",
  author-email = "ywang271@uic.edu yliu327@uic.edu pwu27@uic.edu
                 zhangz@uic.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HL5WL",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [CCF-1618104, CCF-1643271]",
  funding-text = "The authors appreciate the constructive comments from
                 the anonymous reviewers. This work is supported in part
                 by the US National Science Foundation under grants
                 CCF-1618104 and CCF-1643271.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "complex counter; Computer architecture; computer
                 system reliability; counting circuits; DBC table;
                 disturbance bin counter; DRAM; DRAM chips; DRAM
                 disturbance errors; DRAM rows; fixed hardware cost;
                 Hash functions; Indexes; malicious programs; memory
                 size 4.5 KByte; Microprocessors; Random access memory;
                 reliability; row-hammering; Transistors; Upper bound",
  number-of-cited-references = "10",
  ORCID-numbers = "Wu, Peiyun/0000-0001-5675-6454 Liu,
                 Yang/0000-0002-7377-1418 Wang,
                 Yicheng/0000-0003-1079-5591",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Wang:2019:DDD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
  xxpages =      "35--38",
}

@Article{Xie:2019:NXB,
  author =       "Xinfeng Xie and Xing Hu and Peng Gu and Shuangchen Li
                 and Yu Ji and Yuan Xie",
  title =        "{NNBench-X}: Benchmarking and Understanding Neural
                 Network Workloads for Accelerator Designs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "38--42",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2898196",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The tremendous impact of deep learning algorithms over
                 a wide range of application domains has encouraged a
                 surge of neural network (NN) accelerator research. An
                 evolving benchmark suite and its associated benchmark
                 method are needed to incorporate emerging NN models and
                 characterize NN workloads. In this paper, we propose a
                 novel approach to understand the performance
                 characteristic of NN workloads for accelerator designs.
                 Our approach takes as input an application candidate
                 pool and conducts an operator-level analysis and
                 application-level analysis to understand the
                 performance characteristics of both basic tensor
                 primitives and whole applications. We conduct a case
                 study on the TensorFlow model zoo by using this
                 proposed characterization method. We find that tensor
                 operators with the same functionality can have very
                 different performance characteristics under different
                 input sizes, while operators with different
                 functionality can have similar characteristics.
                 Additionally, we observe that without operator-level
                 analysis, the application bottleneck is
                 mischaracterized for 15 out of 57 models from the
                 TensorFlow model zoo. Overall, our characterization
                 method helps users select representative applications
                 out of the large pool of possible applications, while
                 providing insightful guidelines for the design of NN
                 accelerators.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xie, XF (Reprint Author), Univ Calif Santa Barbara,
                 Santa Barbara, CA 93106 USA. Xie, Xinfeng; Hu, Xing;
                 Gu, Peng; Li, Shuangchen; Ji, Yu; Xie, Yuan, Univ Calif
                 Santa Barbara, Santa Barbara, CA 93106 USA.",
  author-email = "xinfeng@ucsb.edu xinghu@ucsb.edu
                 peng\_gu@umail.ucsb.edu shuangchenli@ece.ucsb.edu
                 maple.jiyu@hotmail.com yuanxie@ucsb.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HQ4FG",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [1500848/172544/1730309]; CRISP--DARPA",
  funding-text = "This work was supported in part by US National Science
                 Foundation 1500848/172544/1730309 and by CRISP, one of
                 six centers in JUMP, a Semiconductor Research
                 Corporation program sponsored by DARPA.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator designs; application-level analysis;
                 Artificial neural networks; benchmark; benchmark
                 method; benchmark testing; Benchmark testing;
                 characterization method; deep learning algorithms;
                 Feature extraction; Hardware; learning (artificial
                 intelligence); Measurement; neural nets; Neural
                 network; neural network accelerator research; neural
                 network workloads; NN accelerators; NN workloads;
                 NNBench-X; operator-level analysis; Parallel
                 processing; performance characteristic; tensor
                 operators; TensorFlow model zoo; workload
                 characterization",
  number-of-cited-references = "22",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Xie:2019:NXB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Khan:2019:RCA,
  author =       "Asif Ali Khan and Fazal Hameed and Robin Bl{\"a}sing
                 and Stuart Parkin and Jeronimo Castrillon",
  title =        "{RTSim}: a Cycle-Accurate Simulator for Racetrack
                 Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "43--46",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2899306",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Racetrack memories (RTMs) have drawn considerable
                 attention from computer architects of late. Owing to
                 the ultra-high capacity and comparable access latency
                 to SRAM, RTMs are promising candidates to revolutionize
                 the memory subsystem. In order to evaluate their
                 performance and suitability at various levels in the
                 memory hierarchy, it is crucial to have RTM-specific
                 simulation tools that accurately model their behavior
                 and enable exhaustive design space exploration. To this
                 end, we propose RTSim, an open source cycle-accurate
                 memory simulator that enables performance evaluation of
                 the domain-wall-based racetrack memories. The
                 skyrmions-based RTMs can also be modeled with RTSim
                 because they are architecturally similar to
                 domain-wall-based RTMs. RTSim is developed in
                 collaboration with physicists and computer scientists.
                 It accurately models RTM-specific shift operations,
                 access ports management and the sequence of memory
                 commands beside handling the routine read/write
                 operations. RTSim is built on top of NVMain2.0,
                 offering a larger design space for exploration.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khan, AA (Reprint Author), Tech Univ Dresden, Chair
                 Compiler Construct, D-01069 Dresden, Germany. Khan,
                 Asif Ali; Hameed, Fazal; Castrillon, Jeronimo, Tech
                 Univ Dresden, Chair Compiler Construct, D-01069
                 Dresden, Germany. Blaesing, Robin; Parkin, Stuart, Max
                 Planck Inst Microstruct Phys Halle, D-06120 Halle,
                 Germany. Hameed, Fazal, Inst Space Technol, Islamabad
                 44000, Pakistan.",
  author-email = "asif\_ali.khan@tu-dresden.de
                 fazal.hameed@tu-dresden.de blaesing@mpi-halle.mpg.de
                 stuart.parkin@mpi-halle.mpg.de
                 jeronimo.castrillon@tu-dresden.de",
  da =           "2019-06-20",
  doc-delivery-number = "HQ4FG",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "German Research Council (DFG) through the
                 Cluster of Excellence `Center for Advancing Electronics
                 Dresden' (cfaed)",
  funding-text = "This work was partially funded by the German Research
                 Council (DFG) through the Cluster of Excellence `Center
                 for Advancing Electronics Dresden' (cfaed).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache; comparable access latency; Computational
                 modeling; cycle-accurate simulator; design space
                 exploration; domain wall memory; domain-wall-based
                 racetrack memories; domain-wall-based RTM; emerging
                 memory technologies; Layout; main memory; memory
                 hierarchy; Memory management; Memory simulator; memory
                 subsystem; memory system; models RTM-specific shift
                 operations; Nonvolatile memory; NVM; open source
                 cycle-accurate memory simulator; racetrack memory;
                 Random access memory; random-access storage;
                 RTM-specific simulation tools; RTSim; scratchpad;
                 simulation; skyrmions-based RTM; Space exploration;
                 storage management; Tracking",
  keywords-plus = "PERFORMANCE; MODEL; AREA",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Khan:2019:RCA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Gan:2019:SSV,
  author =       "Yiming Gan and Yuxian Qiu and Jingwen Leng and Yuhao
                 Zhu",
  title =        "{SVSoC}: Speculative Vision Systems-on-a-Chip",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "47--50",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2903241",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Frame latency in continuous vision significantly
                 impacts the agility of intelligent machines that
                 interact with the environment via cameras. However,
                 today's continuous vision systems limit the frame
                 latency due to their fundamental sequential execution
                 model. We propose a speculative execution model along
                 with two mechanisms that enable practical vision
                 speculation. We present SVSOC, a new mobile
                 Systems-on-a-chip (SoC) architecture that augments
                 conventional mobile SoCs with the speculation
                 capability. Under the same energy budget, SVSOC
                 achieves 14.3 to 35.4 percent latency reduction in
                 different scenarios.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gan, YM (Reprint Author), Univ Rochester, Comp Sci,
                 601 Elmwood Ave, Rochester, NY 14627 USA. Gan, Yiming;
                 Zhu, Yuhao, Univ Rochester, Comp Sci, 601 Elmwood Ave,
                 Rochester, NY 14627 USA. Qiu, Yuxian, Shanghai Jiao
                 Tong Univ, Comp Sci, Shanghai 200240, Peoples R China.
                 Leng, Jingwen, Shanghai Jiao Tong Univ, Dept Comp Sci
                 \& Engn, Shanghai 200240, Peoples R China.",
  author-email = "ygan10@ur.rochester.edu qiuyuxian@sjtu.edu.cn
                 leng-jw@sjtu.edu.cn yzhu@rochester.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HS8NK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; computer vision; Continuous
                 vision; continuous vision systems; control engineering
                 computing; fundamental sequential execution model;
                 Imaging; intelligent machines; IP networks;
                 microprocessor chips; mobile systems-on-a-chip
                 architecture; practical vision speculation; Predictive
                 models; Runtime; Sensors; speculation; speculation
                 capability; speculative execution model; speculative
                 vision systems-on-a-chip; SVSoC; system-on-chip;
                 systems-on-a-chip; Task analysis",
  number-of-cited-references = "11",
  ORCID-numbers = "Gan, Yiming/0000-0002-2033-5057",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Gan:2019:SSV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lin:2019:DSE,
  author =       "Ting-Ru Lin and Yunfan Li and Massoud Pedram and
                 Lizhong Chen",
  title =        "Design Space Exploration of Memory Controller
                 Placement in Throughput Processors with Deep Learning",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "51--54",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2905587",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As throughput-oriented processors incur a significant
                 number of data accesses, the placement of memory
                 controllers (MCs) has a critical impact on overall
                 performance. However, due to the lack of a systematic
                 way to explore the huge design space of MC placements,
                 only a few ad-hoc placements have been proposed,
                 leaving much of the opportunity unexploited. In this
                 paper, we present a novel deep-learning based framework
                 that explores this opportunity intelligently and
                 automatically. The proposed framework employs a genetic
                 algorithm to efficiently guide exploration through the
                 large design space while utilizing deep learning
                 methods to provide fast performance prediction of
                 design points instead of relying on slow full system
                  simulations. Evaluation shows that the proposed deep
                  learning models achieve a speedup of 282X for the
                 search process, and the MC placement found by our
                 framework improves the average performance (IPC) of 18
                 benchmarks by 19.3 percent over the best-known
                 placement found by human intuition.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lin, TR (Reprint Author), Univ Southern Calif, Los
                 Angeles, CA 90007 USA. Lin, Ting-Ru; Pedram, Massoud,
                 Univ Southern Calif, Los Angeles, CA 90007 USA. Li,
                 Yunfan; Chen, Lizhong, Oregon State Univ, Corvallis, OR
                 97331 USA.",
  author-email = "tingruli@usc.edu liyunf@oregonstate.edu pedram@usc.edu
                 chenliz@oregonstate.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HS8NK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [1566637,
                 1619456, 1619472, 1750047]; National Science Foundation
                 Software and Hardware Foundations",
  funding-text = "We appreciate Shao-Hua Sun's assistance in DNN
                 development. This research is supported, in part, by
                 the National Science Foundation grants \#1566637,
                 \#1619456, \#1619472 and \#1750047, and Software and
                 Hardware Foundations.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ad-hoc placements; Benchmark testing; computer
                  architecture; Computer architecture; critical impact;
                  data accesses; Deep learning; deep learning; deep
                  learning methods; deep-learning based framework;
                  design points; design space; design space
                  exploration; fast performance prediction; genetic
                  algorithm; genetic algorithms; Interconnection
                  networks; Kernel; MC placement; memory architecture;
                  memory controller placement; memory controllers;
                  neural nets; Program processors; search problems;
                  search process; Space exploration; Throughput;
                  throughput processors; throughput-oriented
                  processors",
  keywords-plus = "GAME; GO",
  number-of-cited-references = "10",
  ORCID-numbers = "Lin, Ting-Ru/0000-0002-7272-4070 Chen,
                 Lizhong/0000-0001-5890-7121",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Lin:2019:DSE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Arafa:2019:PGS,
  author =       "Yehia Arafa and Abdel-Hameed A. Badawy and Gopinath
                 Chennupati and Nandakishore Santhi and Stephan
                 Eidenbenz",
  title =        "{PPT--GPU}: Scalable {GPU} Performance Modeling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "55--58",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2904497",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Performance modeling is a challenging problem due to
                 the complexities of hardware architectures. In this
                 paper, we present PPT-GPU, a scalable and accurate
                 simulation framework that enables GPU code developers
                 and architects to predict the performance of
                 applications in a fast, and accurate manner on
                 different GPU architectures. PPT-GPU is part of the
                 open source project, Performance Prediction Toolkit
                 (PPT) developed at the Los Alamos National Laboratory.
                 We extend the old GPU model in PPT that predict the
                 runtimes of computational physics codes to offer better
                 prediction accuracy, for which, we add models for
                 different memory hierarchies found in GPUs and
                 latencies for different instructions. To further show
                 the utility of PPT-GPU, we compare our model against
                  real GPU device(s) and the widely used cycle-accurate
                 simulator, GPGPU-Sim using different workloads from
                 RODINIA and Parboil benchmarks. The results indicate
                 that the predicted performance of PPT-GPU is within a
                 10 percent error compared to the real device(s). In
                 addition, PPT-GPU is highly scalable, where it is up to
                 450x faster than GPGPU-Sim with more accurate
                 results.",
  acknowledgement = ack-nhfb,
  affiliation =  "Arafa, Y (Reprint Author), New Mexico State Univ,
                 Klipsch Sch ECE, Las Cruces, NM 88003 USA. Arafa,
                 Yehia; Badawy, Abdel-Hameed A., New Mexico State Univ,
                 Klipsch Sch ECE, Las Cruces, NM 88003 USA. Badawy,
                 Abdel-Hameed A.; Chennupati, Gopinath; Santhi,
                 Nandakishore; Eidenbenz, Stephan, Los Alamos Natl Lab,
                 SM 30, Los Alamos, NM 87545 USA.",
  author-email = "yarafa@nmsu.edu badawy@nmsu.edu gchennupati@lanl.gov
                 nsanthi@lanl.gov eidenben@lanl.gov",
  da =           "2019-06-20",
  doc-delivery-number = "HU4EG",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "U.S. Department of Energy (DOE) National
                 Nuclear Security Administration (NNSA)
                 [DE-AC52-06NA25396]",
  funding-text = "The authors would like to thank the anonymous
                 reviewers for their feedback which improved the quality
                 of the paper. We would also like to thank the members
                 of the PEARL laboratory at NMSU. Parts of this research
                 used resources provided at the Los Alamos National
                 Laboratory Institutional Computing Program, which is
                 supported through the U.S. Department of Energy (DOE)
                 National Nuclear Security Administration (NNSA) under
                 Contract No. DE-AC52-06NA25396. Computations were run
                 on Darwin, a research computing heterogeneous cluster.
                 Any opinions, findings, and/or conclusions expressed in
                 this paper do not necessarily represent the views of
                 the DOE or the U.S. Government.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architects; C language; Computational modeling;
                 Computer architecture; GPGPU; GPGPU-Sim; GPU
                 architectures; GPU device; GPU modeling; graphics
                 processing units; Graphics processing units; Kernel;
                 Los Alamos national laboratory; old GPU model; open
                 source project; parallel architectures; Parboil
                 benchmarks; performance evaluation; performance
                 prediction; performance prediction toolkit; power aware
                 computing; PPT; PPT-GPU; Predictive models; RODINIA;
                 Runtime; scalable GPU Performance modeling;
                 software/hardware co-design; Task analysis",
  keywords-plus = "ROOFLINE",
  number-of-cited-references = "22",
  ORCID-numbers = "Badawy, Abdel-Hameed/0000-0001-8027-1449",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Arafa:2019:PGS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Denby:2019:OEC,
  author =       "Bradley Denby and Brandon Lucia",
  title =        "Orbital Edge Computing: Machine Inference in Space",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "59--62",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2907539",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Edge computing is an emerging paradigm aiding
                 responsiveness, reliability, and scalability of
                 terrestrial computing and sensing networks like
                 cellular and IoT. However, edge computing is largely
                 unexplored in high-datarate nanosatellite
                 constellations. Cubesats are small, energy-limited
                 sensors separated from the cloud by hundreds of
                 kilometers of atmosphere and space. As they
                 proliferate, centralized architectures impede advanced
                 applications. In this work, we define and characterize
                 Orbital Edge Computing. We describe power and software
                 optimizations for the orbital edge, and we use
                 formation flying to parallelize computation in space.",
  acknowledgement = ack-nhfb,
  affiliation =  "Denby, B (Reprint Author), Carnegie Mellon Univ,
                 Pittsburgh, PA 15213 USA. Denby, Bradley; Lucia,
                 Brandon, Carnegie Mellon Univ, Pittsburgh, PA 15213
                 USA.",
  author-email = "bdenby@andrew.cmu.edu blucia@andrew.cmu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HU4EG",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Kavcic-Moura Endowment Fund; US National
                 Science Foundation CAREER Award [1751029]",
  funding-text = "We thank the reviewers for the helpful feedback. This
                 work was generously funded by the Kavcic-Moura
                 Endowment Fund and US National Science Foundation
                 CAREER Award \#1751029.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "artificial satellites; Cameras; computer vision;
                 Cubesat; CubeSat; Downlink; edge computing;
                 high-datarate nanosatellite constellations; Internet of
                 Things; machine inference; orbital edge computing;
                 Orbits; paradigm aiding responsiveness; Pipeline
                 processing; remote sensing; satellite communication;
                 Sensors; telecommunication computing; telecommunication
                 network reliability; terrestrial computing; wireless
                 sensor networks",
  number-of-cited-references = "39",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Denby:2019:OEC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Liu:2019:UFT,
  author =       "He Liu and Jianhui Han and Youhui Zhang",
  title =        "A Unified Framework for Training, Mapping and
                 Simulation of {ReRAM}-Based Convolutional Neural
                 Network Acceleration",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "63--66",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2908374",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "ReRAM-based neural network accelerators (RNAs) could
                 outshine their digital counterparts in terms of
                 computational efficiency and performance remarkably.
                 However, some open software tool for broad
                 architectural exploration and end-to-end evaluation are
                 still missing. We present a simulation framework of RNA
                 for CNN inference that encompasses a ReRAM-aware NN
                 training tool, a CNN-oriented mapper and a
                 micro-architecture simulator. Main characteristics of
                 ReRAM and circuits are reflected by the configurable
                 simulator, as well as by the customized training
                 algorithm. The function of the simulator's core
                 components is verified by the corresponding circuit
                 simulation of a real chip design. This framework
                 enables comprehensive architectural exploration and
                  end-to-end evaluation, and its preliminary version is
                 available at https://github.com/CRAFT-THU/XB-Sim.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhang, YH (Reprint Author), Tsinghua Univ, Dept Comp
                 Sci \& Technol, Beijing 100084, Peoples R China. Liu,
                 He; Zhang, Youhui, Tsinghua Univ, Dept Comp Sci \&
                 Technol, Beijing 100084, Peoples R China. Han, Jianhui,
                 Tsinghua Univ, Inst Microelect, Beijing 100084, Peoples
                 R China.",
  author-email = "liuhe94@hotmail.com hanjh16@mails.tsinghua.edu.cn
                 zyh02@tsinghua.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "HU4EG",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Beijing Innovation Center for Future Chip;
                 Science and Technology Innovation Special Zone project,
                 China; HUAWEI project",
  funding-text = "Thanks for the support from Beijing Innovation Center
                 for Future Chip, the support of the Science and
                 Technology Innovation Special Zone project, China, and
                 the support of HUAWEI project.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator; Artificial neural networks; CNN
                 inference; CNN-oriented mapper; computational
                 efficiency; Computational modeling; Computer
                 architecture; configurable simulator; convolutional
                 neural nets; customized training algorithm; Deep neural
                 network; digital counterparts; end-to-end evaluation;
                 Hardware; learning (artificial intelligence);
                 microarchitecture simulator; Microprocessors; open
                 software tool; processing-in-memory; ReRAM; ReRAM-aware
                 NN training tool; ReRAM-based convolutional neural
                 network acceleration; ReRAM-based neural network
                 accelerators; RNA; simulation; Training",
  number-of-cited-references = "22",
  ORCID-numbers = "Liu, He/0000-0002-9117-5265 Han,
                 Jianhui/0000-0002-8705-134X",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Liu:2019:UFT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Tan:2019:DWO,
  author =       "Tian Tan and Eriko Nurvitadhi and Derek Chiou",
  title =        "Dark Wires and the Opportunities for Reconfigurable
                 Logic",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "67--70",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2909867",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power has become a fundamental limit to silicon
                 performance. Most research has focused on reducing
                 transistor switching to constrain power (dark silicon.)
                 Specialized accelerators have been proposed since they
                 implement functionality with fewer transistor switches
                 than general purpose cores. Increasing efficiency
                 requirements lead to more specialization and,
                 therefore, more accelerators that potentially leads to
                 longer distances to get to all the accelerators.
                 Communication, however, consumes energy, and therefore
                 needs to be minimized as well (dark wires.) This paper
                 examines the balance between compute and communication
                 specialization in the context of hard logic (e.g.,
                 ASIC) that is highly efficient but static versus soft
                 logic (e.g., FPGA) that is less efficient but allows
                 computation to be moved to reduce communication
                 distances. Our experimental results show using soft
                 accelerators consumes 0.6$ \times $-2.1$ \times $ total
                 power compared to using hard accelerators when
                 communication costs are taken into account.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tan, T (Reprint Author), Univ Texas Austin, Elect \&
                 Comp Engn, Austin, TX 78712 USA. Tan, Tian, Univ Texas
                 Austin, Elect \& Comp Engn, Austin, TX 78712 USA.
                 Nurvitadhi, Eriko, Intel Corp, Santa Clara, CA 95054
                 USA. Chiou, Derek, Univ Texas Austin, Austin, TX 78712
                 USA. Chiou, Derek, Microsoft, Austin, TX 78712 USA.",
  author-email = "tan.tian@utexas.edu eriko.nurvitadhi@intel.com
                 derek@utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "HW7ZH",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Intel Corporation, Hillsboro, OR",
  funding-text = "Funding for this work was provided by Intel
                 Corporation, Hillsboro, OR. The authors would like to
                 thank the colleagues in the Accelerator Architecture
                 Lab at Intel Corporation, Hillsboro, OR and FAST
                 research group at the University of Texas at Austin,
                 Austin, TX for the discussion and feedback.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application specific integrated circuits; ASIC;
                 Benchmark testing; communication costs; communication
                 distances; communication specialization; dark silicon;
                 dark wires; efficiency requirements; elemental
                 semiconductors; energy efficient architecture; Field
                 programmable gate arrays; field programmable gate
                 arrays; FPGA; fundamental limit; general purpose cores;
                 geographical locality; hard logic; hardware
                 accelerator; Layout; low-power electronics;
                 reconfigurable logic; Silicon; silicon performance;
                 soft accelerators; Specialized accelerators; static
                 versus soft logic; Throughput; transistor circuits;
                 transistor switches; transistor switching; Transistors;
                 wires; Wires",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Tan:2019:DWO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Naithani:2019:PRE,
  author =       "Ajeya Naithani and Josue Feliu and Almutaz Adileh and
                 Lieven Eeckhout",
  title =        "Precise Runahead Execution",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "71--74",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2910518",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Runahead execution improves processor performance by
                 accurately prefetching long-latency memory accesses.
                 When a long-latency load causes the instruction window
                 to fill up and halt the pipeline, the processor enters
                 runahead mode and keeps speculatively executing code to
                 trigger accurate prefetches. A recent improvement
                 tracks the chain of instructions that leads to the
                 long-latency load, stores it in a runahead buffer, and
                 executes only this chain during runahead execution,
                 with the purpose of generating more prefetch requests
                 during runahead execution. Unfortunately, all these
                 prior runahead proposals have shortcomings that limit
                 performance and energy efficiency because they discard
                 the full instruction window to enter runahead mode and
                 then flush the pipeline to restart normal operation.
                 This significantly constrains the performance benefits
                 and increases the energy overhead of runahead
                 execution. In addition, runahead buffer limits prefetch
                 coverage by tracking only a single chain of
                 instructions that lead to the same long-latency load.
                 We propose precise runahead execution (PRE) to mitigate
                 the shortcomings of prior work. PRE leverages the
                 renaming unit to track all the dependency chains
                 leading to long-latency loads. PRE uses a novel
                 approach to manage free processor resources to execute
                 the detected instruction chains in runahead mode
                 without flushing the pipeline. Our results show that
                 PRE achieves an additional 21.1 percent performance
                 improvement over the recent runahead proposals while
                 reducing energy consumption by 6.1 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Naithani, A (Reprint Author), Univ Ghent, B-9000
                 Ghent, Belgium. Naithani, Ajeya; Adileh, Almutaz;
                 Eeckhout, Lieven, Univ Ghent, B-9000 Ghent, Belgium.
                 Feliu, Josue, Univ Politecn Valencia, Valencia 46010,
                 Spain.",
  author-email = "ajeya.naithani@ugent.be jofepre@gap.upv.es
                 almutaz.adileh@ugent.be lieven.eeckhout@ugent.be",
  da =           "2019-06-20",
  doc-delivery-number = "HW9SJ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "FWO [G.0434.16N, G.0144.17N]; European
                 Research Council (ERC) [741097]",
  funding-text = "This research is supported through FWO grants no.
                 G.0434.16N and G.0144.17N, and European Research
                 Council (ERC) Advanced Grant agreement no. 741097.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Buffer storage; dependency chains; energy efficiency;
                 instruction window; long-latency load; long-latency
                 memory accesses; Microarchitecture; Microsoft Windows;
                 Out of order; pipeline processing; Pipelines; power
                 aware computing; precise runahead execution; prefetch
                 requests; Prefetching; Proposals; Registers; runahead
                 buffer limits; runahead execution; single-core
                 performance; storage management",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Naithani:2019:PRE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Agrawal:2019:MPS,
  author =       "V. Agrawal and M. A. Dinani and Y. Shui and M. Ferdman
                 and N. Honarmand",
  title =        "Massively Parallel Server Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "75--78",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2911287",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Modern data centers enjoy massive degrees of
                 request-level parallelism with significant
                 cross-request similarity. Although similar requests
                 follow similar instruction sequences, conventional
                 processors service them individually and do not take
                 full advantage of cross-request similarity.
                 Single-Instruction Multiple-Thread (SIMT) architectures
                 can leverage this similarity, however, existing SIMT
                 processors chief among them, GPUs are ill-suited for
                 server applications, as they are specifically designed
                 to maximize throughput at the expense of latency,
                 preventing them from meeting server QoS requirements.
                 We advocate a new approach to SIMT server processors,
                 namely Massively Parallel Server Processors (MPSPs),
                 which we outline in this paper. To begin to understand
                 their architectural needs, we measure the degree of
                 control-flow and memory-access divergence encountered
                 when running unmodified server applications on
                 MPSP-style processors. Our preliminary results indicate
                 that a software scheduler that bundles together similar
                 requests can minimize control-flow divergence, making
                 SIMT execution of unmodified server code feasible.
                 Moreover, we find that memory-access divergence,
                 although significant in raw numbers, can be tackled
                 with changes in stack and heap layouts. Overall, our
                 results encourage further consideration of MPSPs as a
                 promising architecture for server processors.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; data centers; Instruction sets;
                 Message systems; Parallel processing; Quality of
                 service; servers; Servers; Single Instruction Multiple
                 Thread",
}

@Article{Golestani:2019:PMB,
  author =       "H. Golestani and G. Gupta and R. Sen",
  title =        "Performance Modeling and Bottleneck Analysis of {EDGE}
                 Processors Using Dependence Graphs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "79--82",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2911514",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Exploring new directions in ISA and microarchitecture
                 design can be challenging due to the large search
                 space. Efficient tools and methods are needed to
                 quickly identify rewarding design choices. In this
                 work, we develop a graph-based framework that
                 effectively models complex architectures and enables
                 efficient analysis of their performance and
                 bottlenecks. We use this framework to investigate
                 proposals for EDGE (Explicit Data Graph Execution) ISA,
                 a new class of ISA in which programs are composed from
                 atomic blocks, each of which explicitly exposes
                 dataflow to hardware. We study the impact of two
                 important EDGE-specific design choices: block formats
                 and operand-movement instructions. We demonstrate how
                 this analysis leads to insights in EDGE
                 architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; Bottleneck analysis; Data models;
                  EDGE; EDGE (Explicit Data Graph Execution); Hardware;
                  Hazards; ISA; Load modeling; Microarchitecture;
                  microarchitecture; performance modeling; Program
                  processors",
}

@Article{Leng:2019:ARA,
  author =       "J. Leng and A. Buyuktosunoglu and R. Bertran and P.
                 Bose and V. J. Reddi",
  title =        "Asymmetric Resilience for Accelerator-Rich Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "1",
  pages =        "83--86",
  month =        jan # "\slash " # jun,
  year =         "2019",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2019.2917898",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Accelerators are becoming popular owing to their
                 exceptional performance and power-efficiency. However,
                 researchers are yet to pay close attention to their
                 reliability --- a key challenge as technology scaling makes
                 building reliable systems challenging. A
                 straightforward solution to make accelerators reliable
                 is to design the accelerator from the ground-up to be
                 reliable by itself. However, such a myopic view of the
                 system, where each accelerator is designed in
                 isolation, is unsustainable as the number of integrated
                 accelerators continues to rise in SoCs. To address this
                 challenge, we propose a paradigm called asymmetric
                 resilience that avoids accelerator-specific reliability
                 design. Instead, its core principle is to develop the
                 reliable heterogeneous system around the CPU
                 architecture. We explain the implications of
                 architecting such a system and the modifications needed
                 in a heterogeneous system to adopt such an approach. As
                 an example, we demonstrate how to use asymmetric
                 resilience to handle GPU execution errors using the CPU
                 with minimal overhead. The general principles can be
                 extended to include other accelerators.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator architecture; error recovery; Graphics
                 processing units; heterogeneous system; Kernel; Memory
                 management; Reliability; Resilience; Runtime; soft
                 errors; Task analysis; voltage noise",
}

@Article{Sadredini:2019:SEM,
  author =       "E. Sadredini and R. Rahimi and V. Verma and M. Stan
                 and K. Skadron",
  title =        "A Scalable and Efficient In-Memory Interconnect
                 Architecture for Automata Processing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "87--90",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2909870",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Accelerating finite automata processing benefits
                 regular-expression workloads and a wide range of other
                 applications that do not map obviously to regular
                 expressions, including pattern mining, bioinformatics,
                 and machine learning. Existing in-memory automata
                 processing accelerators suffer from inefficient routing
                 architectures. They are either incapable of efficiently
                 place-and-route a highly connected automaton or require
                 an excessive amount of hardware resources. In this
                 paper, we propose a compact, low-overhead, and yet
                 flexible in-memory interconnect architecture that
                 efficiently implements routing for next-state
                 activation, and can be applied to the existing
                 in-memory automata processing architectures. We use
                 SRAM 8T subarrays to evaluate our interconnect.
                 Compared to the Cache Automaton routing design, our
                 interconnect reduces the number of switches $ 7 \times
                 $, therefore, reduces area overhead for the
                 interconnect. It also has faster row cycle time because
                 of shorter wires and consumes less power.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Automata; automata processing; bioinformatics; cache
                 automaton routing design; connected automaton; finite
                 automata; finite automata processing; Hardware;
                 hardware resources; in-memory automata; in-memory
                 automata processing accelerators; in-memory
                 interconnect architecture; Indexes; inefficient routing
                 architectures; integrated circuit interconnections;
                 Interconnect; machine learning; memory architecture;
                 Memory management; next-state activation; pattern
                 mining; processing in memory; Random access memory;
                 regular expression workloads; Routing; SRAM 8T
                 subarrays; SRAM chips",
}

@Article{Yasin:2019:TPM,
  author =       "A. Yasin and A. Mendelson and Y. Ben-Asher",
  title =        "Tuning Performance via Metrics with Expectations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "91--94",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2916408",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Modern server systems employ many features that are
                 difficult to exploit by software developers. This paper
                 calls for a new performance optimization approach that
                 uses designated metrics with expected optimal values. A
                 key insight is that expected values of these metrics
                 are essential in order to verify that no performance is
                 wasted during incremental utilization of processor
                 features. We define sample primary metrics for modern
                 architectures and present three distinct techniques
                 that help to determine their optimal values. Our
                 preliminary results successfully provide $ 2 \times
                 $--$ 4 \times $ extra
                 speedup during tuning of commonly-used software
                 optimizations on the matrix-multiply kernel.
                 Additionally, our approach helped to identify
                 counter-intuitive causes that hurt multicore
                 scalability of an optimized deep-learning benchmark on
                 a Cascade Lake server.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cascade Lake server; Code tuning; counter-intuitive
                 cause identification; expectations; expected optimal
                 values; incremental utilization; Kernel; learning
                 (artificial intelligence); matrix multiplication;
                 matrix-multiply kernel; Measurement; measurements;
                 micro-architecture; microprocessor chips; modern server
                 systems; multi-core/single-chip multiprocessors;
                 Multicore processing; multiprocessing systems;
                 Optimization; optimization; optimized deep-learning
                 benchmark; performance analysis; performance
                 evaluation; performance optimization approach;
                 processor features; sample primary metrics; Servers;
                 SIMD processors; software metrics; software
                 optimizations; Tuning; tuning performance",
}

@Article{Wang:2019:MEM,
  author =       "L. Wang and M. Jahre and A. Adileh and Z. Wang and L.
                 Eeckhout",
  title =        "Modeling Emerging Memory-Divergent {GPU}
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "95--98",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2923618",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
                 https://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "Analytical performance models yield valuable
                 architectural insight without incurring the excessive
                 runtime overheads of simulation. In this work, we study
                 contemporary GPU applications and find that the key
                 performance-related behavior of such applications is
                 distinct from traditional GPU applications. The key
                 issue is that these GPU applications are
                 memory-intensive and have poor spatial locality, which
                 implies that the loads of different threads commonly
                 access different cache blocks. Such memory-divergent
                 applications quickly exhaust the number of misses the
                 L1 cache can process concurrently, and thereby cripple
                 the GPU's ability to use Memory-Level Parallelism (MLP)
                 and Thread-Level Parallelism (TLP) to hide memory
                 latencies. Our Memory Divergence Model (MDM) is able to
                 accurately represent this behavior and thereby reduces
                 average performance prediction error by $ 14 \times $
                 compared to the state-of-the-art GPUMech approach
                 across our memory-divergent applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; analytical performance models;
                 Analytical performance prediction; average performance
                 prediction error; cache blocks; cache storage;
                 Computational modeling; contemporary GPU applications;
                 GPU; graphics processing units; Graphics processing
                 units; Instruction sets; key performance-related
                 behavior; L1 cache; Mathematical model; memory
                 architecture; memory divergence model; memory
                 latencies; memory-divergent applications;
                 memory-divergent GPU applications; memory-intensive;
                 memory-level parallelism; multi-threading;
                 multiprocessing systems; Predictive models; Random
                 access memory; thread-level parallelism; traditional
                 GPU applications; valuable architectural insight",
}

@Article{Shomron:2019:SSS,
  author =       "G. Shomron and T. Horowitz and U. Weiser",
  title =        "{SMT-SA}: Simultaneous Multithreading in Systolic
                 Arrays",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "99--102",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2924007",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Systolic arrays (SAs) are highly parallel pipelined
                 structures capable of executing various tasks such as
                 matrix multiplication and convolution. They comprise a
                 grid of usually homogeneous processing units (PUs) that
                 are responsible for the multiply-accumulate (MAC)
                 operations in the case of matrix multiplication. It is
                 not rare for a PU input to be zero-valued, in which
                 case the PU becomes idle and the array becomes
                 underutilized. In this paper we consider a solution to
                 employ the underutilized PUs via simultaneous
                 multithreading (SMT). We explore the design space of a
                 SMT-SA variant and evaluate its performance, area
                 efficiency, and energy consumption. In addition, we
                 suggest a tiling method to reduce area overheads. Our
                 evaluation shows that a 4-thread FP16-based SMT-SA
                 achieves speedups of up to $ 3.6 \times $ as compared
                 to conventional SA, with $ 1.7 \times $ area overhead
                 and negligible energy overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "4-thread FP16-based SMT-SA; area efficiency;
                 Convolution; Correlation; Deep learning; Energy
                 consumption; energy consumption; homogeneous processing
                 units; Instruction sets; matrix multiplication;
                 multi-threading; multiply-accumulate operations;
                 Multithreading; multithreading; parallel pipelined
                 structures; PU input; simultaneous multithreading;
                 SMT-SA variant; Systolic arrays; systolic arrays; Task
                 analysis",
}

@Article{Masouros:2019:RRS,
  author =       "D. Masouros and S. Xydis and D. Soudris",
  title =        "{Rusty}: Runtime System Predictability Leveraging
                 {LSTM} Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "103--106",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2924622",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Modern cloud scale data-centers are adopting workload
                 co-location as an effective mechanism for improving
                 resource utilization. However, workload co-location is
                 stressing resource availability in unconventional and
                 unpredictable manner. Efficient resource management
                 requires continuous and ideally predictive runtime
                 knowledge of system metrics, sensitive both to workload
                 demands, e.g., CPU, memory etc., as well as
                 interference effects induced by co-location. In this
                 paper, we present Rusty, a framework able to address
                 the aforementioned challenges by leveraging the power
                 of Long Short-Term Memory networks to forecast at
                 runtime, performance metrics of applications executed
                 on systems under interference. We evaluate Rusty under
                 a diverse set of interference scenarios for a plethora
                 of cloud workloads, showing that Rusty achieves
                 extremely high prediction accuracy, up to 0.99 in terms
                 of $ R^2 $ value, satisfying at the same time the strict
                 latency constraints to be usable at runtime.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; cloud computing; cloud workloads;
                 computer centres; Correlation; datacenters; extremely
                 high prediction accuracy; interference; Interference;
                 interference effects; interference scenarios; long
                 short-term memory networks; LSTM neural networks;
                 Measurement; modern cloud scale data-centers;
                 Monitoring; recurrent neural nets; resource allocation;
                 resource availability; Resource management; resource
                 management; resource utilization; Run-time system
                 predictability; Runtime; runtime knowledge; runtime
                 system predictability leveraging LSTM neural networks;
                 Rusty; system metrics; unconventional manner; workload
                 co-location",
}

@Article{Kim:2019:THA,
  author =       "S. Kim and H. Jung and W. Shin and H. Lee and H. Lee",
  title =        "{HAD-TWL}: Hot Address Detection-Based Wear Leveling
                 for Phase-Change Memory Systems with Low Latency",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "107--110",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2929393",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Phase-change memory (PCM) is an emerging non-volatile
                 memory device that offers faster access than flash
                 memory does. However, PCM suffers from a critical
                 problem where the number of write operations is
                 limited. The previous practical attack detector (PAD)
                 that uses a small memory space called stack adopts an
                 algebraic mapping-based wear leveling (AWL) algorithm.
                 Thanks to successful detection of malicious attacks,
                 the PAD-AWL dramatically improves the lifetime of PCM.
                 To enhance system factors such as write latency, the
                 proposed method replaces the AWL algorithm with a
                 table-based wear leveling (TWL) algorithm. Since the
                 fixed stack size of the previous PAD is inefficient in
                 detection of attack-like hot addresses, a stack size
                 modulation scheme that enables a hot address detector
                 (HAD) to efficiently counteract various memory write
                 streams is proposed. Compared with the previous
                 AWL-based algorithm, the integration with the TWL
                 algorithm demands only 24 percent of the total number
                 of swaps per write, and the proposed HAD with the stack
                 size modulation scheme achieves the detection rate of
                 94 percent while reducing the execution time by 57
                 percent.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "algebraic mapping-based wear leveling algorithm;
                 attack-like hot addresses; AWL-based algorithm;
                 detection rate; Detectors; embedded memory management
                 system; emerging nonvolatile memory device; endurance;
                 fixed stack size; flash memories; flash memory;
                 HAD-TWL; Hardware; hot address detection-based wear
                 leveling; hot address detector; malicious attacks;
                 Memory management; memory space; PAD-AWL; PCM; Phase
                 change materials; phase change memories; Phase-change
                 memory; phase-change memory systems; practical attack
                 detector; Pulse modulation; Random access memory; stack
                 size modulation scheme; system factors; table-based
                 wear leveling algorithm; TWL algorithm; wear; wear
                 leveling; write operations",
}

@Article{Zhou:2019:QCD,
  author =       "H. Zhou and G. T. Byrd",
  title =        "Quantum Circuits for Dynamic Runtime Assertions in
                 Quantum Computation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "111--114",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2935049",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we propose quantum circuits for runtime
                 assertions, which can be used for both software
                 debugging and error detection. Runtime assertion is
                 challenging in quantum computing for two key reasons.
                 First, a quantum bit (qubit) cannot be copied, which is
                 known as the non-cloning theorem. Second, when a qubit
                 is measured, its superposition state collapses into a
                 classical state, losing the inherent parallel
                 information. In this paper, we overcome these
                 challenges with runtime computation through ancilla
                 qubits, which are used to indirectly collect the
                 information of the qubits of interest. We design
                 quantum circuits to assert classical states,
                 entanglement, and superposition states.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ancilla qubits; assertions; classical state;
                 Debugging; debugging; dynamic runtime assertions; error
                 detection; inherent parallel information; Logic gates;
                 Measurement uncertainty; noncloning theorem; program
                 debugging; quantum bit; quantum circuits; quantum
                 circuits design; quantum computation; quantum
                 computing; Quantum computing; quantum entanglement;
                 Quantum entanglement; quantum error detection; Qubit;
                 qubit; Runtime; runtime assertion; runtime computation;
                 software debugging; superposition state",
}

@Article{Rao:2019:ATC,
  author =       "J. Rao and T. Ao and K. Dai and X. Zou",
  title =        "{ARCE}: Towards Code Pointer Integrity on Embedded
                 Processors Using Architecture-Assisted Run-Time
                 Metadata Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "115--118",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2935445",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Code Pointer Integrity (CPI) is an efficient control
                 flow protection technique focusing on sensitive code
                 pointers with a formal proof of security, but it relies
                 on software lookup tables or Memory Management Unit
                 (MMU) based address translation and instruction-level
                 memory isolation which are impractical for
                 resource-constrained embedded processors. This paper
                 enables Architecture-assisted Run-time CPI on Embedded
                 Processors (ARCE) with 2-level metadata to balance
                 security, performance and resource overhead. The first
                 level 2-bit property metadata colors data into
                 different domains and the second level boundary
                 metadata holds structure constraints for indirect code
                 pointers only. With memory and instruction extensions,
                 metadata shares the address space with program data and
                 is propagated at runtime to maintain a precise set of
                 sensitive code pointers. It lazily validates the
                 content and boundary of sensitive pointers at
                 dereference stage to eliminate false alarms. We
                 implemented ARCE based on a shallow 3-stage pipeline
                 processor Z-scale and validated its security
                 effectiveness with code pointer attack vectors in RIPE.
                 It introduces less than 1 percent performance overhead
                 for benchmarks in C with 7.33 percent logic and 6.25
                 percent memory overhead. ARCE eliminates address space
                 waste and dependency on advanced hardware which makes
                 CPI practical even for systems with bare metal
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ARCE; architecture-assisted run-time CPI on embedded
                 processors; architecture-assisted run-time metadata
                 management; code pointer attack vectors; code pointer
                 integrity; Code pointer integrity; control flow
                 protection technique; data flow analysis; embedded
                 processors; embedded systems; first level 2-bit
                 property metadata colors data; Hardware; indirect code
                 pointers; instruction extensions; instruction set
                 extensions; instruction sets; instruction-level memory
                 isolation; Integrated circuits; level boundary
                 metadata; Memory management; memory management unit
                 based address translation; meta data; Metadata;
                 microprocessor chips; MMU; multi-level metadata;
                 pipeline processing; Program processors; Registers;
                 resource-constrained embedded processors; RIPE;
                 security; Security; security of data; sensitive code
                 pointers; shallow 3-stage pipeline processor Z-scale;
                 software lookup tables; storage management; table
                 lookup",
}

@Article{Bhardwaj:2019:DOC,
  author =       "K. Bhardwaj and M. Havasi and Y. Yao and D. M. Brooks
                 and J. M. H. Lobato and G. Wei",
  title =        "Determining Optimal Coherency Interface for
                 Many-Accelerator {SoCs} Using {Bayesian} Optimization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "119--123",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2910521",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Oct 1 10:18:16 2019",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The modern system-on-chip (SoC) of the current
                 exascale computing era is complex. These SoCs not only
                 consist of several general-purpose processing cores but
                 also integrate many specialized hardware accelerators.
                 Three common coherency interfaces are used to integrate
                 the accelerators with the memory hierarchy:
                 non-coherent, coherent with the last-level cache (LLC),
                 and fully-coherent. However, using a single coherence
                 interface for all the accelerators in an SoC can lead
                 to significant overheads: in the non-coherent model,
                 accelerators directly access the main memory, which can
                 have considerable performance penalty; whereas in the
                 LLC-coherent model, the accelerators access the LLC but
                 may suffer from performance bottleneck due to
                 contention between several accelerators; and the
                 fully-coherent model, that relies on private caches,
                 can incur non-trivial power/area overheads. Given the
                 limitations of each of these interfaces, this paper
                 proposes a novel performance-aware hybrid coherency
                 interface, where different accelerators use different
                 coherency models, decided at design time based on the
                 target applications so as to optimize the overall
                 system performance. A new Bayesian optimization based
                 framework is also proposed to determine the optimal
                 hybrid coherency interface, i.e., use machine learning
                 to select the best coherency model for each of the
                 accelerators in the SoC in terms of performance. For
                 image processing and classification workloads, the
                 proposed framework determined that a hybrid interface
                 achieves up to 23 percent better performance compared
                 to the other homogeneous interfaces, where all the
                 accelerators use a single coherency model.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bayes methods; Bayesian optimization; Coherence;
                 coherence protocols; Computational modeling; Hardware;
                 hardware accelerators; Optimization; Program
                 processors; Protocols; System-on-chip (SoC)",
}

@Article{Ansari:2019:CLO,
  author =       "Ali Ansari and Pejman Lotfi-Kamran and Hamid
                 Sarbazi-Azad",
  title =        "Code Layout Optimization for Near-Ideal Instruction
                 Cache",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "124--127",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2924429",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Instruction cache misses are a significant source of
                 performance degradation in server workloads because of
                 their large instruction footprints and complex control
                 flow. Due to the importance of reducing the number of
                 instruction cache misses, there has been a myriad of
                 proposals for hardware instruction prefetchers in the
                 past two decades. While effectual, state-of-the-art
                 hardware instruction prefetchers either impose
                 considerable storage overhead or require significant
                 changes in the frontend of a processor. Unlike hardware
                 instruction prefetchers, code-layout optimization
                 techniques profile a program and then reorder the code
                 layout of the program to increase spatial locality, and
                 hence, reduce the number of instruction cache misses.
                 While an active area of research in the 1990s,
                 code-layout optimization techniques have largely been
                 neglected in the past decade. We evaluate the
                 suitability of code-layout optimization techniques for
                 modern server workloads and show that if we combine
                 these techniques with a simple next-line prefetcher,
                 they can significantly reduce the number of instruction
                 cache misses. Moreover, we propose a new code-layout
                 optimization algorithm and show that along with a
                 next-line prefetcher, it offers the same performance
                 improvement as the state-of-the-art hardware
                 instruction prefetcher, but with almost no hardware
                 overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "basic-block reordering; Cache storage; code-layout
                 optimization; Encoding; Instruction cache miss;
                 instruction prefetcher; Instruction sets; Optimization;
                 Prefetching",
}

@Article{Ranganath:2019:SCC,
  author =       "Kiran Ranganath and AmirAli Abdolrashidi and Shuaiwen
                 Leon Song and Daniel Wong",
  title =        "Speeding up Collective Communications Through
                 Inter-{GPU} Re-Routing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "128--131",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2933842",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In order to address the vast needs of disparate
                 domains, computing engines are becoming more
                 sophisticated and complex. A typical high-performance
                 computational engine is composed of several accelerator
                 units, in most cases GPUs, plus one or more CPU
                 controllers. All these components are becoming
                 increasingly interconnected to satisfy bandwidth and
                 latency tolerance demands from modern workloads. Due to
                 these constraints, solutions to efficiently
                 interconnect them or to systematically manage their
                 traffic-such as PCIe v3, NVLink v1 and v2 on the
                 hardware side, and NVIDIA Collective Communication
                 Library (NCCL) and AMD ROCM layer on the software
                 side-are becoming more commonplace inside HPC systems
                 and cloud data centers. However, as the number of
                 accelerators increases, workloads (especially machine
                 learning) might not be able to fully exploit the
                 computational substrate due to inefficient use of
                 hardware interconnects. Such scenarios can lead to
                 performance bottlenecks where high-bandwidth links are
                 not used by the underlying libraries and
                 under-performing links are overused. This work proposes
                 Workload Optimization Through Inter-GPU Re-routing
                 (WOTIR), which consists of enhanced NCCL-based
                 collective primitives that aim to boost bandwidth
                 utilization (through more efficient routing) and reduce
                 communication overhead. WOTIR targets GPUs with no
                 direct NVLink communication path (which leads to PCIe
                 communications) and instead re-routes communication
                 through intermediate GPUs to bridge NVLink segments and
                 avoid PCIe communications. Such method allows the
                 maximum possible utilization of the NVLink bandwidth
                 between the GPUs without routing through the PCIe bus.
                 Using this method, we see a reduction of up to 34
                 percent in execution time for selected machine learning
                 workloads when non-optimal GPU allocations arise.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Collective communication; GPU; Graphics
                 processing units; interconnect; Interference; Machine
                 learning; Routing; Servers; Training data",
}

@Article{Stow:2019:PPM,
  author =       "Dylan Stow and Amin Farmahini-Farahani and Sudhanva
                 Gurumurthi and Michael Ignatowski and Yuan Xie",
  title =        "Power Profiling of Modern Die-Stacked Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "132--135",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2941715",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Die-stacked memories that integrate multiple DRAM dies
                 into the processor package have reduced the interface
                 bottleneck and improved efficiency, but demands for
                 memory capacity and bandwidth remain unfulfilled.
                 Additionally, the introduction of memory into the
                 package further complicates heat removal. Memory power
                 is therefore becoming a key architectural concern. To
                 provide insight into these challenges, an architectural
                 power model for High Bandwidth Memory is developed,
                 validated, and used to provide detailed power profiles.
                 Based on the resulting power trends, power is projected
                 for potential future memory configurations with
                 increased bandwidth and capacity. The results suggest
                 that, without significant improvements in memory
                 technology or architecture, the power utilization of
                 in-package memories will continue to grow and limit the
                 system power budget.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Integrated circuits; Memory management; Power
                 measurement; Power system measurement; Random access
                 memory; random access memory; Three-dimensional
                 displays; three-dimensional integrated circuits",
}

@Article{Nabavinejad:2019:CDP,
  author =       "Seyed Morteza Nabavinejad and Hassan Hafez-Kolahi and
                 Sherief Reda",
  title =        "Coordinated {DVFS} and Precision Control for Deep
                 Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "136--140",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2942020",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Traditionally, DVFS has been the main mechanism to
                 trade-off performance and power. We observe that Deep
                 Neural Network (DNN) applications offer the possibility
                 to trade-off performance, power, and accuracy using
                 both DVFS and numerical precision levels. Our proposed
                 approach, Power-Inference accuracy Trading (PIT),
                 monitors the server's load, and accordingly adjusts the
                 precision of the DNN model and the DVFS setting of GPU
                 to trade-off the accuracy and power consumption with
                 response time. At high loads and tight request
                 arrivals, PIT leverages INT8-precision instructions of
                 GPU to dynamically change the precision of deployed DNN
                 models and boosts GPU frequency to execute the requests
                 faster at the expense of accuracy reduction and high
                 power consumption. However, when the requests' arrival
                 rate is relaxed and there is slack time for requests,
                 PIT deploys high precision version of models to improve
                 the accuracy and reduces GPU frequency to decrease
                 power consumption. We implement and deploy PIT on a
                 state-of-the-art server equipped with a Tesla P40 GPU.
                 Experimental results demonstrate that depending on the
                 load, PIT can improve response time up to 11 percent
                 compared to a job scheduler that uses only FP32
                 precision. It also improves the energy consumption by
                 up to 28 percent, while achieving around 99.5 percent
                 accuracy of sole FP32-precision.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accuracy; Deep neural network; Graphics processing
                 units; hardware accelerator; Neural networks; power;
                 Power demand; response time; Runtime; Servers; Time
                 factors; Time-frequency analysis",
}

@Article{Lee:2019:ELM,
  author =       "Seunghak Lee and Nam Sung Kim and Daehoon Kim",
  title =        "Exploiting {OS}-Level Memory Offlining for {DRAM}
                 Power Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "141--144",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2942914",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power and energy consumed by main memory systems in
                 data-center servers have increased as the DRAM capacity
                 and bandwidth increase. Particularly, background power
                 accounts for a considerable fraction of the total DRAM
                 power consumption; the fraction will increase further
                 in the near future, especially when slowing-down
                 technology scaling forces us to provide necessary DRAM
                 capacity through plugging in more DRAM modules or
                 stacking more DRAM chips in a DRAM package. Although
                 current DRAM architecture supports low power states at
                 rank granularity that turn off some components during
                 idle periods, techniques to exploit memory-level
                 parallelism make the rank-granularity power state
                 become ineffective. Furthermore, the long wake-up
                 latency is one of obstacles to adopting aggressive
                 power management (PM) with deep power-down states. By
                 tackling the limitations, we propose OffDIMM that is a
                 software-assisted DRAM PM collaborating with the
                 OS-level memory onlining/offlining. OffDIMM maps a
                 memory block in the address space of the OS to a
                 subarray group or groups of DRAM, and sets a deep
                 power-down state for the subarray group when offlining
                 the block. Through the dynamic OS-level memory
                 onlining/offlining based on the current memory usage,
                 our experimental results show OffDIMM reduces
                 background power by 24 percent on average without
                 notable performance overheads.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "DRAM; Energy consumption; Hardware; Linux; Memory
                 management; memory offlining; power management; Power
                 system management; Random access memory",
}

@Article{Marinakis:2019:PFI,
  author =       "Theodoros Marinakis and Iraklis Anagnostopoulos",
  title =        "Performance and Fairness Improvement on {CMPs}
                 Considering Bandwidth and Cache Utilization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "145--148",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2944810",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Chip multiprocessors (CMPs) have become dominant both
                 in server and embedded domain as they accommodate an
                 increasing amount of cores in order to satisfy the
                 workload demands. However, when applications run
                 concurrently, they compete for shared resources, such
                 as Last Level Cache (LLC) and main memory bandwidth.
                 Applications are affected in various ways by
                 contention, and uneven degradation makes the system
                 unreliable and the overall performance unpredictable.
                 The goal of this work is to improve performance by
                 sophisticated grouping that balances bandwidth and LLC
                 requirements, while at the same time providing a fair
                 execution environment by prioritizing applications that
                 experience the least accumulated progress. The proposed
                 scheduler achieves an average performance gain of 16
                 percent over the Linux scheduler and 6.3 percent over
                 another performance-oriented scheduler. Additionally,
                 it keeps unfairness very close to two fairness-oriented
                 schedulers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Chip multiprocessors; contention-aware
                 scheduling; Degradation; fairness; Interference; Job
                 shop scheduling; Linux; performance; Quality of
                 service; Resource management",
}

@Article{Balaji:2019:FEW,
  author =       "Adarsha Balaji and Shihao Song and Anup Das and Nikil
                 Dutt and Jeff Krichmar and Nagarajan Kandasamy and
                 Francky Catthoor",
  title =        "A Framework to Explore Workload-Specific Performance
                 and Lifetime Trade-offs in Neuromorphic Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "149--152",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2951507",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Neuromorphic hardware with non-volatile memory (NVM)
                 can implement machine learning workload in an
                 energy-efficient manner. Unfortunately, certain NVMs
                 such as phase change memory (PCM) require high voltages
                 for correct operation. These voltages are supplied from
                 an on-chip charge pump. If the charge pump is activated
                 too frequently, its internal CMOS devices do not
                 recover from stress, accelerating their aging and
                 leading to negative bias temperature instability (NBTI)
                 generated defects. Forcefully discharging the stressed
                 charge pump can lower the aging rate of its CMOS
                 devices, but makes the neuromorphic hardware
                 unavailable to perform computations while its charge
                 pump is being discharged. This negatively impacts
                 performance such as latency and accuracy of the machine
                 learning workload being executed. In this letter, we
                 propose a novel framework to exploit workload-specific
                 performance and lifetime trade-offs in neuromorphic
                 computing. Our framework first extracts the precise
                 times at which a charge pump in the hardware is
                 activated to support neural computations within a
                 workload. This timing information is then used with a
                 characterized NBTI reliability model to estimate the
                 charge pump's aging during the workload execution. We
                 use our framework to evaluate workload-specific
                 performance and reliability impacts of using (1)
                 different SNN mapping strategies and (2) different
                 charge pump discharge strategies. We show that our
                 framework can be used by system designers to explore
                 performance and reliability trade-offs early in the
                 design of neuromorphic hardware such that appropriate
                 reliability-oriented design margins can be set.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Aging; and inter-spike interval (ISI); Charge pumps;
                 Negative bias temperature instability; negative bias
                 temperature instability (NBTI); Neuromorphic computing;
                 Neuromorphics; non-volatile memory (NVM); phase-change
                 memory (PCM); spiking neural networks (SNNs); Synapses;
                 Thermal variables control; wear-out",
}

@Article{Jeon:2019:LAG,
  author =       "Hyeran Jeon and Hodjat Asghari Esfeden and Nael B.
                 Abu-Ghazaleh and Daniel Wong and Sindhuja Elango",
  title =        "Locality-Aware {GPU} Register File",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "153--156",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2959298",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In many emerging applications such as deep learning,
                 large data set is essential to generate reliable
                 solutions. In these big data workloads, memory latency
                 and bandwidth are the main performance bottlenecks. In
                 this article, we propose a locality-aware GPU register
                 file that enables data sharing for memory-intensive big
                 data workloads on GPUs without relying on small on-chip
                 memories. We exploit two types of data sharing patterns
                 commonly found from the big data workloads and have
                 warps opportunistically share data in physical
                 registers instead of issuing memory loads separately
                 and storing the same data redundantly in their
                 registers as well as small shared memory. With an
                 extended register file mapping mechanism, our proposed
                 design enables warps to share data by simply mapping to
                 the same physical registers or reconstructing from the
                 data in the register file already. The proposed sharing
                 not only reduces the memory transactions but also
                 further decreases the register file usage. The spared
                 registers make rooms for applying orthogonal
                 optimizations for energy and performance improvement.
                 Our evaluation on two deep learning workloads and
                 matrixMul show that the proposed locality-aware GPU
                 register file achieves over $ 2 \times $ speedup and
                 saves register space up to 57 percent.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Big Data; convolution neural network; Deep
                 learning; GPU; Graphics processing units; Matrix
                 operations; Registers; System-on-chip",
}

@Article{Li:2019:PBP,
  author =       "Chen Li and Yifan Sun and Lingling Jin and Lingjie Xu
                 and Zheng Cao and Pengfei Fan and David Kaeli and Sheng
                 Ma and Yang Guo and Jun Yang",
  title =        "Priority-Based {PCIe} Scheduling for Multi-Tenant
                 Multi-{GPU} Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "157--160",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2955119",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Multi-GPU systems are widely used in data centers to
                 provide significant speedups to compute-intensive
                 workloads such as deep neural network training.
                 However, limited PCIe bandwidth between the CPU and
                 multiple GPUs becomes a major performance bottleneck.
                 We observe that relying on a traditional
                 Round-Robin-based PCIe scheduling policy can result in
                 severe bandwidth competition and stall the execution of
                 multiple GPUs. In this article, we propose a
                 priority-based scheduling policy which aims to overlap
                 the data transfers and GPU execution for different
                 applications to alleviate this bandwidth contention. We
                 also propose a dynamic priority policy for semi-QoS
                 management that can help applications to meet QoS
                 requirements and improve overall multi-GPU system
                 throughput. Experimental results show that the system
                 throughput is improved by 7.6 percent on average using
                 our priority-based PCIe scheduling scheme as compared
                 with a Round-Robin-based PCIe scheduler. Leveraging
                 semi-QoS management can help to meet defined QoS goals,
                 while preserving application throughput.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Data transfer; Graphics processing units;
                 Multi-GPU; multi-tenant; PCIe scheduling; Quality of
                 service; Switches; Task analysis; Throughput",
}

@Article{Weng:2019:DMC,
  author =       "Jian Weng and Sihao Liu and Vidushi Dadu and Tony
                 Nowatzki",
  title =        "{DAEGEN}: a Modular Compiler for Exploring Decoupled
                 Spatial Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "161--165",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2955456",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Specialized hardware accelerators, particularly those
                 that are programmable and flexible to target multiple
                 problems in their domain, have proven to provide orders
                 of magnitude speedup and energy efficiency. However,
                 their design requires extensive manual effort, due to
                 the need for hardware-software codesign to balance the
                 degree and forms of specialization to the domains or
                 program behaviors of interest. This article provides
                 the first steps towards one approach for automating
                 much of these processes. The insight behind our work is
                 to recognize that decoupled spatial architectures both
                 define a rich design space with many tradeoffs for
                 different kinds of applications, and also can be
                 composed out of a simple set of well-defined
                 primitives. Therefore, we propose a modular accelerator
                 design framework, DAEGEN, a.k.a. Decoupled Access
                  Execution Accelerator Generator. This article defines an
                 initial compiler and architecture primitives, and we
                 discuss key challenges.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Delays; design automation;
                 Hardware; hardware/software co-design; Kernel; Manuals;
                 Micromechanical devices; Reconfigurable accelerators;
                 spatial architectures; Synchronization",
}

@Article{Iliakis:2019:LIG,
  author =       "Konstantinos Iliakis and Sotirios Xydis and Dimitrios
                 Soudris",
  title =        "{LOOG}: Improving {GPU} Efficiency With Light-Weight
                 Out-Of-Order Execution",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "166--169",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2951161",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "GPUs are one of the most prevalent platforms for
                 accelerating general-purpose workloads due to their
                 intuitive programming model, computing capacity, and
                 cost-effectiveness. GPUs rely on massive
                 multi-threading and fast context switching to overlap
                 computations with memory operations. Among the diverse
                 GPU workloads, there exists a class of kernels that
                 fail to maintain a sufficient number of active warps to
                 hide the latency of memory operations, and thus suffer
                 from frequent stalling. We observe that these kernels
                 will benefit from increased levels of Instruction-Level
                 Parallelism and we propose a novel architecture with
                 lightweight Out-Of-Order execution capability. To
                 minimize hardware overheads, we carefully design our
                 extension to highly re-use the existing
                 micro-architectural structures. We show that the
                 proposed architecture outperforms traditional platforms
                 by 15 to 46 percent on average for low occupancy
                 kernels, with an area overhead of 0.74 to 3.94 percent.
                 Finally, we prove the potential of our proposal as a
                 GPU u-arch alternative, by providing a 5 percent
                 speedup over a wide collection of 63 general-purpose
                 kernels with as little as 0.74 percent area overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Copper; GPGPU; Graphics processing units; Kernel;
                 micro-architecture; Out of order; Out-of-Order
                 execution; Radio access technologies; Radio frequency;
                 Registers",
}

@Article{Matsuo:2019:IIF,
  author =       "Reoma Matsuo and Ryota Shioya and Hideki Ando",
  title =        "Improving the Instruction Fetch Throughput with
                 Dynamically Configuring the Fetch Pipeline",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "170--173",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2952592",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Instruction cache misses are the critical performance
                 bottleneck in the execution of recent workloads such as
                 Web applications written in JavaScript and server
                 applications. Although various instruction prefetchers
                 have been proposed to reduce the misses, the
                 requirements for both high miss coverage and small
                 hardware cost are not satisfied. In this article, we
                 propose a novel method that improves the instruction
                 fetch throughput not by instruction prefetching but by
                 dynamically configuring the fetch pipeline structure.
                 Our scheme switches between the normal pipeline and
                 newly introduced miss-assuming pipeline, which does not
                 degrade the fetch throughput even when L1 instruction
                 cache misses occur. Our method achieves high
                 instruction fetch throughput with simple hardware and
                 small cost unlike previously proposed prefetchers. Our
                 evaluation results using Web and database workloads
                 show that our method improves the performance by 16.6
                 percent and 8.6 percent on average, compared to that
                  with no prefetching and the state-of-the-art instruction
                 prefetcher, PIF, respectively, and achieves as much as
                 79.0 percent of the performance of the processor with a
                 perfect instruction cache.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cache storage; Instruction fetch; pipeline
                 implementation; Pipelines; Prefetching; Servers;
                 Throughput",
}

@Article{Kommareddy:2019:CMS,
  author =       "Vamsee Reddy Kommareddy and Baogang Zhang and Fan Yao
                 and Rickard Ewetz and Amro Awad",
  title =        "Are Crossbar Memories Secure? {New} Security
                 Vulnerabilities in Crossbar Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "174--177",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2952111",
  ISSN =         "1556-6064",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memristors are emerging Non-Volatile Memories (NVMs)
                 that are promising for building future memory systems.
                 Unlike DRAM, memristors are non-volatile, i.e., they
                 can retain data after power loss. In contrast to DRAM
                 where each cell is associated with a pass transistor,
                 memristor cells can be implemented without such
                 transistor, and hence enable high density ReRAM
                 systems. Moreover, memristors leverage a unique
                 crossbar architecture to improve the density of memory
                 modules. Memristors have been considered to build
                 future data centers with both energy-efficiency and
                 high memory capacity goals. Surprisingly, we observe
                 that using memristors in multi-tenant environments,
                 e.g., cloud systems, entails new security
                 vulnerabilities. In particular, the crossbar contents
                 can severely affect the write latency of any data cells
                 within the same crossbar. With various memory
                 interleaving options (to optimize performance), a
                 single crossbar might be shared among several
                 applications/users from different security domains.
                 Therefore, such content-dependent latency can open new
                 source of information leakage. In this article, we
                 describe the information leakage problem in memristor
                 crossbar arrays (MCAs), discuss how they can be
                 potentially exploited from application level. Our work
                 highlights the need for future research to mitigate
                 (and potentially eliminate) information leakage in
                 crossbar memories in future computing systems.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Crossbar memory; Memristors;
                 Microprocessors; Nonvolatile memory; Random access
                 memory; ReRAM; Security; security",
}

@Article{Barber:2019:ISD,
  author =       "Kristin Barber and Anys Bacha and Li Zhou and Yinqian
                 Zhang and Radu Teodorescu",
  title =        "Isolating Speculative Data to Prevent Transient
                 Execution Attacks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "18",
  number =       "2",
  pages =        "178--181",
  month =        jul,
  year =         "2019",
  DOI =          "https://doi.org/10.1109/LCA.2019.2916328",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware security has recently re-surfaced as a
                 first-order concern to the confidentiality protections
                 of computing systems. Meltdown and Spectre introduced a
                 new class of exploits that leverage transient state as
                 an attack surface and have revealed fundamental
                 security vulnerabilities of speculative execution in
                 high-performance processors. These attacks derive
                 benefit from the fact that programs may speculatively
                 execute instructions outside their legal control flows.
                 This insight is then utilized for gaining access to
                 restricted data and exfiltrating it by means of a
                 covert channel. This study presents a
                 microarchitectural mitigation technique for shielding
                 transient state from covert channels during speculative
                 execution. Unlike prior work that has focused on
                 closing individual covert channels used to leak
                 sensitive information, this approach prevents the use
                 of speculative data by downstream instructions until
                 doing so is determined to be safe. This prevents
                 transient execution attacks at a cost of 18 percent
                 average performance degradation.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "covert timing channels; Delays; Hardware security;
                 Law; Pipelines; Registers; Security; Transient
                 analysis; transient execution attacks",
}

@Article{Kang:2020:NPP,
  author =       "Ki-Dong Kang and Gyeongseo Park and Nam Sung Kim and
                 Daehoon Kim",
  title =        "Network Packet Processing Mode-Aware Power Management
                 for Data Center Servers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2019.2926079",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Cavus:2020:EPP,
  author =       "Mustafa Cavus and Mohammed Shatnawi and Resit Sendag
                 and Augustus K. Uht",
  title =        "Exploring Prefetching, Pre-Execution and Branch
                 Outcome Streaming for In-Memory Database Lookups",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2019.2959982",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Bodduna:2020:BRS,
  author =       "Rahul Bodduna and Vinod Ganesan and Patanjali SLPSK
                 and Kamakoti Veezhinathan and Chester Rebeiro",
  title =        "{Brutus}: Refuting the Security Claims of the Cache
                 Timing Randomization Countermeasure Proposed in
                 {CEASER}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2964212",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kim:2020:TSA,
  author =       "Minsub Kim and Jaeha Kung and Sungjin Lee",
  title =        "Towards Scalable Analytics with Inference-Enabled
                 Solid-State Drives",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "13--17",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2019.2930590",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Li:2020:CDE,
  author =       "Congmiao Li and Jean-Luc Gaudiot",
  title =        "Challenges in Detecting an Evasive Spectre",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "18--21",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2976069",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Yan:2020:CUG,
  author =       "Mingyu Yan and Zhaodong Chen and Lei Deng and Xiaochun
                 Ye and Zhimin Zhang and Dongrui Fan and Yuan Xie",
  title =        "Characterizing and Understanding {GCNs} on {GPU}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "22--25",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2970395",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kumar:2020:PSM,
  author =       "Chanchal Kumar and Aayush Chaudhary and Shubham
                 Bhawalkar and Utkarsh Mathur and Saransh Jain and Adith
                 Vastrad and Eric Rotenberg",
  title =        "Post-Silicon Microarchitecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "26--29",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2978841",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Eyerman:2020:BOB,
  author =       "Stijn Eyerman and Wim Heirman and Sam Van den Steen
                 and Ibrahim Hur",
  title =        "Breaking In-Order Branch Miss Recovery",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "30--33",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2980277",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Liu:2020:STA,
  author =       "Zhi-Gang Liu and Paul N. Whatmough and Matthew
                 Mattina",
  title =        "Systolic Tensor Array: an Efficient Structured-Sparse
                 {GEMM} Accelerator for Mobile {CNN} Inference",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "34--37",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2979965",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Krishnan:2020:SLV,
  author =       "Srivatsan Krishnan and Zishen Wan and Kshitij Bhardwaj
                 and Paul Whatmough and Aleksandra Faust and Gu-Yeon Wei
                 and David Brooks and Vijay Janapa Reddi",
  title =        "The Sky Is Not the Limit: a Visual Performance Model
                 for Cyber-Physical Co-Design in Autonomous Machines",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "38--42",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2981022",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Michaud:2020:ETT,
  author =       "Pierre Michaud",
  title =        "Exploiting Thermal Transients With Deterministic Turbo
                 Clock Frequency",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "43--46",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2983920",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Chu:2020:HPD,
  author =       "Zhufei Chu and Huiming Tian and Zeqiang Li and Yinshui
                 Xia and Lunyao Wang",
  title =        "A High-Performance Design of Generalized Pipeline
                 Cellular Array",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "47--50",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2986197",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zhu:2020:HIR,
  author =       "Lingjun Zhu and Lennart Bamberg and Anthony Agnesina
                 and Francky Catthoor and Dragomir Milojevic and Manu
                 Komalan and Julien Ryckaert and Alberto Garcia-Ortiz
                 and Sung Kyu Lim",
  title =        "Heterogeneous {$3$D} Integration for a {RISC-V} System
                 With {STT-MRAM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "51--54",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2992644",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Mason:2020:UPI,
  author =       "Tony Mason and Thaleia Dimitra Doudali and Margo
                 Seltzer and Ada Gavrilovska",
  title =        "Unexpected Performance of {Intel Optane DC} Persistent
                 Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "55--58",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2987303",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zhang:2020:AIG,
  author =       "Zhihui Zhang and Jingwen Leng and Lingxiao Ma and
                 Youshan Miao and Chao Li and Minyi Guo",
  title =        "Architectural Implications of Graph Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "59--62",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2988991",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Sartor:2020:HHL,
  author =       "Anderson L. Sartor and Anish Krishnakumar and Samet E.
                 Arda and Umit Y. Ogras and Radu Marculescu",
  title =        "{HiLITE}: Hierarchical and Lightweight Imitation
                 Learning for Power Management of Embedded {SoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "63--67",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2992182",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Desai:2020:PAH,
  author =       "Harsh Desai and Brandon Lucia",
  title =        "A Power-Aware Heterogeneous Architecture Scaling Model
                 for Energy-Harvesting Computers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "68--71",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2989440",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lai:2020:TDB,
  author =       "Bo-Cheng Lai and Chun-Yen Chen and Yi-Da Hsin and
                 Bo-Yen Lin",
  title =        "A Two-Directional {BigData} Sorting Architecture on
                 {FPGAs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "72--75",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2993040",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Gu:2020:NTC,
  author =       "Peng Gu and Benjamin S. Lim and Wenqin Huangfu and
                 Krishna T. Malladi and Andrew Chang and Yuan Xie",
  title =        "{NMTSim}: Transaction-Command Based Simulator for New
                 Memory Technology Devices",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "76--79",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2995167",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Rezaei:2020:NNM,
  author =       "Seyyed Hossein SeyyedAghaei Rezaei and Mehdi
                 Modarressi and Rachata Ausavarungnirun and Mohammad
                 Sadrosadati and Onur Mutlu and Masoud Daneshtalab",
  title =        "{NoM}: Network-on-Memory for Inter-Bank Data Transfer
                 in Highly-Banked Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "80--83",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2990599",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2020:IIC,
  author =       "Anonymous",
  title =        "2019 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 18",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "1",
  pages =        "1--8",
  month =        jan # "\slash " # jun,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2964168",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Ros:2020:EIP,
  author =       "Alberto Ros and Alexandra Jimborean",
  title =        "The Entangling Instruction Prefetcher",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "84--87",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3002947",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Singh:2020:VLB,
  author =       "Rahul Singh and Gokul Subramanian Ravi and Mikko
                 Lipasti and Joshua San Miguel",
  title =        "Value Locality Based Approximation With {ODIN}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "88--91",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3002542",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zhang:2020:FRP,
  author =       "Jie Zhang and Miryeong Kwon and Sanghyun Han and Nam
                 Sung Kim and Mahmut Kandemir and Myoungsoo Jung",
  title =        "{FastDrain}: Removing Page Victimization Overheads in
                 {NVMe} Storage Stack",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "92--96",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3005507",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Im:2020:PBA,
  author =       "Junsu Im and Hanbyeol Kim and Yumin Won and Jiho Oh
                 and Minjae Kim and Sungjin Lee",
  title =        "Probability-Based Address Translation for Flash
                 {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "97--100",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3006529",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Samara:2020:CDS,
  author =       "Ahmed Samara and James Tuck",
  title =        "The Case for Domain-Specialized Branch Predictors for
                 Graph-Processing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3005895",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Mirosanlou:2020:MED,
  author =       "Reza Mirosanlou and Danlu Guo and Mohamed Hassan and
                 Rodolfo Pellizzoni",
  title =        "{MCsim}: an Extensible {DRAM} Memory Controller
                 Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "105--109",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3008288",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Li:2020:DCA,
  author =       "Shang Li and Zhiyuan Yang and Dhiraj Reddy and Ankur
                 Srivastava and Bruce Jacob",
  title =        "{DRAMsim3}: a Cycle-Accurate, Thermal-Capable {DRAM}
                 Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "106--109",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.2973991",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lee:2020:SFA,
  author =       "Joo Hwan Lee and Hui Zhang and Veronica Lagrange and
                 Praveen Krishnamoorthy and Xiaodong Zhao and Yang Seok
                 Ki",
  title =        "{SmartSSD}: {FPGA} Accelerated Near-Storage Data
                 Analytics on {SSD}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "110--113",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3009347",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Sutradhar:2020:PPP,
  author =       "Purab Ranjan Sutradhar and Mark Connolly and Sathwika
                 Bavikadi and Sai Manoj Pudukotai Dinakarrao and Mark A.
                 Indovina and Amlan Ganguly",
  title =        "{pPIM}: a Programmable Processor-in-Memory
                 Architecture With Precision-Scaling for Deep Learning",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "118--121",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3011643",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Choe:2020:SMP,
  author =       "Wonkyo Choe and Jonghyeon Kim and Jeongseob Ahn",
  title =        "A Study of Memory Placement on Hardware-Assisted
                 Tiered Memory Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "122--125",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3015613",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lachtar:2020:CSA,
  author =       "Nada Lachtar and Abdulrahman Abu Elkhail and Anys
                 Bacha and Hafiz Malik",
  title =        "A Cross-Stack Approach Towards Defending Against
                 Cryptojacking",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "126--129",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3017457",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/cryptography2020.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Golshan:2020:HPC,
  author =       "Fatemeh Golshan and Mohammad Bakhshalipour and Mehran
                 Shakerinava and Ali Ansari and Pejman Lotfi-Kamran and
                 Hamid Sarbazi-Azad",
  title =        "Harnessing Pairwise-Correlating Data Prefetching With
                 Runahead Metadata",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "130--133",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3019343",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lazarev:2020:DTE,
  author =       "Nikita Lazarev and Neil Adit and Shaojie Xiang and
                 Zhiru Zhang and Christina Delimitrou",
  title =        "{Dagger}: Towards Efficient {RPCs} in Cloud
                 Microservices With Near-Memory Reconfigurable {NICs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "134--138",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3020064",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jahanshahi:2020:GNC,
  author =       "Ali Jahanshahi and Hadi Zamani Sabzi and Chester Lau
                 and Daniel Wong",
  title =        "{GPU-NEST}: Characterizing Energy Efficiency of
                 Multi-{GPU} Inference Servers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "139--142",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3023723",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Mikhailenko:2020:ASA,
  author =       "Darya Mikhailenko and Yujin Nakamoto and Ben Feinberg
                 and Engin Ipek",
  title =        "Adapting In Situ Accelerators for Sparsity with
                 Granular Matrix Reordering",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "143--146",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3031907",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Ishii:2020:RIP,
  author =       "Yasuo Ishii and Jaekyu Lee and Krishnendra Nathella
                 and Dam Sunwoo",
  title =        "Rebasing Instruction Prefetching: an Industry
                 Perspective",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "147--150",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3035068",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Newton:2020:PGP,
  author =       "Newton and Virendra Singh and Trevor E. Carlson",
  title =        "{PIM-GraphSCC}: {PIM}-Based Graph Processing Using
                 Graph's Community Structures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "151--154",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3039498",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Chowdhury:2020:VNM,
  author =       "Zamshed I. Chowdhury and S. Karen Khatamifard and
                 Zhaoyong Zheng and Tali Moreshet and R. Iris Bahar and
                 Ulya R. Karpuzcu",
  title =        "Voltage Noise Mitigation With Barrier Approximation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "155--158",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3040088",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Che:2020:LMA,
  author =       "Yuezhi Che and Yuanzhou Yang and Amro Awad and Rujia
                 Wang",
  title =        "A Lightweight Memory Access Pattern Obfuscation
                 Framework for {NVM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "163--166",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3041484",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Sadredini:2020:ESP,
  author =       "Elaheh Sadredini and Reza Rahimi and Kevin Skadron",
  title =        "Enabling In-{SRAM} Pattern Processing With
                 Low-Overhead Reporting Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "167--170",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3042194",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Sharifi:2020:AAC,
  author =       "Ferdous Sharifi and Nezam Rohbani and Shaahin
                 Hessabi",
  title =        "Aging-Aware Context Switching in Multicore Processors
                 Based on Workload Classification",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "19",
  number =       "2",
  pages =        "159--162",
  month =        jul # "\slash " # dec,
  year =         "2020",
  DOI =          "https://doi.org/10.1109/LCA.2020.3040326",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2021:IIC,
  author =       "Anonymous",
  title =        "2020 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 19",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "1--7",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3048555",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kwon:2021:FQM,
  author =       "Hyoukjun Kwon and Michael Pellauer and Angshuman
                 Parashar and Tushar Krishna",
  title =        "{Flexion}: a Quantitative Metric for Flexibility in
                 {DNN} Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3044607",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kim:2021:TTR,
  author =       "Byeongho Kim and Jaehyun Park and Eojin Lee and Minsoo
                 Rhu and Jung Ho Ahn",
  title =        "{TRiM}: Tensor Reduction in Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3042805",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Boran:2021:FGS,
  author =       "Nirmal Kumar Boran and Shubhankit Rathore and Meet
                 Udeshi and Virendra Singh",
  title =        "Fine-Grained Scheduling in Heterogeneous-{ISA}
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3045056",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Resch:2021:DLQ,
  author =       "Salonik Resch and Swamit Tannu and Ulya R. Karpuzcu
                 and Moinuddin Qureshi",
  title =        "A Day In the Life of a Quantum Error",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3045628",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Shan:2021:ACP,
  author =       "Mohsin Shan and Omer Khan",
  title =        "Accelerating Concurrent Priority Scheduling Using
                 Adaptive in-Hardware Task Distribution in Multicores",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "17--21",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3045670",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Perais:2021:CSS,
  author =       "Arthur Perais",
  title =        "A Case for Speculative Strength Reduction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "22--25",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3048694",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Navarro:2021:HSS,
  author =       "Marta Navarro and Lucia Pons and Julio Sahuquillo",
  title =        "{Hy-Sched}: a Simple Hyperthreading-Aware Thread to
                 Core Allocation Strategy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "26--29",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3051393",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Alian:2021:IOI,
  author =       "Mohammad Alian and Jongmin Shin and Ki-Dong Kang and
                 Ren Wang and Alexandros Daglis and Daehoon Kim and Nam
                 Sung Kim",
  title =        "{IDIO}: Orchestrating Inbound Network Data on Server
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "30--33",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2020.3044923",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kim:2021:RSD,
  author =       "Hweesoo Kim and Sunjung Lee and Jaewan Choi and Jung
                 Ho Ahn",
  title =        "Row-Streaming Dataflow Using a Chaining Buffer and
                 Systolic Array+ Structure",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "34--37",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3054371",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kasan:2021:CDB,
  author =       "Hans Kasan and John Kim",
  title =        "The Case for Dynamic Bias in Global Adaptive Routing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "38--41",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3061408",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Shah:2021:TDS,
  author =       "Parth Shah and Ranjal Gautham Shenoy and Vaidyanathan
                 Srinivasan and Pradip Bose and Alper Buyuktosunoglu",
  title =        "{TokenSmart}: Distributed, Scalable Power Management
                 in the Many-Core Era",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "42--45",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3064441",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Li:2021:RRA,
  author =       "Qian Li and Bin Li and Pietro Mercati and Ramesh
                 Illikkal and Charlie Tai and Michael Kishinevsky and
                 Christos Kozyrakis",
  title =        "{RAMBO}: Resource Allocation for Microservices Using
                 {Bayesian} Optimization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "46--49",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3066142",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kim:2021:ZCS,
  author =       "Sunghwan Kim and Gyusun Lee and Jiwon Woo and Jinkyu
                 Jeong",
  title =        "Zero-Copying {I/O} Stack for Low-Latency {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "50--53",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3064876",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Yu:2021:MDC,
  author =       "Chao Yu and Sihang Liu and Samira Khan",
  title =        "{MultiPIM}: a Detailed and Configurable Multi-Stack
                 Processing-In-Memory Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "54--57",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3061905",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu May 27 16:19:32 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Tan:2021:FQF,
  author =       "Tian Tan and Eriko Nurvitadhi and Aravind Dasu and
                 Martin Langhammer and Derek Chiou",
  title =        "{FlexScore}: Quantifying Flexibility",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "58--61",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3076413",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jul 8 12:08:28 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Sarkar:2021:DDA,
  author =       "Arindam Sarkar and Newton Singh and Varun Venkitaraman
                 and Virendra Singh",
  title =        "{DAM}: Deadblock Aware Migration Techniques for
                 {STT-RAM}-Based Hybrid Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "62--65",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3071717",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jul 8 12:08:28 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Li:2021:HAG,
  author =       "Han Li and Mingyu Yan and Xiaocheng Yang and Lei Deng
                 and Wenming Li and Xiaochun Ye and Dongrui Fan and Yuan
                 Xie",
  title =        "Hardware Acceleration for {GCNs} via Bidirectional
                 Fusion",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "66--69",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3077956",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jul 8 12:08:28 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jang:2021:DPT,
  author =       "Yongjoo Jang and Sejin Kim and Daehoon Kim and Sungjin
                 Lee and Jaeha Kung",
  title =        "Deep Partitioned Training From Near-Storage Computing
                 to {DNN} Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "70--73",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3081752",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jul 8 12:08:28 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Resch:2021:CPC,
  author =       "Salonik Resch and Husrev Cilasun and Ulya R.
                 Karpuzcu",
  title =        "Cryogenic {PIM}: Challenges \& Opportunities",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "74--77",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3077536",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jul 8 12:08:28 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Heirman:2021:RRC,
  author =       "Wim Heirman and Stijn Eyerman and Kristof {Du Bois}
                 and Ibrahim Hur",
  title =        "{RIO}: {ROB}-Centric In-Order Modeling of Out-of-Order
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "1",
  pages =        "78--81",
  month =        jan # "\slash " # jun,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3084365",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jul 8 12:08:28 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Amarnath:2021:HAS,
  author =       "Aporva Amarnath and Subhankar Pal and Hiwot Tadese
                 Kassa and Augusto Vega and Alper Buyuktosunoglu and
                 Hubertus Franke and John-David Wellman and Ronald
                 Dreslinski and Pradip Bose",
  title =        "Heterogeneity-Aware Scheduling on {SoCs} for
                 Autonomous Vehicles",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "82--85",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3085505",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Aug 10 15:14:44 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Wang:2021:WWP,
  author =       "Lei Wang and Xingwang Xiong and Jianfeng Zhan and
                 Wanling Gao and Xu Wen and Guoxin Kang and Fei Tang",
  title =        "{WPC}: Whole-Picture Workload Characterization Across
                 Intermediate Representation, {ISA}, and
                 Microarchitecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "86--89",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3087828",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Aug 10 15:14:44 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Eyerman:2021:MDT,
  author =       "Stijn Eyerman and Wim Heirman and Ibrahim Hur",
  title =        "Modeling {DRAM} Timing in Parallel Simulators With
                 Immediate-Response Memory Model",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "90--93",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3093075",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Aug 10 15:14:44 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Falahati:2021:DAC,
  author =       "Hajar Falahati and Masoud Peyro and Hossein Amini and
                 Mehran Taghian and Mohammad Sadrosadati and Pejman
                 Lotfi-Kamran and Hamid Sarbazi-Azad",
  title =        "Data-Aware Compression of Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "94--97",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3096191",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Aug 10 15:14:44 2021",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Wu:2021:GOD,
  author =       "Benjamin Wu and Trishita Tiwari and G. Edward Suh and
                 Aaron B. Wagner",
  title =        "Guessing Outputs of Dynamically Pruned {CNNs} Using
                 Memory Access Patterns",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "98--101",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3101505",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Yoo:2021:MBU,
  author =       "Mingi Yoo and Jaeyong Song and Jounghoo Lee and
                 Namhyung Kim and Youngsok Kim and Jinho Lee",
  title =        "Making a Better Use of Caches for {GCN} Accelerators
                 with Feature Slicing and Automatic Tile Morphing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "102--105",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3090954",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Hyun:2021:CAD,
  author =       "Bongjoon Hyun and Jiwon Lee and Minsoo Rhu",
  title =        "Characterization and Analysis of Deep Learning for
                 {3D} Point Cloud Analytics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "106--109",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3099117",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Rucker:2021:CTB,
  author =       "Alexander Rucker and Muhammad Shahbaz and Kunle
                 Olukotun",
  title =        "Chopping off the Tail: Bounded Non-Determinism for
                 Real-Time Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "110--113",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3102224",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Su:2021:EPA,
  author =       "Jiya Su and Linfeng He and Peng Jiang and Rujia Wang",
  title =        "Exploring {PIM} Architecture for High-Performance
                 Graph Pattern Mining",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "114--117",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3103665",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lee:2021:UIN,
  author =       "Yunjae Lee and Youngeun Kwon and Minsoo Rhu",
  title =        "Understanding the Implication of Non-Volatile Memory
                 for Large-Scale Graph Neural Network Training",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "118--121",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3098943",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Munoz-Martinez:2021:SEC,
  author =       "Francisco Mu{\~n}oz-Mart{\'\i}nez and Jos{\'e} L.
                 Abell{\'a}n and Manuel E. Acacio and Tushar Krishna",
  title =        "{STONNE}: Enabling Cycle-Level Microarchitectural
                 Simulation for {DNN} Inference Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "122--125",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3097253",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Shoghi:2021:SSQ,
  author =       "Nima Shoghi and Andrei Bersatti and Moinuddin Qureshi
                 and Hyesoon Kim",
  title =        "{SmaQ}: Smart Quantization for {DNN} Training by
                 Exploiting Value Clustering",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "126--129",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3108505",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Volos:2021:CRA,
  author =       "Haris Volos",
  title =        "The Case for Replication-Aware Memory-Error Protection
                 in Disaggregated Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "130--133",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3110439",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Asheim:2021:BXS,
  author =       "Truls Asheim and Boris Grot and Rakesh Kumar",
  title =        "{BTB-X}: a Storage-Effective {BTB} Organization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "134--137",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3109945",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kumar:2021:DDS,
  author =       "Pratik Kumar and Chavhan Sujeet Yashavant and
                 Biswabandan Panda",
  title =        "{DAMARU}: a Denial-of-Service Attack on Randomized
                 Last-Level Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "138--141",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3112180",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Ghasemi:2021:MPE,
  author =       "Fatemeh Ghasemi and Magnus Jahre",
  title =        "Modeling Periodic Energy-Harvesting Computing
                 Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "142--145",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3117031",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kalani:2021:ICB,
  author =       "Neelu Shivprakash Kalani and Biswabandan Panda",
  title =        "Instruction Criticality Based Energy-Efficient
                 Hardware Data Prefetching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "146--149",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3117005",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kim:2021:DSR,
  author =       "Jiho Kim and Myoungsoo Jung and John Kim",
  title =        "Decoupled {SSD}: Reducing Data Movement on
                 {NAND}-Based Flash {SSD}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "150--153",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3118688",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lee:2021:LPM,
  author =       "Hyeon Gyu Lee and Minwook Kim and Juwon Lee and Eunji
                 Lee and Bryan S. Kim and Sungjin Lee and Yeseong Kim
                 and Sang Lyul Min and Jin-Soo Kim",
  title =        "Learned Performance Model for {SSD}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "154--157",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3120728",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Gurumurthi:2021:HRE,
  author =       "Sudhanva Gurumurthi and Kijun Lee and Munseon Jang and
                 Vilas Sridharan and Aaron Nygren and Yesin Ryu and
                 Kyomin Sohn and Taekyun Kim and Hoeju Chung",
  title =        "{HBM3 RAS}: Enhancing Resilience at Scale",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "158--161",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3117150",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Aimoniotis:2021:RBC,
  author =       "Pavlos Aimoniotis and Christos Sakalis and Magnus
                 Sj{\"a}lander and Stefanos Kaxiras",
  title =        "Reorder Buffer Contention: a Forward Speculative
                 Interference Attack for Speculation Invariant
                 Instructions",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "162--165",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3123408",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Nabavinejad:2021:BLB,
  author =       "Seyed Morteza Nabavinejad and Sherief Reda",
  title =        "{BayesTuner}: Leveraging {Bayesian} Optimization For
                 {DNN} Inference Configuration Selection",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "166--170",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3123695",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Ham:2021:NDP,
  author =       "Hyungkyu Ham and Hyunuk Cho and Minjae Kim and Jueon
                 Park and Jeongmin Hong and Hyojin Sung and Eunhyeok
                 Park and Euicheol Lim and Gwangsun Kim",
  title =        "Near-Data Processing in Memory Expander for {DNN}
                 Acceleration on {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "171--174",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3126450",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Liu:2021:SMS,
  author =       "Wenjie Liu and Wim Heirman and Stijn Eyerman and
                 Shoaib Akram and Lieven Eeckhout",
  title =        "Scale-Model Simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "20",
  number =       "2",
  pages =        "175--178",
  month =        jul # "\slash " # dec,
  year =         "2021",
  DOI =          "https://doi.org/10.1109/LCA.2021.3133112",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2022:IIC,
  author =       "Anonymous",
  title =        "2021 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 20",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "1",
  pages =        "1--8",
  month =        jan # "\slash " # jun,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3141948",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Xie:2022:MSS,
  author =       "Xinfeng Xie and Peng Gu and Jiayi Huang and Yufei Ding
                 and Yuan Xie",
  title =        "{MPU-Sim}: a Simulator for In-{DRAM} Near-Bank
                 Processing Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2021.3135557",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zou:2022:AGP,
  author =       "Mo Zou and Mingzhe Zhang and Rujia Wang and Xian-He
                 Sun and Xiaochun Ye and Dongrui Fan and Zhimin Tang",
  title =        "Accelerating Graph Processing With Lightweight
                 Learning-Based Data Reordering",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3151087",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Barber:2022:PSA,
  author =       "Kristin Barber and Moein Ghaniyoun and Yinqian Zhang
                 and Radu Teodorescu",
  title =        "A Pre-Silicon Approach to Discovering
                 Microarchitectural Vulnerabilities in Security Critical
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3151256",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lee:2022:MES,
  author =       "Dusol Lee and Duwon Hong and Wonil Choi and Jihong
                 Kim",
  title =        "{MQSim-E}: an Enterprise {SSD} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3144773",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lucas:2022:LHI,
  author =       "Benjamin J. Lucas and Ali Alwan and Marion Murzello
                 and Yazheng Tu and Pengzhou He and Andrew J. Schwartz
                 and David Guevara and Ujjwal Guin and Kyle Juretus and
                 Jiafeng Xie",
  title =        "Lightweight Hardware Implementation of Binary
                 Ring-{LWE} {PQC} Accelerator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3160394",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Apr 14 17:00:32 2022",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Shin:2022:RSA,
  author =       "Yongwon Shin and Juseong Park and Jeongmin Hong and
                 Hyojin Sung",
  title =        "Runtime Support for Accelerating {CNN} Models on
                 Digital {DRAM} Processing-in-Memory Hardware",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "33--36",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3182363",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jin:2022:MPC,
  author =       "Hoyong Jin and Donghun Jeong and Taewon Park and Jong
                 Hwan Ko and Jungrae Kim",
  title =        "Multi-Prediction Compression: an Efficient and
                 Scalable Memory Compression Framework for {GP-GPU}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "37--40",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3177419",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kokkinis:2022:DOC,
  author =       "Argyris Kokkinis and Dionysios Diamantopoulos and
                 Kostas Siozios",
  title =        "Dynamic Optimization of On-Chip Memories for {HLS}
                 Targeting Many-Accelerator Platforms",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "41--44",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3190048",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Yun:2022:GND,
  author =       "Sungmin Yun and Byeongho Kim and Jaehyun Park and
                 Hwayong Nam and Jung Ho Ahn and Eojin Lee",
  title =        "{GraNDe}: Near-Data Processing Architecture With
                 Adaptive Matrix Mapping for Graph Convolutional
                 Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "45--48",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3182387",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Ma:2022:FBA,
  author =       "Rui Ma and Evangelos Georganas and Alexander Heinecke
                 and Sergey Gribok and Andrew Boutros and Eriko
                 Nurvitadhi",
  title =        "{FPGA-Based} {AI} Smart {NICs} for Scalable
                 Distributed {AI} Training Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3189207",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Hameed:2022:DPA,
  author =       "Fazal Hameed and Asif Ali Khan and Sebastien Ollivier
                 and Alex K. Jones and Jeronimo Castrillon",
  title =        "{DNA} Pre-Alignment Filter Using Processing Near
                 Racetrack Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "53--56",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3194263",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Yang:2022:SEP,
  author =       "Ling Yang and Libo Huang and Run Yan and Nong Xiao and
                 Sheng Ma and Li Shen and Weixia Xu",
  title =        "Stride Equality Prediction for Value Speculation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "57--60",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3195411",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Hong:2022:OMC,
  author =       "Jeongmin Hong and Sungjun Cho and Gwangsun Kim",
  title =        "Overcoming Memory Capacity Wall of {GPUs} With
                 Heterogeneous Memory Stack",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "61--64",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3196932",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Piccolboni:2022:ASS,
  author =       "Luca Piccolboni and Davide Giri and Luca P. Carloni",
  title =        "Accelerators \& Security: The Socket Approach",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "65--68",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3179947",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Yan:2022:CUH,
  author =       "Mingyu Yan and Mo Zou and Xiaocheng Yang and Wenming
                 Li and Xiaochun Ye and Dongrui Fan and Yuan Xie",
  title =        "Characterizing and Understanding {HGNNs} on {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "69--72",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3198281",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Accetti:2022:SCE,
  author =       "Cecil Accetti and Rendong Ying and Peilin Liu",
  title =        "Structured Combinators for Efficient Graph Reduction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "73--76",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3198844",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Omori:2022:OSH,
  author =       "Yu Omori and Keiji Kimura",
  title =        "Open-Source Hardware Memory Protection Engine
                 Integrated With {NVMM} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "77--80",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3197777",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kim:2022:CSD,
  author =       "Minjae Kim and Bryan S. Kim and Eunji Lee and Sungjin
                 Lee",
  title =        "A Case Study of a {DRAM-NVM} Hybrid Memory Allocator
                 for Key--Value Stores",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "81--84",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3197654",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Wang:2022:ISE,
  author =       "Zhengrong Wang and Christopher Liu and Tony Nowatzki",
  title =        "{Infinity Stream}: Enabling Transparent and Automated
                 In-Memory Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "85--88",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3203064",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Wu:2022:DCG,
  author =       "Lingxi Wu and Rasool Sharifi and Ashish Venkat and
                 Kevin Skadron",
  title =        "{DRAM-CAM}: General-Purpose Bit-Serial Exact Pattern
                 Matching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "89--92",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3201168",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Resch:2022:VSQ,
  author =       "Salonik Resch and Ulya Karpuzcu",
  title =        "On Variable Strength Quantum {ECC}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "93--96",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3200204",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Salvesen:2022:LAR,
  author =       "Peter Salvesen and Magnus Jahre",
  title =        "{LMT}: Accurate and Resource-Scalable Slowdown
                 Prediction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "97--100",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3203483",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Shin:2022:OOS,
  author =       "Gyeongcheol Shin and Junsoo Kim and Joo-Young Kim",
  title =        "{OpenMDS}: an Open-Source Shell Generation Framework
                 for High-Performance Design on {Xilinx} Multi-Die
                 {FPGAs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3202016",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jalili:2022:MPD,
  author =       "Majid Jalili and Mattan Erez",
  title =        "Managing Prefetchers With Deep Reinforcement
                 Learning",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3210397",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lenjani:2022:PAH,
  author =       "Marzieh Lenjani and Alif Ahmed and Kevin Skadron",
  title =        "{Pulley}: an Algorithm\slash Hardware Co-Optimization
                 for In-Memory Sorting",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "109--112",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3208255",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Sorting is an important kernel that requires many
                 passes on data, where each pass imposes significant
                 data movement overhead. Processing in memory (PIM) can
                 reduce this data movement overhead while providing high
                 parallelism. The radix sorting algorithm is scalable
                 and can exploit PIM's parallelism. However, this
                 algorithm is inefficient for current PIM-based
                 accelerators for three reasons: (i) requiring a large
                 intermediate array per processing unit, wasting
                 capacity, (ii) requiring a prefix-sum operation across
                 all the large intermediate arrays, imposing performance
                 overhead, and (iii) requiring significant random
                 accesses, which are costly in PIM. In this paper, we
                 propose an algorithm and hardware co-optimization for
                 sorting that enable every group of processing elements
                 to cooperatively share and generate an intermediate
                 array, reducing the capacity overhead of intermediate
                 arrays and performance overhead of the prefix-sum
                 operation. To prevent the shared array from becoming a
                 bottleneck due to random accesses, we eliminate random
                 accesses by adding a local sorting step to the radix
                 sorting and providing efficient hardware support for
                 this step. On average, our hardware/algorithm
                 optimizations, Pulley, deliver 20$ \times $ speedup
                 compared to Bonsai, an FPGA-based sorting accelerator,
                 and 13$ \times $ speedup compared to IMC, an
                 in-logic-layer-based sorting accelerator.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zhu:2022:RBP,
  author =       "Yongye Zhu and Shijia Wei and Mohit Tiwari",
  title =        "Revisiting Browser Performance Benchmarking From an
                 Architectural Perspective",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "113--116",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3210483",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Gouk:2022:PHA,
  author =       "Donghyun Gouk and Seungkwan Kang and Miryeong Kwon and
                 Junhyeok Jang and Hyunkyu Choi and Sangwon Lee and
                 Myoungsoo Jung",
  title =        "{PreGNN}: Hardware Acceleration to Take Preprocessing
                 Off the Critical Path in Graph Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "117--120",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3193256",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Wang:2022:CIR,
  author =       "Yinshen Wang and Wenming Li and Tianyu Liu and
                 Liangjiang Zhou and Bingnan Wang and Zhihua Fan and
                 Xiaochun Ye and Dongrui Fan and Chibiao Ding",
  title =        "Characterization and Implementation of Radar System
                 Applications on a Reconfigurable Dataflow
                 Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "121--124",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3215595",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Hou:2022:CUE,
  author =       "Xiaofeng Hou and Cheng Xu and Jiacheng Liu and Xuehan
                 Tang and Lingyu Sun and Chao Li and Kwang-Ting Cheng",
  title =        "Characterizing and Understanding End-to-End
                 Multi-Modal Neural Networks on {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "125--128",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3215718",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Nye:2022:SSS,
  author =       "Jared Nye and Omer Khan",
  title =        "{SSE}: Security Service Engines to Accelerate Enclave
                 Performance in Secure Multicore Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "129--132",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3210149",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Chacon:2022:HTT,
  author =       "Gino A. Chacon and Charles Williams and Johann
                 Knechtel and Ozgur Sinanoglu and Paul V. Gratz",
  title =        "Hardware {Trojan} Threats to Cache Coherence in Modern
                 {2.5D} Chiplet Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "133--136",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3216820",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Eeckhout:2022:FOM,
  author =       "Lieven Eeckhout",
  title =        "A First-Order Model to Assess Computer Architecture
                 Sustainability",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "137--140",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3217366",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zhou:2022:LPL,
  author =       "Ranyang Zhou and Sepehr Tabrizchi and Arman Roohi and
                 Shaahin Angizi",
  title =        "{LT-PIM}: an {LUT-Based} {Processing-in-DRAM}
                 Architecture With {RowHammer} Self-Tracking",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "141--144",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3220084",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Park:2022:SML,
  author =       "Jongwon Park and Jinkyu Jeong",
  title =        "Speculative Multi-Level Access in {LSM} Tree-Based
                 {KV} Store",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "145--148",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3219808",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Fariborz:2022:MSB,
  author =       "Marjan Fariborz and Mahyar Samani and Terry O'Neill
                 and Jason Lowe-Power and S. J. Ben Yoo and Venkatesh
                 Akella",
  title =        "A Model for Scalable and Balanced Accelerators for
                 Graph Processing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "149--152",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3215489",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Huang:2022:EDC,
  author =       "Jianming Huang and Yu Hua",
  title =        "Ensuring Data Confidentiality in {eADR-Based} {NVM}
                 Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "153--156",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3225949",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Kim:2022:SSE,
  author =       "Sejin Kim and Jungwoo Kim and Yongjoo Jang and Jaeha
                 Kung and Sungjin Lee",
  title =        "{SEMS}: Scalable Embedding Memory System for
                 Accelerating Embedding-Based {DNNs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "21",
  number =       "2",
  pages =        "157--160",
  month =        jul # "\slash " # dec,
  year =         "2022",
  DOI =          "https://doi.org/10.1109/LCA.2022.3227560",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jimenez:2023:LLC,
  author =       "Daniel A. Jim{\'e}nez and Elvira Teran and Paul V.
                 Gratz",
  title =        "Last-Level Cache Insertion and Promotion Policy in the
                 Presence of Aggressive Prefetching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3242178",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Moon:2023:AAD,
  author =       "Yaebin Moon and Wanju Doh and Kwanhee Kyung and Eojin
                 Lee and Jung Ho Ahn",
  title =        "{ADT}: Aggressive Demotion and Promotion for Tiered
                 Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3236685",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Park:2023:CEE,
  author =       "Gyeongseo Park and Ki-Dong Kang and Minho Kim and
                 Daehoon Kim",
  title =        "{CoreNap}: Energy Efficient Core Allocation for
                 Latency-Critical Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2022.3227629",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Sim:2023:CCM,
  author =       "Joonseop Sim and Soohong Ahn and Taeyoung Ahn and
                 Seungyong Lee and Myunghyun Rhee and Jooyoung Kim and
                 Kwangsik Shin and Donguk Moon and Euiseok Kim and
                 Kyoung Park",
  title =        "Computational {CXL-Memory} Solution for Accelerating
                 Memory-Intensive Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2022.3226482",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Ringlein:2023:ACD,
  author =       "Burkhard Ringlein and Francois Abel and Dionysios
                 Diamantopoulos and Beat Weiss and Christoph Hagleitner
                 and Dietmar Fey",
  title =        "Advancing Compilation of {DNNs} for {FPGAs} Using
                 Operation Set Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2022.3227643",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lee:2023:HHF,
  author =       "Seonho Lee and Ranggi Hwang and Jongse Park and Minsoo
                 Rhu",
  title =        "{HAMMER}: Hardware-Friendly Approximate Computing for
                 Self-Attention With Mean-Redistribution and
                 Linearization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2022.3233832",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Bae:2023:ISF,
  author =       "Hanyeoreum Bae and Donghyun Gouk and Seungjun Lee and
                 Jiseon Kim and Sungjoon Koh and Jie Zhang and Myoungsoo
                 Jung",
  title =        "Intelligent {SSD} Firmware for Zero-Overhead
                 Journaling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3243695",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Zhao:2023:RAL,
  author =       "Xia Zhao and Guangda Zhang and Lu Wang and Yangmei Li
                 and Yongjun Zhang",
  title =        "{RouteReplies}: Alleviating Long Latency in
                 Many-Chip-Module {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3255555",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Weston:2023:SLI,
  author =       "Kevin Weston and Farabi Mahmud and Vahid Janfaza and
                 Abdullah Muzahid",
  title =        "{SmartIndex}: Learning to Index Caches to Improve
                 Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3264478",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Khoram:2023:EEB,
  author =       "Soroosh Khoram and Kyle Daruwalla and Mikko Lipasti",
  title =        "Energy-Efficient {Bayesian} Inference Using Bitstream
                 Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "37--40",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3238584",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Brana:2023:KSC,
  author =       "Jennifer Brana and Brian C. Schwedock and Yatin A.
                 Manerkar and Nathan Beckmann",
  title =        "{Kobold}: Simplified Cache Coherence for
                 Cache-Attached Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "1",
  pages =        "41--44",
  month =        jan # "\slash " # jun,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3269399",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jeon:2023:HAR,
  author =       "Kiseok Jeon and Junghee Lee and Bumsoo Kim and James
                 J. Kim",
  title =        "Hardware Accelerated Reusable {Merkle} Tree Generation
                 for Bitcoin Blockchain Headers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "69--72",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3289515",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/bitcoin.bib;
                 https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lee:2023:CDC,
  author =       "Hwanjun Lee and Seunghak Lee and Yeji Jung and Daehoon
                 Kim",
  title =        "{T-CAT}: Dynamic Cache Allocation for Tiered Memory
                 Systems With Memory Interleaving",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "73--76",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3290197",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jeong:2023:LLA,
  author =       "Ipoom Jeong and Jiaqi Lou and Yongseok Son and Yongjoo
                 Park and Yifan Yuan and Nam Sung Kim",
  title =        "{LADIO}: Leakage-Aware Direct {I/O} for
                 {I/O}-Intensive Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "77--80",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3290427",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Deshpande:2023:TPB,
  author =       "Chandana S. Deshpande and Arthur Perais and
                 Fr{\'e}d{\'e}ric P{\'e}trot",
  title =        "Toward Practical 128-Bit General Purpose
                 Microarchitectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "81--84",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3287762",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Intel introduced 5-level paging mode to support 57-bit
                 virtual address space in 2017. This, coupled to
                 paradigms where backup storage can be accessed through
                 load and store instructions (e.g., non volatile
                 memories), lets us envision a future in which a 64-bit
                 address space has become insufficient. In that event,
                 the straightforward solution would be to adopt a flat
                 128-bit address space. In this early stage letter, we
                 conduct high-level experiments that lead us to suggest
                 a possible general-purpose processor micro-architecture
                 providing 128-bit support with limited hardware cost.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Tzenetopoulos:2023:DLD,
  author =       "Achilleas Tzenetopoulos and Dimosthenis Masouros and
                 Dimitrios Soudris and Sotirios Xydis",
  title =        "{DVFaaS}: Leveraging {DVFS} for {FaaS} Workflows",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "85--88",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3288089",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Nam:2023:XRD,
  author =       "Hwayong Nam and Seungmin Baek and Minbok Wi and
                 Michael Jaemin Kim and Jaehyun Park and Chihun Song and
                 Nam Sung Kim and Jung Ho Ahn",
  title =        "{X}-ray: Discovering {DRAM} Internal Structure and
                 Error Characteristics by Issuing Memory Commands",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "89--92",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3296153",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The demand for accurate information about the internal
                 structure and characteristics of DRAM has been on the
                 rise. Recent studies have explored the structure and
                 characteristics of DRAM to improve processing in
                 memory, enhance reliability, and mitigate a
                 vulnerability known as rowhammer. However, DRAM
                 manufacturers only disclose limited information through
                 official documents, making it difficult to find
                 specific information about actual DRAM devices. This
                 paper presents reliable findings on the internal
                 structure and characteristics of DRAM using
                 activate-induced bitflips (AIBs), retention time test,
                 and row-copy operation. While previous studies have
                 attempted to understand the internal behaviors of DRAM
                 devices, they have only shown results without
                 identifying the causes or have analyzed DRAM modules
                 rather than individual chips. We first uncover the
                 size, structure, and operation of DRAM subarrays and
                 verify our findings on the characteristics of DRAM.
                 Then, we correct misunderstood information related to
                 AIBs and demonstrate experimental results supporting
                 the cause of rowhammer.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Nematallah:2023:ELS,
  author =       "Ahmed Nematallah and Chang Hyun Park and David
                 Black-Schaffer",
  title =        "Exploring the Latency Sensitivity of Cache Replacement
                 Policies",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "93--96",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3296251",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Mosquera:2023:GCC,
  author =       "Fernando Mosquera and Krishna Kavi and Gayatri Mehta
                 and Lizy John",
  title =        "Guard Cache: Creating Noisy Side-Channels",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "97--100",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3289710",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Mars:2023:JPP,
  author =       "Jason Mars and Yiping Kang and Roland Daynauth and
                 Baichuan Li and Ashish Mahendra and Krisztian Flautner
                 and Lingjia Tang",
  title =        "The {Jaseci} Programming Paradigm and Runtime Stack:
                 Building Scale-Out Production Applications Easy and
                 Fast",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3274038",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Hossain:2023:SDA,
  author =       "Naorin Hossain and Alper Buyuktosunoglu and John-David
                 Wellman and Pradip Bose and Margaret Martonosi",
  title =        "{SoCurity}: a Design Approach for Enhancing {SoC}
                 Security",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3301448",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Feng:2023:SOW,
  author =       "Justin Feng and Fatemeh Arkannezhad and Christopher
                 Ryu and Enoch Huang and Siddhant Gupta and Nader
                 Sehatbakhsh",
  title =        "Simulating Our Way to Safer Software: a Tale of
                 Integrating Microarchitecture Simulation and Leakage
                 Estimation Modeling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "109--112",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3303913",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Wed Sep 13 17:35:03 2023",
  bibsource =    "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Choi:2023:UPP,
  author =       "Jaewan Choi and Jaehyun Park and Kwanhee Kyung and Nam
                 Sung Kim and Jung Ho Ahn",
  title =        "Unleashing the Potential of {PIM}: Accelerating Large
                 Batched Inference of Transformer-Based Generative
                 Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "113--116",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3305386",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "attention; Computational modeling; Context modeling;
                 Decoding; Matrix converters; Memory management;
                 processing-in-memory; Throughput; Transformer-based
                 generative model; Transformers",
}

@Article{Kim:2023:HAC,
  author =       "Yonghae Kim and Anurag Kar and Jaewon Lee and Jaekyu
                 Lee and Hyesoon Kim",
  title =        "Hardware-Assisted Code-Pointer Tagging for
                 Forward-Edge Control-Flow Integrity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "117--120",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3306326",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Authentication; Benchmark testing; CFI; Codes; CPT;
                 Hardware; memory safety; Prototypes; RISC-V BOOM;
                 Software; Tagging",
}

@Article{Saileshwar:2023:MBM,
  author =       "Gururaj Saileshwar and Moinuddin Qureshi",
  title =        "The Mirage of Breaking {MIRAGE}: Analyzing the
                 Modeling Pitfalls in Emerging Attacks on {MIRAGE}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "121--124",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3297875",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; Cache side-channel attacks;
                 Ciphers; Codes; Computer bugs; Indexing; randomized
                 caches; Security; Side-channel attacks",
}

@Article{Lo:2023:LLV,
  author =       "Yun-Chen Lo and Yu-Chih Tsai and Ren-Shuo Liu",
  title =        "{LV}: Latency-Versatile Floating-Point Engine for
                 High-Performance Deep Neural Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "125--128",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3287096",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adders; Approximate computation; Artificial neural
                 networks; Clocks; Computer architecture; Electric
                 breakdown; Engines; floating point; latency-versatile
                 architecture; Registers",
}

@Article{Goudarzi:2023:SBP,
  author =       "Maziar Goudarzi and Reza Azimi and Julian Humecki and
                 Faizaan Rehman and Richard Zhang and Chirag Sethi and
                 Tanishq Bomman and Yuqi Yang",
  title =        "By-Software Branch Prediction in Loops",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "129--132",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3304613",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "C.0.b hardware/software interfaces; C.1.1.b pipeline
                 processors; C.1.5.a instruction fetch; Codes; D.3.4.b
                 compilers; Hardware; Monitoring; Optimization; Program
                 processors; Software; Target tracking",
}

@Article{Yun:2023:FPP,
  author =       "Yugyoung Yun and Eunhyeok Park",
  title =        "Fast Performance Prediction for Efficient Distributed
                 {DNN} Training",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "133--136",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3316452",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D parallelism; Costs; Distributed training; large
                 language model; Optimization; Parallel processing;
                 Performance evaluation; performance modeling; Tensors;
                 Throughput; Training",
}

@Article{Wu:2023:CUD,
  author =       "Meng Wu and Mingyu Yan and Xiaocheng Yang and Wenming
                 Li and Zhimin Zhang and Xiaochun Ye and Dongrui Fan",
  title =        "Characterizing and Understanding Defense Methods for
                 {GNNs} on {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "137--140",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3304638",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "defense; Electric breakdown; Estimation; execution
                 pattern; execution semantic; Graph neural networks;
                 Graphics processing units; Kernel; overhead;
                 Perturbation methods; Purification; Training",
}

@Article{Patel:2023:TIP,
  author =       "Pratyush Patel and Zibo Gong and Syeda Rizvi and Esha
                 Choukse and Pulkit Misra and Thomas Anderson and
                 Akshitha Sriraman",
  title =        "Towards Improved Power Management in Cloud {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "141--144",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3278652",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Cloud computing; design for power delivery
                 limits; Graphics processing units; graphics processors;
                 Monitoring; Performance evaluation; Power management;
                 Power system management; servers; Servers; super (very
                 large) computers",
}

@Article{Zhang:2023:BPA,
  author =       "Shiqing Zhang and Mahmood Naderan-Tahan and Magnus
                 Jahre and Lieven Eeckhout",
  title =        "Balancing Performance Against Cost and Sustainability
                 in Multi-Chip-Module {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "145--148",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3313203",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Aggregates; Bandwidth; Costs; Graphics processing
                 units; Manufacturing; Sustainable development;
                 Switches",
}

@Article{Park:2023:DHP,
  author =       "Chanyoung Park and Chun-Yi Liu and Kyungtae Kang and
                 Mahmut Kandemir and Wonil Choi",
  title =        "Design of a High-Performance, High-Endurance Key-Value
                 {SSD} for Large-Key Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "149--152",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3282276",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Blogs; Data structures; Key-value SSD; large-key
                 workloads; Micromechanical devices; Performance
                 evaluation; Random access memory; Social networking
                 (online); Tail",
}

@Article{Liu:2023:ILG,
  author =       "Jie Liu and Zhongyuan Zhao and Zijian Ding and
                 Benjamin Brock and Hongbo Rong and Zhiru Zhang",
  title =        "An Intermediate Language for General Sparse Format
                 Customization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "153--156",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3262610",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Codes; Compilers; Hardware; heterogeneous (hybrid)
                 systems; Indexes; Kernel; Layout; Metadata; sparse
                 linear algebra; specialized application languages;
                 Tensors",
}

@Article{Lee:2023:NPR,
  author =       "Seunghak Lee and Ki-Dong Kang and Gyeongseo Park and
                 Nam Sung Kim and Daehoon Kim",
  title =        "{NoHammer}: Preventing Row Hammer With Last-Level
                 Cache Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "157--160",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3320670",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Degradation; DRAM; Indexes; Last-level cache
                 management; Memory management; Proposals; Random access
                 memory; reliability; Reverse engineering; row hammer;
                 Threat modeling",
}

@Article{Escofet:2023:HQA,
  author =       "Pau Escofet and Anabel Ovide and Carmen G. Almudever
                 and Eduard Alarc{\'o}n and Sergi Abadal",
  title =        "{Hungarian} Qubit Assignment for Optimized Mapping of
                 Quantum Circuits on Multi-Core Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "161--164",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3318857",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Computers; Costs; Logic gates;
                 Mapping of quantum algorithms; multi-core quantum
                 computing architectures; Partitioning algorithms;
                 Quantum computing; quantum computing; Qubit",
}

@Article{Lu:2023:FEA,
  author =       "Lingfei Lu and Yudi Qiu and Shiyan Yi and Yibo Fan",
  title =        "A Flexible Embedding-Aware Near Memory Processing
                 Architecture for Recommendation System",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "165--168",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3305668",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer architecture; data partition;
                 Fans; Kernel; near memory processing; Random access
                 memory; Recommendation system; Recommender systems;
                 Social networking (online)",
}

@Article{Li:2023:HFT,
  author =       "Hailong Li and Jaewan Choi and Yongsuk Kwon and Jung
                 Ho Ahn",
  title =        "A Hardware-Friendly Tiled Singular-Value
                 Decomposition-Based Matrix Multiplication for
                 Transformer-Based Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "169--172",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3323482",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; GPU; Graphics processing
                 units; Kernel; Matrix decomposition; Natural language
                 processing; Task analysis; tiled singular vector
                 decomposition; Transformer-based model; Transformers",
}

@Article{Hastings:2023:ASR,
  author =       "Adam Hastings and Ryan Piersma and Simha
                 Sethumadhavan",
  title =        "Architectural Security Regulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "173--176",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3327952",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Costs; Games; Government; Modeling techniques;
                 Regulation; Regulators; Safety; Security; security
                 regulation; support for security",
}

@Article{Trochatos:2023:QCT,
  author =       "Theodoros Trochatos and Chuanqi Xu and Sanjay
                 Deshpande and Yao Lu and Yongshan Ding and Jakub
                 Szefer",
  title =        "A Quantum Computer Trusted Execution Environment",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "177--180",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3325852",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Attenuation; Cloud computing; cloud computing;
                 Computer security; control pulses; Cryptography;
                 dilution refrigerator; Hardware; Logic gates;
                 obfuscation; Quantum computing; quantum computing;
                 Qubit; RF switches",
}

@Article{Wu:2023:RAI,
  author =       "Peiyun Wu and Trung Le and Zhichun Zhu and Zhao
                 Zhang",
  title =        "Redundant Array of Independent Memory Devices",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "181--184",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3334989",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Data transfer; Error correction codes; Layout; Memory
                 management; Memory systems; mini-rank; multi-bit
                 errors; Organizations; parity; Performance evaluation;
                 redundant array; Standards organizations",
}

@Article{Garcia-Mallen:2023:TAD,
  author =       "Jonathan Garcia-Mallen and Shuohao Ping and Alex
                 Miralles-Cordal and Ian Martin and Mukund Ramakrishnan
                 and Yipeng Huang",
  title =        "Towards an Accelerator for Differential and Algebraic
                 Equations Useful to Scientists",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "22",
  number =       "2",
  pages =        "185--188",
  month =        jul # "\slash " # dec,
  year =         "2023",
  DOI =          "https://doi.org/10.1109/LCA.2023.3332318",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Convergence; Differential equations; Field
                 programmable gate arrays; Hardware; Hyperbolic
                 equations; Iterative methods; iterative methods;
                 reconfigurable hardware; Registers; Scientific
                 computing",
}

@Article{Vieira:2024:GAP,
  author =       "Jo{\~a}o Vieira and Nuno Roma and Gabriel Falcao and
                 Pedro Tom{\'a}s",
  title =        "{gem5-accel}: a Pre-{RTL} Simulation Toolchain for
                 Accelerator Architecture Validation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2023.3329443",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator modeling; Central Processing Unit;
                 complete system emulation; Computer architecture;
                 Hardware acceleration; Kernel; Process control; Random
                 access memory; Registers; Simulation toolchain",
}

@Article{Gheibi-Fetrat:2024:TTF,
  author =       "Atiyeh Gheibi-Fetrat and Negar Akbarzadeh and Shaahin
                 Hessabi and Hamid Sarbazi-Azad",
  title =        "{Tulip}: Turn-Free Low-Power Network-on-Chip",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2023.3339646",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "area; Chip Multiprocessor (CMP); crossbar; Integrated
                 circuits; Mesh networks; Network topology;
                 Network-on-chip; Network-on-Chip (NoC); power
                 consumption; router; Routing; System recovery;
                 System-on-Chip (SoC); Topology",
}

@Article{Ueno:2024:ITB,
  author =       "Yosuke Ueno and Yuna Tomida and Teruo Tanimoto and
                 Masamitsu Tanaka and Yutaka Tabuchi and Koji Inoue and
                 Hiroshi Nakamura",
  title =        "Inter-Temperature Bandwidth Reduction in Cryogenic
                 {QAOA} Machines",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2023.3322700",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer architecture; cryogenic
                 electronics; Cryogenics; Logic gates; Quantum
                 computing; quantum computing; Qubit; qubit;
                 Superconducting cables; superconducting logic
                 circuits",
}

@Article{Kim:2024:FAD,
  author =       "Hyeseong Kim and Yunjae Lee and Minsoo Rhu",
  title =        "{FPGA}-Accelerated Data Preprocessing for Personalized
                 Recommendation Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2023.3336841",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Data models; Data preprocessing; data preprocessing;
                 Feature extraction; FPGA; Graphics processing units;
                 Personalized recommendation system; Servers;
                 Throughput; training; Training",
}

@Article{Peltekis:2024:DDM,
  author =       "Christodoulos Peltekis and Vasileios Titopoulos and
                 Chrysostomos Nicopoulos and Giorgos Dimitrakopoulos",
  title =        "{DeMM}: a Decoupled Matrix Multiplication Engine
                 Supporting Relaxed Structured Sparsity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2024.3355178",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Engines; Hardware; Indexes;
                 Machine learning accelerator; matrix-multiplication
                 engine; Organizations; Sparse matrices; structured
                 sparsity; Systolic arrays; systolic computation",
}

@Article{Corontzos:2024:DCD,
  author =       "Caden Corontzos and Eitan Frachtenberg",
  title =        "Direct-Coding {DNA} With Multilevel Parallelism",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2024.3355109",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Decoding; DNA; DNA encoding; Encoding; Genomics;
                 Instruction sets; parallel architectures; Random access
                 memory; Throughput",
}

@Article{Ayanzadeh:2024:ERR,
  author =       "Ramin Ayanzadeh and Moinuddin Qureshi",
  title =        "Enhancing the Reach and Reliability of Quantum
                 Annealers by Pruning Longer Chains",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2023.3340030",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adiabatic quantum computing; Annealing; Computers;
                 embedding; Hardware; power-law; quantum annealers;
                 Quantum annealing; Quantum circuit; Quantum computing;
                 Qubit",
}

@Article{Golden:2024:SVV,
  author =       "Courtney Golden and Dan Ilan and Caroline Huang and
                 Niansong Zhang and Zhiru Zhang and Christopher Batten",
  title =        "Supporting a Virtual Vector Instruction Set on a
                 Commercial Compute-in-{SRAM} Accelerator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2023.3341389",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; hardware/software interfaces;
                 In-memory computing; Instruction sets; Latches;
                 Microarchitecture; Process control; Programming;
                 Registers",
}

@Article{Thomas:2024:BMT,
  author =       "Samuel Thomas and Kidus Workneh and Ange-Thierry
                 Ishimwe and Zack McKevitt and Phaedra Curlin and R.
                 Iris Bahar and Joseph Izraelevitz and Tamara Lehman",
  title =        "Baobab {Merkle} Tree for Efficient Secure Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "23",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2024",
  DOI =          "https://doi.org/10.1109/LCA.2024.3360709",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Encryption; encryption; Indexes;
                 integrity; Memory management; Metadata; Protocols;
                 secure memory; Security; System-on-chip",
}

@article{Cho:2024:EEA,
  author      = {Minsik Cho and Keivan A. Vahid and Qichen Fu and
                 Saurabh Adya and Carlo C. {Del Mundo} and Mohammad
                 Rastegari and Devang Naik and Peter Zatloukal},
  title       = {{eDKM}: an Efficient and Accurate Train-Time Weight
                 Clustering for Large Language Models},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {37--40},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2024.3363492},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {artificial intelligence; Complexity theory;
                 Computational and artificial intelligence; deep
                 learning; Graphics processing units; Indexes; learning
                 systems; machine learning; Memory; Optimization;
                 Sharding; Tensors},
}

@article{Kim:2024:ADR,
  author      = {Yang-Gon Kim and Yun-Ki Han and Jae-Kang Shin and
                 Jun-Kyum Kim and Lee-Sup Kim},
  title       = {Accelerating Deep Reinforcement Learning via
                 Phase-Level Parallelism for Robotics Applications},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {41--44},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2023.3341152},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {Backpropagation; Computer systems organization;
                 Graphics processing units; Hardware; Legged locomotion;
                 mobile computing; neural nets; Reinforcement learning;
                 Robots; Training},
}

@article{Yang:2024:JIJ,
  author      = {Yuxin Yang and Xiaoming Chen and Yinhe Han},
  title       = {{JANM-IK}: {Jacobian} Argumented {Nelder--Mead}
                 Algorithm for Inverse Kinematics and its Hardware
                 Acceleration},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {45--48},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2024.3369940},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {accelerator; Convergence; End effectors; Field
                 programmable gate arrays; inverse kinematics; Jacobian;
                 Jacobian matrices; Kinematics; nelder-mead;
                 Perturbation methods; Robotics; Robots;
                 software-hardware co-design},
}

@article{Hafezan:2024:IEE,
  author      = {Mohammad Hafezan and Ehsan Atoofian},
  title       = {Improving Energy-Efficiency of Capsule Networks on
                 Modern {GPUs}},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {49--52},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2024.3365149},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {CapsNet; Computer architecture; energy-efficiency;
                 GPU; Graphics processing units; Hidden Markov models;
                 Instruction sets; Matrix converters; Registers; tensor
                 core; Vectors},
}

@article{Nagabhiru:2024:AFP,
  author      = {Mahita Nagabhiru and Gregory T. Byrd},
  title       = {Achieving Forward Progress Guarantee in Small Hardware
                 Transactions},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {53--56},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2024.3370992},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {Atomics; Coherence; compare-and-swap; concurrency;
                 Data structures; forward progress; Hardware; hardware
                 transactional memory; Instruction sets; lock-free;
                 multi-word-compare-and-swap; multithreading;
                 non-blocking; Programming; Protocols; Software},
}

@article{Ma:2024:PFA,
  author      = {Rui Ma and Jia-Ching Hsu and Ali Mansoorshahi and
                 Joseph Garvey and Michael Kinsner and Deshanand Singh
                 and Derek Chiou},
  title       = {{Primate}: a Framework to Automatically Generate Soft
                 Processors for Network Applications},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {57--60},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2024.3358839},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {Codes; Design methodology; domain-specific
                 accelerators; Field programmable gate arrays;
                 flexibility; Libraries; programmability; Registers;
                 Software; Throughput; VLIW},
}

@article{France:2024:RSA,
  author      = {Lo{\"\i}c France and Florent Bruguier and David Novo
                 and Maria Mushtaq and Pascal Benoit},
  title       = {Reducing the Silicon Area Overhead of Counter-Based
                 Rowhammer Mitigations},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {61--64},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2023.3328824},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {Capacitors; Computer security; DRAM; Proposals; Random
                 access memory; rowhammer; Security; Silicon; Timing;
                 Transistors},
}

@article{Yavits:2024:DCD,
  author      = {L. Yavits},
  title       = {{DRAMA}: Commodity {DRAM} Based Content Addressable
                 Memory},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {65--68},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2023.3341830},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {CAM; DNA; DRAM; Hardware; Humanities; Random access
                 memory; Three-dimensional displays; Timing; Voltage},
}

@article{Mishra:2024:ASA,
  author      = {Deepanjali Mishra and Konstantinos Kanellopoulos and
                 Ashish Panwar and Akshitha Sriraman and Vivek Seshadri
                 and Onur Mutlu and Todd C. Mowry},
  title       = {Address Scaling: Architectural Support for
                 Fine-Grained Thread-Safe Metadata Management},
  journal     = j-IEEE-COMPUT-ARCHIT-LETT,
  volume      = {23},
  number      = {1},
  pages       = {69--72},
  month       = jan # {\slash } # jun,
  year        = {2024},
  doi         = {https://doi.org/10.1109/LCA.2024.3373760},
  issn        = {1556-6056 (print), 1556-6064 (electronic)},
  issn-l      = {1556-6056},
  fjournal    = {IEEE Computer Architecture Letters},
  journal-URL = {https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords    = {Complexity theory; Computer bugs; Data structures;
                 dynamic program monitoring tools; Hardware;
                 intermediate address space; Metadata; metadata
                 management; Monitoring; Synthetic aperture sonar;
                 Virtual memory},
}
%%% [17-May-2023] Check the latest issue: papers are added to it during each half year.