%%% (removed stray Web-page badge text "Valid HTML 4.0! Valid CSS!" that
%%% preceded the file header)
%%% -*-BibTeX-*-
%%% ====================================================================
%%%  BibTeX-file{
%%%     author          = "Nelson H. F. Beebe",
%%%     version         = "1.00",
%%%     date            = "21 June 2019",
%%%     time            = "08:09:58 MDT",
%%%     filename        = "ieeecomputarchitlett.bib",
%%%     address         = "University of Utah
%%%                        Department of Mathematics, 110 LCB
%%%                        155 S 1400 E RM 233
%%%                        Salt Lake City, UT 84112-0090
%%%                        USA",
%%%     telephone       = "+1 801 581 5254",
%%%     FAX             = "+1 801 581 4148",
%%%     URL             = "http://www.math.utah.edu/~beebe",
%%%     checksum        = "43922 28247 147603 1505658",
%%%     email           = "beebe at math.utah.edu, beebe at acm.org,
%%%                        beebe at computer.org (Internet)",
%%%     codetable       = "ISO/ASCII",
%%%     keywords        = "BibTeX; bibliography; IEEE Computer
%%%                        Architecture Letters",
%%%     license         = "public domain",
%%%     supported       = "yes",
%%%     docstring       = "This is a COMPLETE bibliography of
%%%                        publications in the journal IEEE Computer
%%%                        Architecture Letters (CODEN none, ISSN
%%%                        1556-6056 (print), 1556-6064 (electronic)).
%%%                        Publication began with volume 1, number 1,
%%%                        in January 2002, and there was only one
%%%                        issue per annual volume through 2005.  Since
%%%                        volume 5 (2006), there are only two issues
%%%                        per volume.
%%%
%%%                        The journal has Web sites at
%%%
%%%                            https://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208
%%%                            https://ieeexplore.ieee.org/xpl/issues?punumber=10208&isnumber=8610345
%%%
%%%                        At version 1.00, the COMPLETE year coverage
%%%                        looked like this:
%%%
%%%                             2002 (  12)    2008 (  21)    2014 (  36)
%%%                             2003 (   7)    2009 (  34)    2015 (  52)
%%%                             2004 (   9)    2010 (  32)    2016 (  49)
%%%                             2005 (   2)    2011 (  25)    2017 (  42)
%%%                             2006 (  18)    2012 (  27)    2018 (  61)
%%%                             2007 (  14)    2013 (  29)    2019 (  22)
%%%
%%%                             Article:        492
%%%
%%%                             Total entries:  492
%%%
%%%                        Data for this bibliography have been derived
%%%                        primarily from the publisher Web site, and
%%%                        from the Web of Science Web site.
%%%
%%%                        Numerous errors in the Web sources noted
%%%                        above have been corrected.  Spelling has been
%%%                        verified with the UNIX spell and GNU ispell
%%%                        programs using the exception dictionary
%%%                        stored in the companion file with extension
%%%                        .sok.
%%%
%%%                        BibTeX citation tags are uniformly chosen
%%%                        as name:year:abbrev, where name is the
%%%                        family name of the first author or editor,
%%%                        year is a 4-digit number, and abbrev is a
%%%                        3-letter condensation of important title
%%%                        words. Citation tags were automatically
%%%                        generated by software developed for the
%%%                        BibNet Project.
%%%
%%%                        The checksum field above contains a CRC-16
%%%                        checksum as the first value, followed by the
%%%                        equivalent of the standard UNIX wc (word
%%%                        count) utility output of lines, words, and
%%%                        characters.  This is produced by Robert
%%%                        Solovay's checksum utility.",
%%%  }
%%% ====================================================================
%%% Define \booktitle (as emphasized text) only if the citing document
%%% has not already defined it, so entries render under any style.
@Preamble{ "\ifx \undefined \booktitle \def \booktitle#1{{{\em #1}}} \fi" }

%%% ====================================================================
%%% Acknowledgement abbreviations:
%%% Contact details of the bibliographer; referenced via the
%%% acknowledgement field of every entry below.
@String{ack-nhfb = "Nelson H. F. Beebe,
                    University of Utah,
                    Department of Mathematics, 110 LCB,
                    155 S 1400 E RM 233,
                    Salt Lake City, UT 84112-0090, USA,
                    Tel: +1 801 581 5254,
                    FAX: +1 801 581 4148,
                    e-mail: \path|beebe@math.utah.edu|,
                            \path|beebe@acm.org|,
                            \path|beebe@computer.org| (Internet),
                    URL: \path|http://www.math.utah.edu/~beebe/|"}

%%% ====================================================================
%%% Journal abbreviations:
%%% Journal-name macro used by the journal field of every entry below,
%%% so the full name is defined in exactly one place.
@String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"}

%%% ====================================================================
%%% Bibliography entries, sorted in publication order with ``bibsort
%%% --byvolume'':
%%% Abstract was truncated without a final period in the Web source;
%%% terminal punctuation restored.
@Article{Alvarez:2002:IRF,
  author =       "C. Alvarez and J. Corbal and E. Salami and M. Valero",
  title =        "Initial Results on Fuzzy Floating Point Computation
                 for Multimedia Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "During the recent years the market of mid low end
                 portable systems such as PDAs or mobile digital phones
                 have experimented a revolution in both selling volume
                 and features as handheld devices incorporate Multimedia
                 applications. This fact brings to an increase in the
                 computational demands of the devices while still having
                 the limitation of power and energy consumption.
                 Instruction memoization is a promising technique to
                 help alleviate the problem of power consumption of
                 expensive functional units such as the floating point
                 one. Unfortunately this technique could be energy
                 inefficient for low end systems due to the additional
                 power consumption of the relatively big tables
                 required. In this paper we present a novel way of
                 understanding multimedia floating point operations
                 based on the fuzzy computation paradigm losses in the
                 computation precision may exchange performance for
                 negligible errors in the output. Exploiting the
                 implicit characteristics of media FP computation we
                 propose a new technique called fuzzy memoization. Fuzzy
                 memoization expands the capabilities of classic
                 memoization by attaching entries with similar inputs to
                 the same output. We present a case of study for a SH
                 like processor and report good performance and power
                 delay improvements with feasible hardware
                 requirements.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Delay; Energy consumption; Fuzzy systems; Handheld
                 computers; Joining processes; Mobile computing;
                 Multimedia systems; Performance loss; Personal digital
                 assistants; Portable computers",
}

%%% IEEE CAL 1(1), 2002: tuning configurable loop caches to a fixed
%%% embedded program for power/performance gains.
@Article{Gordon-Ross:2002:EFP,
  author =       "A. Gordon-Ross and S. Cotterell and F. Vahid",
  title =        "Exploiting Fixed Programs in Embedded Systems: A Loop
                 Cache Example",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Embedded systems commonly execute one program for
                 their lifetime. Designing embedded system architectures
                 with configurable components, such that those
                 components can be tuned to that one program based on a
                 program pre-analysis, can yield significant power and
                 performance benefits. We illustrate such benefits by
                 designing a loop cache specifically with tuning in
                 mind. Our results show a 70\% reduction in instruction
                 memory access, for MIPS and 8051 processors
                 representing twice the reduction from a regular loop
                 cache, translating to good power savings.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecture tuning; Computer architecture; Computer
                 science; Costs; Digital cameras; Embedded computing;
                 Embedded system; embedded systems.; fixed program; Loop
                 cache; low power; Microcomputers; Microprocessor chips;
                 Portable computers; Power engineering computing",
}

%%% IEEE CAL 1(1), 2002: two-level banked TLB design for reducing
%%% power consumption in embedded processors.
@Article{Choi:2002:LPT,
  author =       "Jin-Hyuck Choi and Jung-Hoon Lee and Seh-Woong Jeong
                 and Shin-Dug Kim and C. Weems",
  title =        "A Low Power {TLB} Structure for Embedded Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "3--3",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present a new two-level TLB (translation look-aside
                 buffer) architecture that integrates a 2-way banked
                 filter TLB with a 2-way banked main TLB. The objective
                 is to reduce power consumption in embedded processors
                 by distributing the accesses to TLB entries across the
                 banks in a balanced manner. First, an advanced
                 filtering technique is devised to reduce access power
                 by adopting a sub-bank structure. Second, a
                 bank-associative structure is applied to each level of
                 the TLB hierarchy. Simulation results show that the
                 Energy*Delay product can be reduced by about 40.9\%
                 compared to a fully associative TLB, 24.9\% compared to
                 a micro-TLB with 4+32 entries, and 12.18\% compared to
                 a micro-TLB with 16+32 entries.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bank associative structure; CADCAM; Circuits; Computer
                 aided manufacturing; Degradation; Embedded system;
                 Energy consumption; Filter bank; filter mechanism;
                 Filtering; low power design; Power filters; translation
                 look-aside buffer; Virtual private networks",
}

%%% IEEE CAL 1(1), 2002: polynomial-time algorithm (via bipartite
%%% maximum-weight matching) for worst-case traffic of oblivious routing.
@Article{Towles:2002:WCT,
  author =       "B. Towles and W. J. Dally",
  title =        "Worst-case Traffic for Oblivious Routing Functions",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "4--4",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper presents an algorithm to find a worst-case
                 traffic pattern for any oblivious routing algorithm on
                 an arbitrary interconnection network topology. The
                 linearity of channel loading offered by oblivious
                 routing algorithms enables the problem to be mapped to
                 a bipartite maximum-weight matching, which can be
                 solved in polynomial time for routing functions with a
                 polynomial number of paths. Finding exact worst case
                 performance was previously intractable, and we
                 demonstrate an example case where traditional
                 characterization techniques overestimate the throughput
                 of a particular routing algorithm by 47\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bipartite graph; Linearity; Multiprocessor
                 interconnection networks; Network topology; oblivious
                 routing; Pattern matching; Polynomials; Routing;
                 Telecommunication traffic; Throughput; worst-case
                 throughput",
}

%%% Third author corrected from ``Mositz'' (Web-source typo) to
%%% ``Moritz''; missing spaces restored in the abstract
%%% (``(ILP) of'', ``15\% total'').
@Article{Unsal:2002:CFC,
  author =       "O. S. Unsal and C. M. Krishna and C. A. Moritz",
  title =        "{Cool-Fetch}: Compiler-Enabled Power-Aware Fetch
                 Throttling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we present an architecture compiler
                 based approach to reduce energy consumption in the
                 processor. While we mainly target the fetch unit, an
                 important side-effect of our approach is that we obtain
                 energy savings in many other parts in the processor.
                 The explanation is that the fetch unit often runs
                 substantially ahead of execution, bringing in
                 instructions to different stages in the processor that
                 may never be executed. We have found, that although the
                 degree of Instruction Level Parallelism (ILP) of a
                 program tends to vary over time, it can be statically
                 predicted by the compiler with considerable accuracy.
                 Our Instructions Per Clock (IPC) prediction scheme is
                 using a dependence-testing-based analysis and simple
                 heuristics, to guide a front-end fetch-throttling
                 mechanism. We develop the necessary architecture
                 support and include its power overhead. We perform
                 experiments over a wide number of architectural
                 configurations, using SPEC2000 applications. Our
                 results are very encouraging: we obtain up to 15\% total
                 energy savings in the processor with generally little
                 performance degradation. In fact, in some cases our
                 intelligent throttling scheme even increases
                 performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; compiler architecture interaction;
                 Degradation; Energy consumption; fetch-throttling;
                 instruction level parallelism; Low power design;
                 Program processors",
}

%%% Abstract garble corrected: ``Despite every conservative
%%% assumptions'' --> ``Despite very conservative assumptions''.
@Article{Shang:2002:PEI,
  author =       "Li Shang and L. Peh and N. K. Jha",
  title =        "Power-efficient Interconnection Networks: Dynamic
                 Voltage Scaling with Links",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "6--6",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power consumption is a key issue in high performance
                 interconnection network design. Communication links,
                 already a significant consumer of power now, will take
                 up an ever larger portion of the power budget as demand
                 for network bandwidth increases. In this paper, we
                 motivate the use of dynamic voltage scaling (DVS) for
                 links, where the frequency and voltage of links are
                 dynamically adjusted to minimize power consumption. We
                 propose a history-based DVS algorithm that judiciously
                 adjusts DVS policies based on past link utilization.
                 Despite very conservative assumptions about DVS link
                 characteristics, our approach realizes up to 4.5X power
                 savings (3.2X average), with just an average 27.4\%
                 latency increase and 2.5\% throughput reduction. To the
                 best of our knowledge, this is the first study that
                 targets dynamic power optimization of interconnection
                 networks.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Dynamic voltage scaling; Frequency
                 synthesizers; interconnection network; Multiprocessor
                 interconnection networks; power optimization.;
                 Regulators",
}

%%% Abstract garbles corrected: ``parameters pace'' --> ``parameter
%%% space''; ``work load'' --> ``workload'' (matching the spelling used
%%% elsewhere in the same abstract).
@Article{KleinOsowski:2002:MNS,
  author =       "A. J. KleinOsowski and D. J. Lilja",
  title =        "{MinneSPEC}: A New {SPEC} Benchmark Workload for
                 Simulation-Based Computer Architecture Research",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "7--7",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Computer architects must determine how to most
                 effectively use finite computational resources when
                 running simulations to evaluate new architectural
                 ideas. To facilitate efficient simulations with a range
                 of benchmark programs, we have developed the MinneSPEC
                 input set for the SPEC CPU 2000 benchmark suite. This
                 new workload allows computer architects to obtain
                 simulation results in a reasonable time using existing
                 simulators. While the MinneSPEC workload is derived
                 from the standard SPEC CPU 2000 workload, it is a
                 valid benchmark suite in and of itself for
                 simulation-based research. MinneSPEC also may be used
                 to run large numbers of simulations to find ``sweet
                 spots'' in the evaluation parameter space. This small
                 number of promising design points subsequently may be
                 investigated in more detail with the full SPEC
                 reference workload. In the process of developing the
                 MinneSPEC datasets, we quantify its differences in
                 terms of function-level execution patterns, instruction
                 mixes, and memory behaviors compared to the SPEC
                 programs when executed with the reference inputs. We
                 find that for some programs, the MinneSPEC profiles
                 match the SPEC reference dataset program behavior very
                 closely. For other programs, however, the MinneSPEC
                 inputs produce significantly different program
                 behavior. The MinneSPEC workload has been recognized by
                 SPEC and is distributed with Version 1.2 and higher of
                 the SPEC CPU 2000 benchmark suite.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Computer architecture;
                 Computer simulation",
}

%%% OCR garbles in the abstract corrected: ``R multi-bank cache'' -->
%%% ``A multi-bank cache''; ``AC through word-interleaving leads to
%%% higher PC'' --> ``Although word-interleaving leads to higher IPC''.
@Article{Vandierendonck:2002:ATC,
  author =       "H. Vandierendonck and K. {De Bosschere}",
  title =        "An Address Transformation Combining Block- and
                 Word-Interleaving",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "8--8",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As future superscalar processors employ higher issue
                 widths, an increasing number of load/store instructions
                 needs to be executed each cycle to sustain high
                 performance. Multi-bank data caches attempt to address
                 this issue in a cost-effective way. A multi-bank cache
                 consists of multiple cache banks that each support one
                 load/store instruction per clock cycle. The
                 interleaving of cache blocks over the banks is of
                 primary importance. Two common choices are
                 block-interleaving and word-interleaving. Although
                 word-interleaving leads to higher IPC, it is more
                 expensive to implement than block-interleaving since it
                 requires the tag array of the cache to be multi-ported.
                 By swapping the bits in the effective address that are
                 used by word-interleaving with those used by
                 block-interleaving, it is possible to implement a
                 word-interleaved cache with the same cost, cycle time
                 and power consumption of a block interleaved cache.
                 Because this makes the L1 data cache blocks sparse,
                 additional costs are incurred at different levels of
                 the memory hierarchy.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Block-Interleaving; Clocks; Costs; Data cache; Energy
                 consumption; Interleaved codes; Multi-Banking;
                 Word-Interleaving.",
}

%%% Missing space restored in the abstract: ``SPEC2000integer'' -->
%%% ``SPEC2000 integer''.
@Article{Tambat:2002:PLB,
  author =       "S. Tambat and S. Vajapeyam",
  title =        "Page-Level Behavior of Cache Contention",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "9--9",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cache misses in small, limited-associativity primary
                 caches very often replace live cache blocks, given the
                 dominance of capacity and conflict misses. Towards
                 motivating novel cache organizations, we study the
                 comparative characteristics of the virtual memory
                 address pairs involved in typical primary-cache
                 contention (block replacements) for the SPEC2000
                 integer benchmarks. We focus on the cache tag bits, and
                 results show that (i) often just a few tag bits differ
                 between contending addresses, and (ii) accesses to
                 certain segments or page groups of the virtual address
                 space (i.e., certain tag-bit groups) contend
                 frequently. Cache conscious virtual address space
                 allocation can further reduce the number of conflicting
                 tag bits. We mention two directions for exploiting such
                 page-level contention patterns to improve cache cost
                 and performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Automation; Benchmark testing; Cache Contention; Cache
                 Tags; Computer science; Data Cache; Libraries; Memory
                 Access Characterization; Microprocessors; Optimizing
                 compilers; Traffic control; Workstations",
}

%%% Abstract range fixed to use an en-dash (``60--80\%'') and the
%%% missing terminal period restored.
@Article{Juang:2002:IDT,
  author =       "Philo Juang and P. Diodato and S. Kaxiras and K.
                 Skadron and Zhigang Hu and M. Martonosi and D. W.
                 Clark",
  title =        "Implementing Decay Techniques using {4T} Quasi-Static
                 Memory Cells",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "10--10",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes the use of four-transistor (4T)
                 cache and branch predictor array cell designs to
                 address increasing worries regarding leakage power
                 dissipation. While 4T designs lose state when
                 infrequently accessed, they have very low leakage,
                 smaller area, and no capacitive loads to switch. This
                 short paper gives an overview of 4T implementation
                 issues and a preliminary evaluation of leakage-energy
                 savings that shows improvements of 60--80\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Circuit simulation; Delay; Leakage current; Libraries;
                 Microarchitecture; Power dissipation; Power generation;
                 Random access memory; Switches; Transistors",
}

%%% IEEE CAL 1(1), 2002: non-speculative out-of-order commit of memory
%%% operations under strict consistency models.
@Article{Sohn:2002:RRE,
  author =       "YoungChul Sohn and NaiHoon Jung and Seungryoul Maeng",
  title =        "Request Reordering to Enhance the Performance of
                 Strict Consistency Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "11--11",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advances in ILP techniques enable strict consistency
                 models to relax memory order through speculative
                 execution of memory operations. However, ordering
                 constraints still hinder the performance because
                 speculatively executed operations cannot be committed
                 out of program order for the possibility of
                 mis-speculation. In this paper, we propose a new
                 technique which allows memory operations to be
                 non-speculatively committed out of order without
                 violating consistency constraints.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ILP; memory consistency model; multiprocessor",
}

%%% IEEE CAL 1(1), 2002: combined data/thread migration driven by
%%% locality and resource-usage vectors on single-chip multiprocessors.
@Article{Shaw:2002:MSC,
  author =       "K. A. Shaw and W. J. Dally",
  title =        "Migration in Single Chip Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "1",
  number =       "1",
  pages =        "12--12",
  month =        jan,
  year =         "2002",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2002.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Global communication costs in future single-chip
                 multiprocessors will increase linearly with distance.
                 In this paper, we revisit the issues of locality and
                 load balance in order to take advantage of these new
                 costs. We present a technique which simultaneously
                 migrates data and threads based on vectors specifying
                 locality and resource usage. This technique improves
                 performance on applications with distinguishable
                 locality and imbalanced resource usage. 64\% of the
                 ideal reduction in execution time was achieved on an
                 application with these traits while no improvement was
                 obtained on a balanced application with little
                 locality.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cost function; Delay; Global communication;
                 Laboratories; Logic; Monitoring; Multiprocessing
                 systems; Wire",
}

@Article{Sihn:2003:SCS,
  author =       "K.-H. Sihn and Joonwon Lee and Jung-Wan Cho",
  title =        "A Speculative Coherence Scheme using Decoupling
                 Synchronization for Multiprocessor Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes a new speculative coherence
                 scheme, SCDS, for hardware distributed shared memory
                 systems to reduce the overhead of coherence action in
                 directory-based cache-coherence protocol. SCDS has two
                 main features, predicting accurate timing of
                 speculative coherence with synchronization information
                  and detecting write pattern (migratory and
                 non-migratory) for exclusive blocks' speculative
                 coherence action. In our simulation, SCDS outperforms
                 existing schemes (DSI and LTP) for well-synchronized
                 applications.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Access protocols; Coherence; Costs; Delay; Hardware;
                 Multiprocessing systems; Personal communication
                 networks; Runtime; Timing; Watches",
}

@Article{Kumar:2003:PPR,
  author =       "R. Kumar and K. Farkas and N. P. Jouppi and P.
                 Ranganathan and D. M. Tullsen",
  title =        "Processor Power Reduction Via Single-{ISA}
                 Heterogeneous Multi-Core Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes a single-ISA heterogeneous
                 multi-core architecture as a mechanism to reduce
                 processor power dissipation. It assumes a single chip
                 containing a diverse set of cores that target different
                 performance levels and consume different levels of
                 power. During an application's execution, system
                 software dynamically chooses the most appropriate core
                 to meet specific performance and power requirements. It
                 describes an example architecture with five cores of
                 varying performance and complexity. Initial results
                 demonstrate a five-fold reduction in energy at a cost
                 of only 25\% performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; chip multiprocessor; Computer
                 architecture; Computer science; Costs; Energy
                 consumption; Fans; low-power architecture; Packaging;
                 Power dissipation; Power engineering and energy; System
                 software",
}

@Article{Sendag:2003:ACE,
  author =       "R. Sendag and Peng-fei Chuang and D. J. Lilja",
  title =        "Address Correlation: Exceeding the Limits of
                 Locality",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "3--3",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We investigate a program phenomenon, Address
                 Correlation, which links addresses that reference the
                 same data. This work shows that different addresses
                 containing the same data can often be correlated at
                 run-time to eliminate a load miss or a partial hit. For
                 ten of the SPEC CPU2000 benchmarks, 57 to 99\% of all
                 L1 data cache load misses, and 4 to 85\% of all partial
                 hits, can be supplied from a correlated address already
                 found in the cache. Our source code-level analysis
                 shows that semantically equivalent information,
                 duplicated references, and frequent values are the
                 major causes of address correlations. We also show
                 that, on average, 68\% of the potential correlated
                 addresses that could supply data on a miss of an
                 address containing the same value can be correlated at
                 run time. These correlated addresses correspond to an
                 average of 62\% of all misses in the benchmark programs
                 tested.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Delay; Electronic mail; Hardware;
                 History; Microarchitecture; Object oriented modeling;
                 Out of order; Runtime; Tellurium",
}

@Article{Milenkovic:2003:SBT,
  author =       "A. Milenkovic and M. Milenkovic",
  title =        "Stream-Based Trace Compression",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "4--4",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Trace-driven simulation has long been used in both
                 processor and memory studies. The large size of traces
                 motivated different techniques for trace reduction.
                 These techniques often combine standard compression
                 algorithms with trace-specific solutions, taking into
                 account the tradeoff between reduction in the trace
                 size and simulation slowdown due to decompression. This
                 paper introduces SBC, a new algorithm for instruction
                 and data address trace compression based on instruction
                 streams. The proposed technique significantly reduces
                 trace size and simulation time, and it is orthogonal to
                 general compression algorithms. When combined with
                 gzip, SBC reduces the size of SPEC CPU2000 traces
                  94--71968 times.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Compression algorithms; Computational
                 modeling; Computer architecture; Computer simulation;
                 Data mining; Information analysis; instruction and
                 address trace; Instruments; Predictive models;
                 Redundancy; simulation; trace compression",
}

@Article{Zhang:2003:WHC,
  author =       "Chuanjun Zhang and F. Vahid and Jun Yang and W.
                  Najjar",
  title =        "A Way-Halting Cache for Low-Energy High-Performance
                 Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We have designed a low power four-way set associative
                 cache that stores the four lowest-order bits of all way
                 stags into a fully associative memory, which we call
                 the halt tag array. The comparison of the halt tag
                 array with the desired tag occurs concurrently with the
                 address decoding that determines which tag and data
                 ways to read from. The halt tag array predetermines
                 most tags that cannot match due to their low-order four
                 bits mismatching. Further accesses to ways with known
                 mismatching tags are then halted, thus saving power.
                 Our halt tag array has the additional feature of using
                 static logic only, rather than dynamic logic used in
                  highly-associative caches, making our cache consume
                  even less power. Our result shows 55\% savings of memory
                 access related energy over a conventional four-way
                 set-associative cache. We show nearly 2x energy savings
                 compared with highly associative caches, while imposing
                  no performance overhead and only 2\% cache area
                  overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cams; Circuits; Computer science; Decoding; Design
                 engineering; Embedded computing; Logic arrays; Power
                 engineering and energy; Power engineering computing;
                 Switches",
}

@Article{Cohen:2003:EOP,
  author =       "A. Cohen and F. Finkelstein and A. Mendelson and R.
                 Ronen and D. Rudoy",
  title =        "On Estimating Optimal Performance of {CPU} Dynamic
                 Thermal Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "6--6",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper we focus on dynamic thermal management
                 (DTM) strategies that use dynamic voltage scaling
                  (DVS) for power control. We perform a theoretical
                 analysis targeted at estimating the optimal strategy,
                 and show two facts: (1) when there is a gap between the
                 initial and the limit temperatures, it is best to start
                  with a high (though not necessarily maximal) frequency
                 and decrease it exponentially until the limit
                 temperature is reached; (2) when being close to the
                 limit temperature, the best strategy is to stay there.
                 We use the patterns exhibited by the optimal strategy
                 in order to analyze some existing DTM techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Costs; DTM; DVS; Energy management; Frequency
                 estimation; Microprocessors; optimal control; Pattern
                 analysis; Performance analysis; Temperature control;
                 Temperature sensors; Thermal management; Voltage
                 control",
}

@Article{Cristal:2003:CRC,
  author =       "A. Cristal and J. F. Martinez and J. Llosa and M.
                 Valero",
  title =        "A case for resource-conscious out-of-order
                 processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "2",
  number =       "1",
  pages =        "7--7",
  month =        jan,
  year =         "2003",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2003.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Modern out-of-order processors tolerate long-latency
                 memory operations by supporting a large number of
                 in-flight instructions. This is achieved in part
                 through proper sizing of critical resources, such as
                 register files or instruction queues. In light of the
                 increasing gap between processor speed and memory
                 latency, tolerating upcoming latencies in this way
                 would require impractical sizes of such critical
                 resources. To tackle this scalability problem, we make
                 a case for resource-conscious out-of-order processors.
                 We present quantitative evidence that critical
                 resources are increasingly underutilized in these
                 processors. We advocate that better use of such
                 resources should be a priority in future research in
                 processor architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bars; checkpointing; Computer aided instruction;
                 Delay; instruction-level parallelism; Laboratories;
                 memory latency; Optimal control; Out of order;
                 Out-of-order processor; Queueing analysis; Registers;
                 Resource management; resource utilization; Voltage
                 control",
}

@Article{Citron:2004:ELE,
  author =       "D. Citron",
  title =        "Exploiting Low Entropy to Reduce Wire Delay",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Wires shrink less efficiently than transistors.
                 Smaller dimensions increase relative delay and the
                 probability of crosstalk. Solutions to this problem
                 include adding additional latency with pipelining,
                 using ``fat wires'' at higher metal levels, and
                 advances in process and material technology. We propose
                 a stopgap solution to this problem by applying a decade
                 old technique called bus-expanding to the problem. By
                 exploiting low spatial and temporal entropy of data it
                 is possible to transfer m bits of data over a n-bit
                  wide bus in a single cycle (m > n). High entropy data
                 will be routed directly over the bus while low entropy
                 data will be compacted using small lookup tables. A
                 table index will be transferred in the case of a
                 successful lookup, otherwise the full value will be
                 transferred in several cycles. Reducing the number of
                 wires per bus, enables the use of wider wires, which in
                 turn reduces the wire delay. Examination of projected
                 process technologies shows that by shrinking the number
                 of bits in a bus (64 > 48) instead of shrinking the
                 individual wires maintains a constant wire delay. Tests
                 on SPEC CPU2000 have shown that for the 64-bit buses
                 leading from the L1 caches to the processor core it is
                 possible to transfer all data types (addresses,
                 integers, instructions and floating-points) using
                 40-bits per bus on the average.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Area measurement; Compaction; Crosstalk; Delay;
                 Entropy; Materials science and technology; Pipeline
                 processing; Power measurement; Transistors; Wire",
}

@Article{Singh:2004:GAL,
  author =       "A. Singh and W. J. Dally and B. Towles and A. K.
                 Gupta",
  title =        "Globally Adaptive Load-Balanced Routing on Tori",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We introduce a new method of adaptive routing on k-ary
                 n-cubes, Globally Adaptive Load-Balance (GAL). GAL
                 makes global routing decisions using global
                 information. In contrast, most previous adaptive
                 routing algorithms make local routing decisions using
                 local information (typically channel queue depth). GAL
                 senses global congestion using segmented injection
                 queues to decide the directions to route in each
                 dimension. It further load balances the network by
                 routing in the selected directions adaptively. Using
                 global information, GAL achieves the performance
                 (latency and throughput) of minimal adaptive routing on
                 benign traffic patterns and performs as well as the
                 best obliviously load-balanced routing algorithm (GOAL)
                 on adversarial traffic.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Chaos; Delay; Nearest neighbor searches; Routing;
                 Stability; Switches; Telecommunication traffic;
                 Throughput; Tornadoes; Traffic control",
}

@Article{Gomez:2004:EFT,
  author =       "M. E. Gomez and J. Duato and J. Flich and P. Lopez and
                 A. Robles and N. A. Nordbotten and O. Lysne and T.
                 Skeie",
  title =        "An Efficient Fault-Tolerant Routing Methodology for
                 Meshes and Tori",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "3--3",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper we present a methodology to design
                 fault-tolerant routing algorithms for regular direct
                 interconnection networks. It supports fully adaptive
                 routing, does not degrade performance in the absence of
                 faults, and supports a reasonably large number of
                 faults without significantly degrading performance. The
                 methodology is mainly based on the selection of an
                 intermediate node (if needed) for each
                 source-destination pair. Packets are adaptively routed
                 to the intermediate node and, at this node, without
                 being ejected, they are adaptively forwarded to their
                 destinations. In order to allow deadlock-free minimal
                 adaptive routing, the methodology requires only one
                 additional virtual channel (for a total of three), even
                 for tori. Evaluation results for a 4 x 4 x 4 torus
                 network show that the methodology is 5-fault tolerant.
                 Indeed, for up to 14 link failures, the percentage of
                 fault combinations supported is higher than 99.96\%.
                 Additionally, network throughput degrades by less than
                 10\% when injecting three random link faults without
                 disabling any node. In contrast, a mechanism similar to
                 the one proposed in the BlueGene/L, that disables some
                 network planes, would strongly degrade network
                 throughput by 79\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; Circuit faults;
                 Degradation; Design methodology; Electronic mail; Fault
                 tolerance; Multiprocessor interconnection networks;
                 Routing; Switches; Throughput",
}

@Article{Stine:2004:CAR,
  author =       "J. M. Stine and N. P. Carter and J. Flich",
  title =        "Comparing Adaptive Routing and Dynamic Voltage Scaling
                 for Link Power Reduction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "4--4",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We compare techniques that dynamically scale the
                 voltage of individual network links to reduce power
                 consumption with an approach in which all links in the
                 network are set to the same voltage and adaptive
                 routing is used to distribute load across the network.
                 Our results show that adaptive routing with static
                 network link voltages outperforms dimension-order
                 routing with dynamic link voltages in all cases,
                 because the adaptive routing scheme can respond more
                 quickly to changes in network demand. Adaptive routing
                 with static link voltages also outperforms adaptive
                 routing with dynamic link voltages in many cases,
                 although dynamic link voltage scaling gives better
                 behavior as the demand on the network grows.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Clocks; Dynamic voltage scaling; Energy
                 consumption; Frequency; Network-on-a-chip; Routing;
                 Telecommunication traffic; Traffic control; Voltage
                 control",
}

@Article{Robatmili:2004:TSI,
  author =       "B. Robatmili and N. Yazdani and S. Sardashti and M.
                 Nourani",
  title =        "Thread-Sensitive Instruction Issue for {SMT}
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "5--5",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Simultaneous Multi Threading (SMT) is a processor
                 design method in which concurrent hardware threads
                 share processor resources like functional units and
                 memory. The scheduling complexity and performance of an
                 SMT processor depend on the topology used in the fetch
                 and issue stages. In this paper, we propose a thread
                 sensitive issue policy for a partitioned SMT processor
                 which is based on a thread metric. We propose the
                 number of ready-to-issue instructions of each thread as
                 priority metric. To evaluate our method, we have
                 developed a reconfigurable SMT-simulator on top of the
                 SimpleScalar Toolset. We simulated our modeled
                 processor under several workloads composed of SPEC
                 benchmarks. Experimental results show around 30\%
                 improvement compared to the conventional OLDEST\_FIRST
                 mixed topology issue policy. Additionally, the hardware
                 implementation of our architecture with this metric in
                 issue stage is quite simple.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; Delay; Frequency; Intrusion detection;
                 Laboratories; Logic; Processor scheduling;
                 Surface-mount technology; Topology",
}

@Article{Luo:2004:EES,
  author =       "Yue Luo and L. K. John",
  title =        "Efficiently Evaluating Speedup Using Sampled Processor
                 Simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "6--6",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cycle accurate simulation of processors is extremely
                 time consuming. Sampling can greatly reduce simulation
                 time while retaining good accuracy. Previous research
                 on sampled simulation has been focusing on the accuracy
                 of CPI. However, most simulations are used to evaluate
                 the benefit of some microarchitectural enhancement, in
                 which the speedup is a more important metric than CPI.
                 We employ the ratio estimator from statistical sampling
                 theory to design efficient sampling to measure speedup
                 and to quantify its error. We show that to achieve a
                 given relative error limit for speedup, it is not
                 necessary to estimate CPI to the same accuracy. In our
                 experiment, estimating speedup requires about 9X fewer
                 instructions to be simulated in detail in comparison to
                 estimating CPI for the same relative error limit.
                 Therefore using the ratio estimator to evaluate speedup
                 is much more cost-effective and offers great potential
                 for reducing simulation time. We also discuss the
                 reason for this interesting and important result.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; Clocks; Computational modeling;
                 Computer errors; Computer simulation; Frequency;
                 Microarchitecture; Sampling methods; Size measurement;
                 Velocity measurement",
}

@Article{Ceze:2004:CHL,
  author =       "L. Ceze and K. Strauss and J. Tuck and J. Renau and J.
                 Torrellas",
  title =        "{CAVA}: Hiding {L2} Misses with Checkpoint-Assisted
                 Value Prediction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "7--7",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Load misses in on-chip L2 caches often end up stalling
                 modern superscalars. To address this problem, we
                 propose hiding L2 misses with Checkpoint-Assisted VAlue
                 prediction (CAVA). When a load misses in L2, a
                 predicted value is returned to the processor. If the
                 missing load reaches the head of the reorder buffer
                 before the requested data is received from memory, the
                 processor checkpoints, consumes the predicted value,
                 and speculatively continues execution. When the
                 requested data finally arrives, it is compared to the
                 predicted value. If the prediction was correct,
                 execution continues normally; otherwise, execution
                 rolls back to the checkpoint. Compared to a baseline
                 aggressive superscalar, CAVA speeds up execution by a
                 geometric mean of 1.14 for SPECint and 1.34 for SPECfp
                 applications. Additionally, CAVA is faster than an
                 implementation of Runahead execution, and Runahead with
                 value prediction.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; Checkpointing; Costs; Delay;
                 Hardware; Microarchitecture; Out of order; Pipelines;
                 Prefetching; Recycling",
}

@Article{Singh:2004:BDB,
  author =       "A. Singh and W. J. Dally",
  title =        "Buffer and Delay Bounds in High Radix Interconnection
                 Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "8--8",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We apply recent results in queueing theory to propose
                 a methodology for bounding the buffer depth and packet
                 delay in high radix interconnection networks. While
                 most work in interconnection networks has been focused
                 on the throughput and average latency in such systems,
                 few studies have been done providing statistical
                 guarantees for buffer depth and packet delays. These
                 parameters are key in the design and performance of a
                 network. We present a methodology for calculating such
                 bounds for a practical high radix network and through
                 extensive simulations show its effectiveness for both
                 bursty and non-bursty injection traffic. Our results
                 suggest that modest speedups and buffer depths enable
                 reliable networks without flow control to be
                 constructed.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Convergence; Delay; Intelligent networks;
                 Multiprocessor interconnection networks; Queueing
                 analysis; Supercomputers; Switches; Telecommunication
                 traffic; Throughput; Traffic control",
  remark =       "Single-page letter (pages 8--8): statistical bounds on
                 buffer depth and packet delay for high-radix networks
                 via queueing theory.",
}

@Article{Holloway:2004:CPS,
  author =       "A. L. Holloway and G. S. Sohi",
  title =        "Characterization of Problem Stores",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "3",
  number =       "1",
  pages =        "9--9",
  month =        jan,
  year =         "2004",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2004.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper introduces the concept of problem stores:
                 static stores whose dependent loads often miss in the
                 cache. Accurately identifying problem stores allows the
                 early determination of addresses likely to cause later
                 misses, potentially allowing for the development of
                 novel, proactive prefetching and memory hierarchy
                 management schemes. We present a detailed empirical
                 characterization of problem stores using the SPEC2000
                 CPU benchmarks. The data suggests several key
                 observations about problem stores. First, we find that
                 the number of important problem stores is typically
                 quite small; the worst 100 problem stores write the
                 values that will lead to about 90\% of non-cold misses
                 for a variety of cache configurations. We also find
                 that problem stores only account for 1 in 8 dynamic
                 stores, though they result in 9 of 10 misses.
                 Additionally, the problem stores' dependent loads miss
                 in the L2 cache a larger fraction of the time than
                 loads not dependent on problem stores. We also observe
                 the set of problem stores is stable across a variety of
                 cache configurations. Finally, we found that the
                 instruction distance from problem store to miss and
                 problem store to evict is often greater than one
                 million instructions, but the value is often needed
                 within 100,000 instructions of the eviction.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Delay; Hardware; Memory management; Prefetching;
                 Proposals; Timing",
  remark =       "Abstract: restored missing possessive apostrophe,
                 ``the problem stores' dependent loads''; verify against
                 the published letter.",
}

@Article{Sazeides:2005:DIB,
  author =       "Y. Sazeides and R. Kumar and D. M. Tullsen and T.
                 Constantinou",
  title =        "The Danger of Interval-Based Power Efficiency Metrics:
                 When Worst Is Best",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "4",
  number =       "1",
  pages =        "1--1",
  month =        jan,
  year =         "2005",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2005.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper shows that if the execution of a program is
                 divided into distinct intervals, it is possible for one
                 processor or configuration to provide the best power
                 efficiency over every interval, and yet have worse
                 overall power efficiency over the entire execution than
                 other configurations. This unintuitive behavior is a
                 result of a seemingly intuitive use of power efficiency
                 metrics, and can result in suboptimal design and
                 execution decisions. This behavior may occur when using
                 the energy-delay product and energy-delay$^2$ product
                 metrics but not with the energy metric.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Battery charge measurement; Clocks; Computer science;
                 Delay; Design optimization; Frequency; Out of order;
                 Power engineering and energy; Power measurement",
  remark =       "Abstract repaired: the IEEE metadata duplicated
                 ``energy-delay product''; the second metric is the
                 energy-delay$^2$ (ED$^2$) product, whose superscript
                 was lost in extraction. Verify against the published
                 letter.",
}

@Article{Mutlu:2005:RRP,
  author =       "O. Mutlu and Hyesoon Kim and J. Stark and Y. N. Patt",
  title =        "On Reusing the Results of Pre-Executed Instructions in
                 a Runahead Execution Processor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "4",
  number =       "1",
  pages =        "2--2",
  month =        jan,
  year =         "2005",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2005.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Previous research on runahead execution took it for
                 granted as a prefetch-only technique. Even though the
                 results of instructions independent of an L2 miss are
                 correctly computed during runahead mode, previous
                 approaches discarded those results instead of trying to
                 utilize them in normal mode execution. This paper
                 evaluates the effect of reusing the results of
                 preexecuted instructions on performance. We find that,
                 even with an ideal scheme, it is not worthwhile to
                 reuse the results of preexecuted instructions. Our
                 analysis provides insights into why result reuse does
                 not provide significant performance improvement in
                 runahead processors and concludes that runahead
                 execution should be employed as a prefetching mechanism
                 rather than a full-blown prefetching/result-reuse
                 mechanism.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computational modeling; Computer aided
                 instruction; Delay; Energy consumption;
                 Microprocessors; Performance analysis; Prefetching;
                 Registers",
  remark =       "Single-page letter (pages 2--2): concludes that
                 runahead execution is best employed as a prefetching
                 mechanism, since reusing results of pre-executed
                 instructions gave little benefit even under an ideal
                 scheme.",
}

@Article{Zhang:2006:BIC,
  author =       "Chuanjun Zhang",
  title =        "Balanced instruction cache: reducing conflict misses
                 of direct-mapped caches through balanced subarray
                 accesses",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "2--5",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "It is observed that the limited memory space of
                 direct-mapped caches is not used in balance therefore
                 incurs extra conflict misses. We propose a novel cache
                 organization of a balanced cache, which balances
                 accesses to cache sets at the granularity of cache
                 subarrays. The key technique of the balanced cache is a
                 programmable subarray decoder through which the mapping
                 of memory reference addresses to cache subarrays can be
                 optimized hence conflict misses of direct-mapped caches
                 can be resolved. The experimental results show that the
                 miss rate of balanced cache is lower than that of the
                 same sized two-way set-associative caches on average
                 and can be as low as that of the same sized four-way
                 set-associative caches for particular applications.
                 Compared with previous techniques, the balanced cache
                 requires only one cycle to access all cache hits and
                 has the same access time as direct-mapped caches.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "balanced instruction cache; balanced subarray
                 accesses; Bridges; Cache memory; cache organization;
                 cache storage; Clocks; conflict miss reduction;
                 Decoding; Delay; Frequency; High performance computing;
                 programmable subarray decoder; storage allocation",
  remark =       "Abstract: added missing sentence-final period
                 (truncated in IEEE metadata).",
}

@Article{Ottoni:2006:SPC,
  author =       "G. Ottoni and R. Rangan and A. Stoler and M. J.
                 Bridges and D. I. August",
  title =        "From sequential programs to concurrent threads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "6--9",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Chip multiprocessors are of increasing importance due
                 to difficulties in achieving higher clock frequencies
                 in uniprocessors, but their success depends on finding
                 useful work for the processor cores. This paper
                 addresses this challenge by presenting a simple
                 compiler approach that extracts non-speculative
                 thread-level parallelism from sequential codes. We
                 present initial results from this technique targeting a
                 validated dual-core processor model, achieving speedups
                 ranging from 9--48\% with an average of 25\% for
                 important benchmark loops over their single-threaded
                 versions. We also identify important next steps found
                 during our pursuit of higher degrees of automatic
                 threading.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "automatic threading; Bridges; Clocks; Computer
                 science; concurrency control; concurrent threads;
                 Frequency; Hardware; Microprocessors; multi-threading;
                 nonspeculative thread-level parallelism; Parallel
                 processing; Pipeline processing; program compiler;
                 program compilers; Program processors; sequential
                 programs",
  remark =       "Abstract: range 9--48\% set with an en-dash per file
                 convention; added missing sentence-final period
                 (truncated in IEEE metadata).",
}

@Article{Gupta:2006:TOI,
  author =       "A. K. Gupta and W. J. Dally",
  title =        "Topology optimization of interconnection networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "10--13",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes an automatic optimization tool
                 that searches a family of network topologies to select
                 the topology that best achieves a specified set of
                 design goals while satisfying specified packaging
                 constraints. Our tool uses a model of signaling
                 technology that relates bandwidth, cost and distance of
                 links. This model captures the distance-dependent
                 bandwidth of modern high-speed electrical links and the
                 cost differential between electrical and optical links.
                 Using our optimization tool, we explore the design
                 space of hybrid Clos-torus (C-T) networks. For a
                 representative set of packaging constraints we
                 determine the optimal hybrid C-T topology to minimize
                 cost and the optimal C-T topology to minimize latency
                 for various packet lengths. We then use the tool to
                 measure the sensitivity of the optimal topology to
                 several important packaging constraints such as pin
                 count and critical distance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Constraint optimization; Costs; Design
                 optimization; hybrid Clos-torus networks;
                 interconnection networks; Multiprocessor
                 interconnection networks; multistage interconnection
                 networks; Network topology; Optical fiber
                 communication; Packaging; signaling technology;
                 signalling; Space exploration; Space technology;
                 telecommunication network topology; topology
                 optimization tool",
  remark =       "Abstract: added missing sentence-final period
                 (truncated in IEEE metadata).",
}

@Article{Gaudiot:2006:F,
  author =       "J.-L. Gaudiot and Y. Patt and K. Skadron",
  title =        "Foreword",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "11--11",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Foreword for issue 1 of 2006.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Computer Society; Concrete;
                 Delay; Footwear; Software libraries; Vehicles",
  remark =       "Author surname corrected from IEEE metadata typo
                 ``Skadon'' to ``Skadron''; abstract spelling
                 ``Forward'' corrected to ``Foreword''. NOTE: page range
                 11--11 overlaps Gupta:2006:TOI (pages 10--13); verify
                 against the published issue.",
}

@Article{Morad:2006:PPE,
  author =       "T. Y. Morad and U. C. Weiser and A. Kolodny and M.
                 Valero and E. Ayguade",
  title =        "Performance, power efficiency and scalability of
                 asymmetric cluster chip multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "14--17",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper evaluates asymmetric cluster chip
                 multiprocessor (ACCMP) architectures as a mechanism to
                 achieve the highest performance for a given power
                 budget. ACCMPs execute serial phases of multithreaded
                 programs on large high-performance cores whereas
                 parallel phases are executed on a mix of large and many
                 small simple cores. Theoretical analysis reveals a
                 performance upper bound for symmetric multiprocessors,
                 which is surpassed by asymmetric configurations at
                 certain power ranges. Our emulations show that
                 asymmetric multiprocessors can reduce power consumption
                 by more than two thirds with similar performance
                 compared to symmetric multiprocessors.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ACCMP; Application software; asymmetric cluster chip
                 multiprocessors; Chip Multiprocessors; Emulation;
                 Frequency; microprocessor chips; multi-threading;
                 multiprocessing systems; multithreaded program;
                 Optimized production technology; Parallel processing;
                 parallel processing; power consumption reduction; power
                 efficiency; Power Efficiency; Power system modeling;
                 Queueing analysis; Scalability; Upper bound; Voltage",
  remark =       "Author surname corrected from IEEE metadata typo
                 ``Kolodnyt'' to ``Kolodny''; abstract: added missing
                 sentence-final period (truncated in IEEE metadata).",
}

@Article{Riley:2006:PCU,
  author =       "N. Riley and C. Zilles",
  title =        "Probabilistic counter updates for predictor hysteresis
                 and bias",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "18--21",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware predictor designers have incorporated
                 hysteresis and/or bias to achieve desired behavior by
                 increasing the number of bits per counter. Some
                 resulting proposed predictor designs are currently
                 impractical because their counter tables are too large.
                 We describe a method for dramatically reducing the
                 amount of storage required for a predictor's counter
                 table with minimal impact on prediction accuracy.
                 Probabilistic updates to counter state are implemented
                 using a hardware pseudo-random number generator to
                 increment or decrement counters a fraction of the time,
                 meaning fewer counter bits are required. We demonstrate
                 the effectiveness of probabilistic updates in the
                 context of Fields et al.'s critical path predictor,
                 which employs a biased 6-bit counter. Averaged across
                 the SPEC CINT2000 benchmarks, our 2-bit and 3-bit
                 probabilistic counters closely approximate a 6-bit
                 deterministic one (achieving speedups of 7.75\% and
                 7.91\% compared to 7.94\%) when used for
                 criticality-based scheduling in a clustered machine.
                 Performance degrades gracefully, enabling even a 1-bit
                 probabilistic counter to outperform the best 3-bit
                 deterministic counter we found.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; clustered machine; computer architecture;
                 Computer science; Costs; Counting circuits; critical
                 path predictor; criticality-based scheduling;
                 Degradation; Hardware; hardware predictor design;
                 hardware pseudorandom number generator; Hysteresis;
                 Microarchitecture; Pipelines; predictor bias; predictor
                 hysteresis; predictors counter table; probabilistic
                 counter update; probability; Processor scheduling;
                 processor scheduling; random number generation",
  remark =       "Abstract: added missing sentence-final period
                 (truncated in IEEE metadata).",
}

@Article{Zhou:2006:CFT,
  author =       "Huiyang Zhou",
  title =        "A case for fault tolerance and performance enhancement
                 using chip multi-processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "22--25",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper makes a case for using multi-core
                 processors to simultaneously achieve transient-fault
                 tolerance and performance enhancement. Our approach is
                 extended from a recent latency-tolerance proposal,
                 dual-core execution (DCE). In DCE, a program is
                 executed twice in two processors, named the front and
                 back processors. The front processor pre-processes
                 instructions in a very fast yet highly accurate way and
                 the back processor re-executes the instruction stream
                 retired from the front processor. The front processor
                 runs faster as it has no correctness constraints
                 whereas its results, including timely prefetching and
                 prompt branch misprediction resolution, help the back
                 processor make faster progress. In this paper, we
                 propose to entrust the speculative results of the front
                 processor and use them to check the un-speculative
                 results of the back processor. A discrepancy, either
                 due to a transient fault or a mispeculation, is then
                 handled with the existing mispeculation recovery
                 mechanism. In this way, both transient-fault tolerance
                 and performance improvement can be delivered
                 simultaneously with little hardware overhead.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "back processor; chip multiprocessors; Computer aided
                 software engineering; dual-core execution; Error
                 analysis; Fault tolerance; fault tolerant computing;
                 front processor; Hardware; latency-tolerance proposal;
                 microprocessor chips; mispeculation recovery mechanism;
                 Multicore processing; multiprocessing systems;
                 prefetching; Prefetching; prompt branch misprediction
                 resolution; Proposals; Redundancy; storage management;
                 Throughput; transient-fault tolerance; Transistors",
  remark =       "Abstract: added missing sentence-final period
                 (truncated in IEEE metadata).",
}

@Article{Lee:2006:ASC,
  author =       "Moon-Sang Lee and Sang-Kwon Lee and Joonwon Lee and
                 Seung-Ryoul Maeng",
  title =        "Adopting system call based address translation into
                 user-level communication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "26--29",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "User-level communication alleviates the software
                 overhead of the communication subsystem by allowing
                 applications to access the network interface directly.
                 For that purpose, efficient address translation of
                 virtual address to physical address is critical. In
                 this study, we propose a system call based address
                 translation scheme where every translation is done by
                 the kernel instead of a translation cache on a network
                 interface controller as in the previous cache based
                 address translation. According to our experiments, our
                 scheme achieves up to 4.5\% reduction in application
                 execution time compared to the previous cache based
                 approach.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; cache based approach; cache
                 storage; Communication system software; Control
                 systems; Costs; Delay; Electronic mail; Hardware;
                 Kernel; network interface controller; network
                 interfaces; Network interfaces; operating system
                 kernels; Protocols; software overhead; system call
                 based address translation; user-level communication",
  remark =       "Abstract: added missing sentence-final period
                 (truncated in IEEE metadata).",
}

@Article{Ahn:2006:DPA,
  author =       "Jung Ho Ahn and W. J. Dally",
  title =        "Data parallel address architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "30--33",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Data parallel memory systems must maintain a large
                 number of outstanding memory references to fully use
                 increasing DRAM bandwidth in the presence of increasing
                 latency. At the same time, the throughput of modern
                 DRAMs is very sensitive to access patterns due to the
                 time required to precharge and activate banks and to
                 switch between read and write access. To achieve memory
                 reference parallelism a system may simultaneously issue
                 references from multiple reference threads.
                 Alternatively multiple references from a single thread
                 can be issued in parallel. In this paper, we examine
                 this tradeoff and show that allowing only a single
                 thread to access DRAM at any given time significantly
                 improves performance by increasing the locality of the
                 reference stream and hence reducing precharge/activate
                 operations and read/write turnaround. Simulations of
                 scientific and multimedia applications show that
                 generating multiple references from a single thread
                 gives, on average, 17\% better performance than
                 generating references from two parallel threads.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer architecture; data parallel
                 address architecture; data parallel memory systems;
                 Delay; DRAM bandwidth; DRAM chips; Memory management;
                 parallel architectures; parallel memories; Parallel
                 processing; Random access memory; read access;
                 Scheduling; Streaming media; Switches; write access",
  remark =       "Abstract: corrected spurious apostrophe (``access
                 pattern's'' to ``access patterns'') and added missing
                 sentence-final period (truncated in IEEE metadata).",
}

@Article{Eisley:2006:NCC,
  author =       "N. Eisley and Li-Shiuan Peh and Li Shang",
  title =        "In-network cache coherence",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "34--37",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We propose implementing cache coherence protocols
                 within the network, demonstrating how an in-network
                 implementation of the MSI directory-based protocol
                 allows for in-transit optimizations of read and write
                 delay. Our results show 15\% and 24\% savings on
                 average in memory access latency for SPLASH-2 parallel
                 benchmarks running on a $4 \times 4$ and a $16 \times
                 16$ multiprocessor respectively.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Access protocols; benchmark testing; cache coherence;
                 cache storage; Coherence; Delay; delays; Fabrics;
                 interconnection network; memory access latency; Memory
                 architecture; memory architecture; memory protocols;
                 Moore's Law; MSI directory-based protocol;
                 Multiprocessor interconnection networks; network cache
                 coherence protocols; parallel processing; read delay;
                 SPLASH-2 parallel benchmarks; write delay",
  remark =       "Abstract repaired: mangled ``4times4'' and
                 ``16times16'' (lost \TeX{} markup in IEEE metadata)
                 restored as $4 \times 4$ and $16 \times 16$; added
                 missing sentence-final period.",
}

@Article{Srinivasan:2006:PMU,
  author =       "R. Srinivasan and J. Cook and O. Lubeck",
  title =        "Performance modeling using {Monte Carlo} simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "1",
  pages =        "38--41",
  month =        jan,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/intel-ia-64.bib",
  abstract =     "Cycle accurate simulation has long been the primary
                 tool for micro-architecture design and evaluation.
                 Though accurate, the slow speed often imposes
                 constraints on the extent of design exploration. In
                 this work, we propose a fast, accurate Monte-Carlo
                 based model for predicting processor performance. We
                 apply this technique to predict the CPI of in-order
                 architectures and validate it against the Itanium-2.
                 The Monte Carlo model uses micro-architecture
                 independent application characteristics, and cache,
                 branch predictor statistics to predict CPI with an
                 average error of less than 7\%. Since prediction is
                 achieved in a few seconds, the model can be used for
                 fast design space exploration that can efficiently cull
                 the space for cycle-accurate simulations. Besides
                 accurately predicting CPI, the model also breaks down
                 CPI into various components, where each component
                 quantifies the effect of a particular stall condition
                 (branch misprediction, cache miss, etc.) on overall
                 CPI. Such a CPI decomposition can help processor
                 designers quickly identify and resolve critical
                 performance bottlenecks",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "branch predictor statistics; Computational modeling;
                 Computer architecture; CPI decomposition; design space
                 exploration; Error analysis; Itanium-2; Laboratories;
                 Mathematical analysis; memory architecture;
                 microarchitecture design; microarchitecture evaluation;
                 Monte Carlo methods; Monte Carlo simulation;
                 performance evaluation; Predictive models; Process
                 design; processor performance modeling; program
                 processors; Sampling methods; Space exploration",
}

@Article{Ergin:2006:ENV,
  author =       "O. Ergin and O. Unsal and X. Vera and A. Gonzalez",
  title =        "Exploiting Narrow Values for Soft Error Tolerance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "12--12",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Soft errors are an important challenge in contemporary
                 microprocessors. Particle hits on the components of a
                 processor are expected to create an increasing number
                 of transient errors with each new microprocessor
                 generation. In this paper we propose simple mechanisms
                 that effectively reduce the vulnerability to soft
                 errors in a processor. Our designs are generally
                 motivated by the fact that many of the produced and
                 consumed values in the processors are narrow and their
                 upper order bits are meaningless. Soft errors caused by
                 any particle strike to these higher order bits can be
                 avoided by simply identifying these narrow values.
                 Alternatively soft errors can be detected or corrected
                 on the narrow values by replicating the vulnerable
                 portion of the value inside the storage space provided
                 for the upper order bits of these operands. We offer a
                 variety of schemes that make use of narrow values and
                 analyze their efficiency in reducing soft error
                 vulnerability of level-1 data cache of the processor",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; Cache storage; contemporary
                 microprocessors; data cache; Data Cache; Error
                 correction; error correction; Error Correction; error
                 correction; error detection; Hardware; Impurities;
                 Manufacturing; microprocessor chips; Microprocessors;
                 Multithreading; Narrow Values; narrow values; Neutrons;
                 particle strike; Process design; radiation effects;
                 Random access memory; soft error tolerance; Soft
                 Errors; system recovery; transient errors; transients",
}

@Article{Li:2006:PBH,
  author =       "W. Li and S. Mohanty and K. Kavi",
  title =        "A Page-based Hybrid (Software--Hardware) Dynamic
                 Memory Allocator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "13--13",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/java2000.bib",
  abstract =     "Modern programming languages often include complex
                 mechanisms for dynamic memory allocation and garbage
                 collection. These features drive the need for more
                 efficient implementation of memory management
                 functions, both in terms of memory usage and execution
                 performance. In this paper, we introduce a software and
                 hardware co-design to improve the speed of the software
                 allocator used in free-BSD systems. The hardware
                 complexity of our design is independent of the dynamic
                 memory size, thus making the allocator suitable for any
                 memory size. Our design improves the performance of
                 memory management intensive benchmarks by as much as
                 43\%. To our knowledge, this is the first-ever work of
                 this kind, introducing ``hybrid memory allocator''",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; Computer languages; Computer
                 science; Costs; Delay; Dynamic programming; garbage
                 collection; Hardware; hardware complexity;
                 hardware-software codesign; hybrid dynamic memory
                 allocator; Java; memory allocator; memory architecture;
                 memory management; Memory management; modern
                 programming languages; software allocator; Software
                 performance; software-hardware co-design;
                 software/hardware co-design; storage allocation;
                 storage management",
}

@Article{Donald:2006:EPP,
  author =       "J. Donald and M. Martonosi",
  title =        "An Efficient, Practical Parallelization Methodology
                 for Multicore Architecture Simulation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "14--14",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Multiple core designs have become commonplace in the
                 processor market, and are hence a major focus in modern
                 computer architecture research. Thus, for both product
                 development and research, multiple core processor
                 simulation environments are necessary. A well-known
                 positive feedback property of computer design is that
                 we use today's computers to design tomorrow's. Thus,
                 with the emergence of chip multiprocessors, it is
                 natural to re-examine simulation environments written
                 to exploit parallelism. In this paper we present a
                 programming methodology for directly converting
                 existing uniprocessor simulators into parallelized
                 multiple-core simulators. Our method not only takes
                 significantly less development effort compared to some
                 prior used programming techniques, but also possesses
                 advantages by retaining a modular and comprehensible
                 programming structure. We demonstrate our case with
                 actual developed products after applying this method to
                 two different simulators, one developed from IBM
                 Turandot and the other from the SimpleScalar tool set.
                 Our SimpleScalar-based framework achieves a parallel
                 speedup of $2.2\times$ on a dual-CPU dual-core (4-way)
                 Opteron server",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "chip multiprocessors; comprehensible programming
                 structure; Computational modeling; Computer
                 architecture; Computer simulation; Feedback; IBM
                 Turandot; logic simulation; microcomputers; modern
                 computer architecture; modular programming structure;
                 multicore; multicore architecture simulation; Multicore
                 processing; multiple core processor simulation;
                 multiprocessing systems; Object oriented modeling;
                 parallel architectures; Parallel processing; Parallel
                 programming; parallelism; parallelization method;
                 parallelized multiple-core simulators; positive
                 feedback property; Process planning; Product
                 development; programming methodology; SimpleScalar tool
                 set; simulation",
}

@Article{Bracy:2006:DAC,
  author =       "A. Bracy and K. Doshi and Q. Jacobson",
  title =        "Disintermediated Active Communication",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "15--15",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Disintermediated active communication (DAC) is a new
                 paradigm of communication in which a sending thread
                 actively engages a receiving thread when sending it a
                 message via shared memory. DAC is different than
                 existing approaches that use passive communication
                 through shared-memory --- based on intermittently
                 checking for messages --- or that use preemptive
                 communication but must rely on intermediaries such as
                 the operating system or dedicated interrupt channels.
                 An implementation of DAC builds on existing cache
                 coherency support and exploits light-weight user-level
                 interrupts. Inter-thread communication occurs via
                 monitored memory locations where the receiver thread
                 responds to invalidations of monitored addresses with a
                 light-weight user-level software-defined handler.
                 Address monitoring is supported by cache line
                 user-bits, or CLUbits. CLUbits reside in the cache next
                 to the coherence state, are private per thread, and
                 maintain user-defined per-cache-line state. A light
                 weight software library can demultiplex asynchronous
                 notifications and handle exceptional cases. In
                 DAC-based programs threads coordinate with one another
                 by explicit signaling and implicit resource monitoring.
                 With the simple and direct communication primitives of
                 DAC, multi-threaded workloads synchronize at a finer
                 granularity and more efficiently utilize the hardware
                 of upcoming multi-core designs. This paper introduces
                 DAC, presents several signaling models for DAC-based
                 programs, and describes a simple memory-based framework
                 that supports DAC by leveraging existing
                 cache-coherency models. Our framework is general enough
                 to support uses beyond DAC",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address monitoring; cache coherency; cache line
                 user-bits; cache storage; CLUbits; Computer aided
                 instruction; Concurrent computing; disintermediated
                 active communication; Hardware; High performance
                 computing; interrupts; interthread communication;
                 memory locations; Monitoring; multi-threading;
                 multicore designs; Operating systems; Processor
                 scheduling; Programming profession; resource
                 monitoring; shared memory; shared memory systems;
                 signaling models; software libraries; Software
                 libraries; software library; storage allocation;
                 user-level interrupts",
}

@Article{Mallik:2006:UDF,
  author =       "A. Mallik and B. Lin and G. Memik and P. Dinda and R.
                 P. Dick",
  title =        "User-Driven Frequency Scaling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "16--16",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We propose and evaluate user-driven frequency scaling
                 (UDFS) for improved power management on processors that
                 support dynamic voltage and frequency scaling (DVFS),
                 e.g., those used in current laptop and desktop
                 computers. UDFS dynamically adapts CPU frequency to the
                 individual user and the workload through a simple user
                 feedback mechanism, unlike currently-used DVFS methods
                 which rely only on CPU utilization. Our UDFS algorithms
                 dramatically reduce typical operating frequencies while
                 maintaining performance at satisfactory levels for each
                 user. We evaluated our techniques through user studies
                 conducted on a Pentium M laptop running Windows
                 applications. The UDFS scheme reduces measured system
                 power by 22.1\%, averaged across all our users and
                 applications, compared to the Windows XP DVFS scheme",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Central Processing Unit; computer power supplies; CPU
                 frequency; DVFS; dynamic frequency scaling; Dynamic
                 voltage scaling; dynamic voltage scaling; Energy
                 consumption; Energy management; Engineering management;
                 Feedback; Frequency control; improved power management;
                 microprocessor chips; Pentium M laptop; Portable
                 computers; power aware computing; Power engineering
                 computing; Power Management; Power measurement; user
                 feedback mechanism; User-aware computing; user-driven
                 frequency scaling; Windows XP DVFS scheme",
}

@Article{Blundell:2006:STM,
  author =       "C. Blundell and E. C. Lewis and M. M. K. Martin",
  title =        "Subtleties of transactional memory atomicity
                 semantics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "17--17",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Transactional memory has great potential for
                 simplifying multithreaded programming by allowing
                 programmers to specify regions of the program that must
                 appear to execute atomically. Transactional memory
                 implementations then optimistically execute these
                 transactions concurrently to obtain high performance.
                 This work shows that the same atomic guarantees that
                 give transactions their power also have unexpected and
                 potentially serious negative effects on programs that
                 were written assuming narrower scopes of atomicity. We
                 make four contributions: (1) we show that a direct
                 translation of lock-based critical sections into
                 transactions can introduce deadlock into otherwise
                 correct programs, (2) we introduce the terms strong
                 atomicity and weak atomicity to describe the
                 interaction of transactional and non-transactional
                 code, (3) we show that code that is correct under weak
                 atomicity can deadlock under strong atomicity, and (4)
                 we demonstrate that sequentially composing
                 transactional code can also introduce deadlocks. These
                 observations invalidate the intuition that transactions
                 are strictly safer than lock-based critical sections,
                 that strong atomicity is strictly safer than weak
                 atomicity, and that transactions are always
                 composable",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer languages; Computer Systems Organization;
                 Concurrent distributed and parallel languages;
                 deadlock; direct translation; Hardware; Information
                 science; Interference; Interleaved codes; Language
                 Classifications; Law; lock-based critical sections;
                 Multi-core/single-chip multiprocessors;
                 multi-threading; Multiple Data Stream Architectures
                 (Multiprocessors); multithreaded programming;
                 nontransactional code; operating systems (computers);
                 Parallel Architectures; Processor Architectures;
                 program verification; Programming Languages;
                 Programming profession; sequentially composing
                 transactional code; Software performance;
                 Software/Software Engineering; strong atomicity; System
                 recovery; Transaction databases; transaction
                 processing; transactional memory atomicity semantics;
                 weak atomicity",
}

@Article{Price:2006:CCT,
  author =       "G. Price and M. Vachharajani",
  title =        "A Case for Compressing Traces with {BDDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "5",
  number =       "2",
  pages =        "18--18",
  month =        feb,
  year =         "2006",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2006.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Instruction-level traces are widely used for program
                 and hardware analysis. However, program traces for just
                 a few seconds of execution are enormous, up to several
                 terabytes in size, uncompressed. Specialized
                 compression can shrink traces to a few gigabytes, but
                 trace analyzers typically stream the decompressed trace
                 through the analysis engine. Thus, the complexity of
                 analysis depends on the decompressed trace size (even
                 though the decompressed trace is never stored to disk).
                 This makes many global or interactive analyses
                 infeasible. This paper presents a method to compress
                 program traces using binary decision diagrams (BDDs).
                 BDDs intrinsically support operations common to many
                 desirable program analyses and these analyses operate
                 directly on the BDD. Thus, they are often polynomial in
                 the size of the compressed representation. The paper
                 presents mechanisms to represent a variety of trace
                 data using BDDs and shows that BDDs can store, in 1 GB
                 of RAM, the entire data-dependence graph of traces with
                 over 1 billion instructions. This allows rapid
                 computation of global analyses such as heap-object
                 liveness and dynamic slicing",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "binary decision diagrams; Binary decision diagrams;
                 Boolean functions; Data analysis; Data structures;
                 data-dependence graph; dynamic slicing; Engines; global
                 analyses; Hardware; hardware analysis; heap-object
                 liveness; instruction-level traces; Performance
                 analysis; Polynomials; program analysis; program
                 slicing; program traces; rapid computation; Read-write
                 memory; Software Engineering; Software Processor
                 validation Engineering; Software/Program Verification;
                 Software/Software; Software/Software Engineering;
                 specialized compression; Testing and Debugging; trace
                 analyzers; traces compression; Tracing; Validation;
                 Visualization",
}

@Article{MoretoPlanas:2007:EDC,
  author =       "M. {Moreto Planas} and F. Cazorla and A. Ramirez and
                 M. Valero",
  title =        "Explaining Dynamic Cache Partitioning Speed Ups",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "1--4",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cache partitioning has been proposed as an interesting
                 alternative to traditional eviction policies of shared
                 cache levels in modern CMP architectures: throughput is
                 improved at the expense of a reasonable cost. However,
                 these new policies present different behaviors
                 depending on the applications that are running in the
                 architecture. In this paper, we introduce some metrics
                 that characterize applications and allow us to give a
                 clear and simple model to explain final throughput
                 speed ups.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.b Cache memories; B.3.3 Performance
                 Analysis and Design Aids; C Computer Systems
                 Organization; C.1 Processor Architectures; C.1.4
                 Parallel Architectures; C.1.4.e Multi-core/single-chip
                 multiprocessors; C.1.5 Micro-architecture
                 implementation considerations; C.1.5.e Memory
                 hierarchy; C.4 Performance of Systems; C.4.d Modeling
                 techniques; cache storage; chip multiprocessing;
                 Computer architecture; Counting circuits; dynamic cache
                 partitioning; microprocessor chips; Parallel
                 processing; Process design; Resource management; shared
                 cache levels; Streaming media; Surface-mount
                 technology; Throughput; Uninterruptible power systems",
}

@Article{Jerger:2007:CSC,
  author =       "N. Enright Jerger and M. Lipasti and L. Peh",
  title =        "Circuit-Switched Coherence",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "5--8",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Circuit-switched networks can significantly lower the
                 communication latency between processor cores, when
                 compared to packet-switched networks, since once
                 circuits are set up, communication latency approaches
                 pure interconnect delay. However, if circuits are not
                 frequently reused, the long set up time and poorer
                 interconnect utilization can hurt overall performance.
                 To combat this problem, we propose a hybrid router
                 design which intermingles packet-switched flits with
                 circuit-switched flits. Additionally, we co-design a
                 prediction-based coherence protocol that leverages the
                 existence of circuits to optimize pair-wise sharing
                 between cores. The protocol allows pair-wise sharers to
                 communicate directly with each other via circuits and
                 drives up circuit reuse. Circuit-switched coherence
                 provides overall system performance improvements of up
                 to 17\% with an average improvement of 10\% and reduces
                 network latency by up to 30\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; C Computer Systems Organization; C.1
                 Processor Architectures; C.1.4 Parallel Architectures;
                 C.1.4.e Multi-core/single-chip multiprocessors; C.1.4.g
                 On-chip interconnection networks; C.1.5
                 Micro-architecture implementation considerations;
                 C.1.5.e Memory hierarchy; circuit switching;
                 circuit-switched network; Coupling circuits; Delay;
                 Fabrics; hybrid router design; Integrated circuit
                 interconnections; multiprocessor interconnection
                 networks; network latency; Network-on-a-chip; packet
                 switching; Packet switching; packet switching;
                 pair-wise sharing; Pipelines; prediction-based
                 coherence protocol; processor core; Protocols; routing
                 protocols; System performance",
}

@Article{Kodakara:2007:CRM,
  author =       "S. Kodakara and J. Kim and D. Lilja and D. Hawkins and
                 W. Hsu and P. Yew",
  title =        "{CIM}: A Reliable Metric for Evaluating Program Phase
                 Classifications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "9--12",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We propose the use of the confidence interval of
                 estimated mean (CIM), a metric based on statistical
                 sampling theory, to evaluate the quality of a given
                 phase classification and for comparing different phase
                 classification schemes. Previous research on phase
                 classification used the weighted average of coefficient
                 of variation (CoVwa) to estimate phase classification
                 quality. We found that the phase quality indicated by
                 CoVwa could be inconsistent across different phase
                 classifications. We explain the reasons behind this
                 inconsistency and demonstrate the inconsistency using
                 data from several SPEC CPU2000 benchmark programs. We
                 show that the confidence interval of estimated mean
                 (CIM) correctly estimates the quality of phase
                 classification with a meaningful statistical
                 interpretation.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; Benchmark Analysis; Clustering
                 algorithms; Computer architecture; computer
                 architecture; Computer integrated manufacturing;
                 confidence interval; estimated mean; estimation theory;
                 pattern classification; Phase Classification; Phase
                 detection; Phase estimation; Phase measurement; phase
                 quality estimation; program compilers; program
                 diagnostics; program phase classification; Quality
                 Metric; reliable metric; Sampling methods; sampling
                 methods; SPEC CPU2000 benchmark program; statistical
                 interpretation; Statistical Sampling; statistical
                 sampling theory; Statistics; Surges",
}

@Article{Dieter:2007:LCM,
  author =       "W. R. Dieter and A. Kaveti and H. G. Dietz",
  title =        "Low-Cost Microarchitectural Support for Improved
                 Floating-Point Accuracy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "13--16",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/fparith.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Some processors designed for consumer applications,
                 such as graphics processing units (GPUs) and the CELL
                 processor, promise outstanding floating-point
                 performance for scientific applications at commodity
                 prices. However, IEEE single precision is the most
                 precise floating-point data type these processors
                 directly support in hardware. Pairs of native
                 floating-point numbers can be used to represent a base
                 result and a residual term to increase accuracy, but
                 the resulting order of magnitude slowdown dramatically
                 reduces the price/performance advantage of these
                 systems. By adding a few simple microarchitectural
                 features, acceptable accuracy can be obtained with
                 relatively little performance penalty. To reduce the
                 cost of native-pair arithmetic, a residual register is
                 used to hold information that would normally have been
                 discarded after each floating-point computation. The
                 residual register dramatically simplifies the code,
                 providing both lower latency and better
                 instruction-level parallelism.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; B Hardware; B.2 Arithmetic and
                 Logic Structures; B.2.4 High-Speed Arithmetic; B.2.4.b
                 Cost/performance; C Computer Systems Organization; C.0
                 General; C.0.b Hardware/software interfaces; C.1
                 Processor Architectures; C.1.5 Micro-architecture
                 implementation considerations; CELL processor; computer
                 architecture; Costs; floating point arithmetic;
                 floating-point accuracy; Floating-point arithmetic; G
                 Mathematics of Computing; G.1 Numerical Analysis; G.1.0
                 General; G.1.0.e Multiple precision arithmetic;
                 Graphics; graphics processing units; Hardware; I
                 Computing Methodologies; I.3 Computer Graphics; I.3.1
                 Hardware Architecture; I.3.1.a Graphics processors;
                 IEEE single precision; instruction-level parallelism;
                 microarchitectural support; Microarchitecture; parallel
                 processing; Pipelines; Registers; Software algorithms;
                 Software performance",
}

@Article{Etsion:2007:PPT,
  author =       "Y. Etsion and D. G. Feitelson",
  title =        "Probabilistic Prediction of Temporal Locality",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "17--20",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The increasing gap between processor and memory
                 speeds, as well as the introduction of multi-core CPUs,
                 have exacerbated the dependency of CPU performance on
                 the memory subsystem. This trend motivates the search
                 for more efficient caching mechanisms, enabling both
                 faster service of frequently used blocks and decreased
                 power consumption. In this paper we describe a novel,
                 random sampling based predictor that can distinguish
                 transient cache insertions from non-transient ones. We
                 show that this predictor can identify a small set of
                 data cache resident blocks that service most of the
                 memory references, thus serving as a building block for
                 new cache designs and block replacement policies.
                 Although we only discuss the L1 data cache, we have
                 found this predictor to be efficient also when handling
                 L1 instruction caches and shared L2 caches.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.b Cache memories; B.3.3 Performance
                 Analysis and Design Aids; cache storage; Computer
                 science; Data analysis; data cache; Distributed
                 computing; Energy consumption; Extraterrestrial
                 phenomena; memory subsystem; multi-core CPU; power
                 aware computing; probabilistic prediction; random
                 sampling; Sampling methods; temporal locality;
                 transient cache insertions; Visualization",
}

@Article{Guz:2007:NCO,
  author =       "Z. Guz and I. Keidar and A. Kolodny and U. Weiser",
  title =        "{Nahalal}: Cache Organization for Chip
                 Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "1",
  pages =        "21--24",
  month =        jan,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper addresses cache organization in chip
                 multiprocessors (CMPs). We show that in CMP systems it
                 is valuable to distinguish between shared data, which
                 is accessed by multiple cores, and private data
                 accessed by a single core. We introduce Nahalal, an
                 architecture whose novel floorplan topology partitions
                 cached data according to its usage (shared versus
                 private data), and thus enables fast access to shared
                 data for all processors while preserving the vicinity
                 of private data to each processor. Nahalal exhibits
                 significant improvements in cache access latency
                 compared to a traditional cache design.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Cache memories; cache organization; cache
                 storage; chip multiprocessors; circuit layout; CMP
                 systems; Computer integrated manufacturing; Computer
                 Systems Organization; Design Styles; floorplan topology
                 partitions; Hardware; Memory Structures; microprocessor
                 chips; Multi-core/single-chip multiprocessors; Nahalal;
                 Parallel Architectures; Processor Architectures;
                 Writing",
}

@Article{Joao:2007:DPI,
  author =       "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
  title =        "Dynamic Predication of Indirect Jumps",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "25--28",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Indirect jumps are used to implement
                 increasingly-common programming language constructs
                 such as virtual function calls, switch-case statements,
                 jump tables, and interface calls. Unfortunately, the
                 prediction accuracy of indirect jumps has remained low
                 because many indirect jumps have multiple targets that
                 are difficult to predict even with specialized
                 hardware. This paper proposes a new way of handling
                 hard-to-predict indirect jumps: dynamically predicating
                 them. The compiler identifies indirect jumps that are
                 suitable for predication along with their control-flow
                 merge (CFM) points. The microarchitecture predicates
                 the instructions between different targets of the jump
                 and its CFM point if the jump turns out to be
                 hard-to-predict at run time. We describe the new
                 indirect jump predication architecture, provide code
                 examples showing why it could reduce the performance
                 impact of jumps, derive an analytical cost-benefit
                 model for deciding which jumps and targets to
                 predicate, and present preliminary evaluation
                 results.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Analytical models; Computer languages;
                 Computer Systems Organization; control-flow merge
                 point; dynamically-scheduled and statically-scheduled
                 implementation; hard-to-predict
                 indirect jump handling; Hardware; Instruction fetch;
                 Instruction sets; interface call; jump table;
                 Micro-architecture implementation considerations;
                 Microarchitecture; microarchitecture dynamic
                 predication; Object oriented modeling; parallel
                 architectures; Performance analysis; Pipeline
                 processors; Pipelines; Processor Architectures; program
                 compiler; program compilers; program control
                 structures; programming language construct; Single Data
                 Stream Architectures; Superscalar; switch-case
                 statement; Switches; system monitoring; virtual
                 function call",
}

@Article{Das:2007:MMC,
  author =       "A. Das and S. Ozdemir and G. Memik and J. Zambreno and
                 A. Choudhary",
  title =        "Microarchitectures for Managing Chip Revenues under
                 Process Variations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "29--32",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As transistor feature sizes continue to shrink into
                 the sub-90 nm range and beyond, the effects of process
                 variations on critical path delay and chip yields have
                 amplified. A common concept to remedy the effects of
                 variation is speed-binning, by which chips from a
                 single batch are rated by a discrete range of
                 frequencies and sold at different prices. In this
                 paper, we discuss strategies to modify the number of
                 chips in different bins and hence enhance the profits
                 obtained from them. Particularly, we propose a scheme
                 that introduces a small Substitute Cache associated
                 with each cache way to replicate the data elements that
                 will be stored in the high latency lines. Assuming a
                 fixed pricing model, this method increases the revenue
                 by as much as 13.8\% without any impact on the
                 performance of the chips.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Cache Memories; cache memory; cache storage; Circuits;
                 Computer Architecture; computer architecture; Computer
                 architecture; critical path delay; Fabrication;
                 Fault-tolerant Computing; fixed
                 pricing model; Frequency; Logic arrays;
                 Microarchitecture; microarchitecture chip;
                 microprocessor chips; Microprocessors; optimisation;
                 process variation; Process Variations; Registers; Size
                 control; Voltage control",
}

@Article{Zebchuk:2007:BBC,
  author =       "J. Zebchuk and A. Moshovos",
  title =        "A Building Block for Coarse-Grain Optimizations in the
                 On-Chip Memory Hierarchy",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "33--36",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Current on-chip block-centric memory hierarchies
                 exploit access patterns at the fine-grain scale of
                 small blocks. Several recently proposed memory
                 hierarchy enhancements for coherence traffic reduction
                 and prefetching suggest that additional useful patterns
                 emerge with a macroscopic, coarse-grain view. This
                 paper presents RegionTracker, a dual-grain, on-chip
                 cache design that exposes coarse-grain behavior while
                 maintaining block-level communication. RegionTracker
                 eliminates the extraneous, often imprecise coarse-grain
                 tracking structures of previous proposals. It can be
                 used as the building block for coarse-grain
                 optimizations, reducing their overall cost and easing
                 their adoption. Using full-system simulation of a
                 quad-core chip multiprocessor and commercial workloads,
                 we demonstrate that RegionTracker overcomes the
                 inefficiencies of previous coarse-grain cache designs.
                 We also demonstrate how RegionTracker boosts the
                 benefits and reduces the cost of a previously proposed
                 snoop reduction technique.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access patterns; Bandwidth; cache storage; Cache
                 storage; coarse-grain optimizations; coherence traffic
                 reduction; Cost function; Design optimization;
                 Explosions; Information management; Memory management;
                 Multithreading; on-chip memory hierarchy; optimising
                 compilers; Prefetching; prefetching; Proposals;
                 quad-core chip multiprocessor; RegionTracker dual-grain
                 on-chip cache design; system-on-chip",
}

@Article{Kim:2007:FBT,
  author =       "J. Kim and J. Balfour and W. J. Dally",
  title =        "Flattened Butterfly Topology for On-Chip Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "37--40",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "With the trend towards increasing number of cores in a
                 multicore processor, the on-chip network that connects
                 the cores needs to scale efficiently. In this work, we
                 propose the use of high-radix networks in on-chip
                 networks and describe how the flattened butterfly
                 topology can be mapped to on-chip networks. By using
                 high-radix routers to reduce the diameter of the
                 network, the flattened butterfly offers lower latency
                 and energy consumption than conventional on-chip
                 topologies. In addition, by properly using bypass
                 channels in the flattened butterfly network,
                 non-minimal routing can be employed without increasing
                 latency or the energy consumption.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computer networks; Delay; Energy
                 consumption; flattened butterfly; flattened butterfly
                 topology; high-radix networks; high-radix routers;
                 Laboratories; Multicore processing; multicore
                 processors; Multiprocessor interconnection networks;
                 Network topology; network topology; Network-on-a-chip;
                 network-on-chip; on-chip networks; Routing; topology",
}

@Article{Xiao:2007:NPD,
  author =       "X. Xiao and J. Lee",
  title =        "A Novel Parallel Deadlock Detection Algorithm and
                 Hardware for Multiprocessor System-on-a-Chip",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "41--44",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Given the projected dramatic increase in the number of
                 processors and resources in a system-on-a-chip, a
                 quadratic increase in the likelihood of deadlock is
                 predicted due to complex system behavior. To deal with
                 this issue, we here present a novel parallel
                 hardware-oriented deadlock detection algorithm with $
                 O(1) $ deadlock detection and $ O(\min (m, n)) $
                 preparation, where $m$ and $n$ are the numbers of
                 processes and resources, respectively. Our
                 contributions are (i) the first $ O(1)$ deadlock
                 detection hardware implementation and (ii) a new
                 algorithmic method of achieving $ O(\min (m, n))$
                 overall run-time complexity. We implement our algorithm
                 in Verilog HDL and demonstrate that deadlock detection
                 always takes only two clock cycles regardless of the
                 size of a system (i.e., $m$ and $n$).",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithms implemented in hardware; computational
                 complexity; deadlock detection hardware; Deadlocks;
                 Detection algorithms; Hardware design languages;
                 microprocessor chips; Multiprocessing systems;
                 multiprocessing systems; multiprocessor
                 system-on-a-chip; operating systems (computers);
                 Parallel algorithms; parallel algorithms; parallel
                 deadlock detection algorithm; Processor scheduling;
                 Real time systems; Real-time and embedded systems;
                 Resource management; run-time complexity; Runtime;
                 Software performance; System recovery; system-on-chip",
}

@Article{August:2007:UOS,
  author =       "D. August and J. Chang and S. Girbal and D.
                 Gracia-Perez and G. Mouchard and D. A. Penry and O.
                 Temam and N. Vachharajani",
  title =        "{UNISIM}: An Open Simulation Environment and Library
                 for Complex Architecture Design and Collaborative
                 Development",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "45--48",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Simulator development is already a huge burden for
                 many academic and industry research groups; future
                 complex or heterogeneous multi-cores, as well as the
                 multiplicity of performance metrics and required
                 functionality, will make matters worse. We present a
                 new simulation environment, called UNISIM, which is
                 designed to rationalize simulator development by making
                 it possible and efficient to distribute the overall
                 effort over multiple research groups, even without
                 direct cooperation. UNISIM achieves this goal with a
                 combination of modular software development,
                 distributed communication protocols, multilevel
                 abstract modeling, interoperability capabilities, a set
                 of simulator services APIs, and an open
                 library/repository for providing a consistent set of
                 simulator modules.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "API; application program interfaces; Collaboration;
                 collaborative development; complex architecture design;
                 Computational modeling; Computer architecture; Computer
                 industry; Computer science; Design engineering;
                 distributed communication protocols; groupware;
                 interoperability capability; Libraries; Measurement;
                 modular software development; multilevel abstract
                 modeling; open library; open repository; open
                 simulation environment; open systems; Operating
                 systems; Performance and Reliability; Processor
                 Architectures; Programming; simulator development;
                 simulator modules; simulator services; software
                 architecture; UNISIM",
}

@Article{Sendag:2007:BMP,
  author =       "R. Sendag and J. Yi and P. Chuang",
  title =        "Branch Misprediction Prediction: Complementary Branch
                 Predictors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "49--52",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we propose a new class of branch
                 predictors, complementary branch predictors, which can
                 be easily added to any branch predictor to improve the
                 overall prediction accuracy. This mechanism differs
                 from conventional branch predictors in that it focuses
                 only on mispredicted branches. As a result, this
                 mechanism has the advantages of scalability and
                 flexibility (can be implemented with any branch
                 predictor), but is not on the critical path. More
                 specifically, this mechanism improves the branch
                 prediction accuracy by predicting which future branch
                 will be mispredicted next and when that will occur, and
                 then it changes the predicted direction at the
                 predicted time. Our results show that a branch
                 predictor with the branch misprediction predictor
                 achieves the same prediction accuracy as a conventional
                 branch predictor that is 4 to 16 times larger, but
                 without significantly increasing the overall complexity
                 or lengthening the critical path.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; branch misprediction prediction; branch
                 predictor; computational complexity; Computer networks;
                 Costs; Delay; Emerging technologies; History; parallel
                 architectures; Performance loss; Pipeline processors;
                 Pipelines; Prediction algorithms; Scalability;
                 Testing",
}

@Article{Yalcin:2007:UTM,
  author =       "G. Yalcin and O. Ergin",
  title =        "Using tag-match comparators for detecting soft
                 errors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "6",
  number =       "2",
  pages =        "53--56",
  month =        feb,
  year =         "2007",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Soft errors caused by high energy particle strikes are
                 becoming an increasingly important problem in
                 microprocessor design. With increasing transistor
                 density and die sizes, soft errors are expected to be a
                 larger problem in the near future. Recovering from
                 these unexpected faults may be possible by reexecuting
                 some part of the program only if the error can be
                 detected. Therefore it is important to come up with new
                 techniques to detect soft errors and increase the
                 number of errors that are detected. Modern
                 microprocessors employ out-of-order execution and
                 dynamic scheduling logic. Comparator circuits, which
                 are used to keep track of data dependencies, are
                 usually idle. In this paper, we propose various schemes
                 to exploit on-chip comparators to detect transient
                 faults. Our results show that around 50\% of the errors
                 on the wakeup logic can be detected with minimal
                 hardware overhead by using the proposed techniques.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Broadcasting; Circuit faults;
                 comparators (circuits); Computer errors; Control
                 Structure Reliability; dynamic scheduling logic;
                 Electrical fault detection; Fault detection;
                 identification technology; Logic; logic design; logic
                 testing; microprocessor chips; microprocessor design;
                 Microprocessors; Out of order; out-of-order execution;
                 Pipelines; Processor Architectures; Registers;
                 scheduling; soft error detection; tag-match comparator;
                 Testing; Testing and Fault-Tolerance",
}

@Article{Joao:2008:DPI,
  author =       "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
  title =        "Dynamic Predication of Indirect Jumps",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "1--4",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Indirect jumps are used to implement increasingly
                 common programming language constructs such as virtual
                 function calls, switch-case statements, jump tables,
                 and interface calls. Unfortunately, the prediction
                 accuracy of indirect jumps has remained low because
                 many indirect jumps have multiple targets that are
                 difficult to predict even with specialized hardware.
                 This paper proposes a new way of handling
                 hard-to-predict indirect jumps: dynamically predicating
                 them. The compiler identifies indirect jumps that are
                 suitable for predication along with their control-flow
                 merge (CFM) points. The microarchitecture predicates
                 the instructions between different targets of the jump
                 and its CFM point if the jump turns out to be
                 hard-to-predict at run time. We describe the new
                 indirect jump predication architecture, provide code
                 examples showing why it could reduce the performance
                 impact of jumps, derive an analytical cost-benefit
                 model for deciding which jumps and targets to
                 predicate, and present preliminary evaluation
                 results.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Analytical models; B Hardware; B.3 Memory
                 Structures; Cache memories; Computer languages;
                 Computer Systems Organization; Design Styles; Hardware;
                 Instruction sets; Microarchitecture;
                 Multi-core/single-chip multiprocessors; Object oriented
                 modeling; Parallel Architectures; Performance analysis;
                 Pipelines; Processor Architectures; Switches",
}

@Article{Das:2008:MMC,
  author =       {A. Das and S. Ozdemir and G. Memik and J. Zambreno and
                 A. Choudhary},
  title =        {Microarchitectures for Managing Chip Revenues under
                 Process Variations},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {1},
  pages =        {5--8},
  month =        jan,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2008.3},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {As transistor feature sizes continue to shrink into
                 the sub-90nm range and beyond, the effects of process
                 variations on critical path delay and chip yields have
                 amplified. A common concept to remedy the effects of
                 variation is speed-binning, by which chips from a
                 single batch are rated by a discrete range of
                 frequencies and sold at different prices. In this
                 paper, we discuss strategies to modify the number of
                 chips in different bins and hence enhance the profits
                 obtained from them. Particularly, we propose a scheme
                 that introduces a small substitute cache associated
                 with each cache way to replicate the data elements that
                 will be stored in the high latency lines. Assuming a
                 fixed pricing model, this method increases the revenue
                 by as much as 13.8\% without any impact on the
                 performance of the chips.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {Cache Memories; Computer Architecture; Computer
                 architecture; Cost function; Delay effects; Design
                 optimization; Fabrication; Fault-tolerant Computing.;
                 Frequency; Manufacturing; Microarchitecture; Pricing;
                 Process Variations; Transistors},
}

@Article{Roth:2008:PRR,
  author =       {A. Roth},
  title =        {Physical register reference counting},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {1},
  pages =        {9--12},
  month =        jan,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2007.15},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {Several proposed techniques including CPR (checkpoint
                 processing and recovery) and NoSQ (no store queue) rely
                 on reference counting to manage physical registers.
                 However, the register reference counting mechanism
                 itself has received surprisingly little attention. This
                 paper fills this gap by describing potential register
                 reference counting schemes for NoSQ, CPR, and a
                 hypothetical NoSQ/CPR hybrid. Although previously
                 described in terms of binary counters, we find that
                 reference counts are actually more naturally
                 represented as matrices. Binary representations can be
                 used as an optimization in specific situations.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {and statically-scheduled implementation; binary
                 representations; checkpoint processing; checkpointing;
                 Counting circuits; dynamically-scheduled;
                 dynamically-scheduled and statically-scheduled
                 implementation; Engines; Information science; matrices;
                 Micro-architecture implementation considerations;
                 Microarchitecture; no store queue; physical register
                 reference counting; Physics computing; Proposals;
                 recovery technique; Registers; shift registers;
                 Superscalar},
}

@Article{Flich:2008:LBD,
  author =       "J. Flich and J. Duato",
  title =        "Logic-Based Distributed Routing for {NoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "13--16",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "The design of scalable and reliable interconnection
                 networks for multicore chips (NoCs) introduces new
                 design constraints like power consumption, area, and
                 ultra low latencies. Although 2D meshes are usually
                 proposed for NoCs, heterogeneous cores, manufacturing
                 defects, hard failures, and chip virtualization may
                 lead to irregular topologies. In this context,
                 efficient routing becomes a challenge. Although
                 switches can be easily configured to support most
                 routing algorithms and topologies by using routing
                 tables, this solution does not scale in terms of
                 latency and area. We propose a new circuit that removes
                 the need for using routing tables. The new mechanism,
                 referred to as logic-based distributed routing (LBDR),
                 enables the implementation in NoCs of many routing
                 algorithms for most of the practical topologies we
                 might find in the near future in a multicore chip. From
                 an initial topology and routing algorithm, a set of
                 three bits per switch output port is computed. By using
                 a small logic block, LBDR mimics (demonstrated by
                 evaluation) the behavior of routing algorithms
                 implemented with routing tables. This result is
                 achieved both in regular and irregular topologies.
                 Therefore, LBDR removes the need for using routing
                 tables for distributed routing, thus enabling flexible,
                 fast and power-efficient routing in NoCs.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "chip virtualization; circuit reliability; Circuit
                 topology; Delay; Energy consumption; heterogeneous
                 cores; interconnection network reliability;
                 interconnections; logic circuits; logic-based
                 distributed routing; Manufacturing; manufacturing
                 defects; Multi-core/single-chip multiprocessors;
                 Multicore processing; Multiprocessor interconnection
                 networks; network routing; network topology; Network
                 topology; Network-on-a-chip; network-on-chip; networks
                 for multicore chips; NoC; On-chip interconnection
                 networks; Routing; Switches",
}

@Article{Yoon:2008:CHP,
  author =       "J. H. Yoon and E. H. Nam and Y. J. Seong and H. Kim
                 and B. Kim and S. L. Min and Y. Cho",
  title =        "{Chameleon}: A High Performance Flash\slash {FRAM}
                 Hybrid Solid State Disk Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "17--20",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2007.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Flash memory solid state disk (SSD) is gaining
                 popularity and replacing hard disk drive (HDD) in
                 mobile computing systems such as ultra mobile PCs
                 (UMPCs) and notebook PCs because of lower power
                 consumption, faster random access, and higher shock
                 resistance. One of the key challenges in designing a
                 high-performance flash memory SSD is an efficient
                 handling of small random writes to non-volatile data
                 whose performance suffers from the inherent limitation
                 of flash memory that prohibits in-place update. In this
                 paper, we propose a high performance Flash/FRAM hybrid
                 SSD architecture called Chameleon. In Chameleon,
                 metadata used by the flash translation layer (FTL), a
                 software layer in the flash memory SSD, is maintained
                 in a small FRAM since this metadata is a target of
                 intensive small random writes, whereas the bulk data is
                 kept in the flash memory. Performance evaluation based
                 on an FPGA implementation of the Chameleon architecture
                 shows that the use of FRAM in Chameleon improves the
                 performance by 21.3\%. The results also show that even
                 for bulk data that cannot be maintained in FRAM because
                 of the size limitation, the use of fine-grained write
                 buffering is critically important because of the
                 inability of flash memory to perform in-place update of
                 data.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Chameleon; Computer architecture; Design studies; disc
                 drives; Energy consumption; Ferroelectric films; field
                 programmable gate arrays; flash memories; Flash memory;
                 flash memory solid state disk; flash translation layer;
                 flash-FRAM hybrid SSD architecture; FPGA
                 implementation; FTL; hard discs; hard disk drive; Hard
                 disks; HDD; Mass storage; memory architecture; Mobile
                 computing; mobile computing systems; Nonvolatile
                 memory; notebook PCs; Personal communication networks;
                 Random access memory; random-access storage; Solid
                 state circuits; SSD; ultra mobile PCs; UMPC",
}

@Article{Biswas:2008:CAA,
  author =       {A. Biswas and P. Racunas and J. Emer and S.
                 Mukherjee},
  title =        {Computing Accurate {AVFs} using {ACE} Analysis on
                 Performance Models: A Rebuttal},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {1},
  pages =        {21--24},
  month =        jan,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2007.19},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {ACE (architecturally correct execution) analysis
                 computes AVFs (architectural vulnerability factors) of
                 hardware structures. AVF expresses the fraction of
                 radiation-induced transient faults that result in
                 user-visible errors. Architects usually perform this
                 analysis on a high-level performance model to quickly
                 compute per-structure AVFs. If, however, low-level
                 details of a microarchitecture are not modeled
                 appropriately, then their effects may not be reflected
                 in the per-structure AVFs. In this paper we refute
                 Wang, et al.'s (2007) claim that this detail is
                 difficult to model and imposes a practical threshold on
                 ACE analysis that forces its estimates to have a high
                 error margin. We show that carefully choosing a small
                 amount of additional detail can result in a much
                 tighter AVF bound than Wang, et al. were able to
                 achieve in their refined ACE analysis. Even the
                 inclusion of small details, such as read/write pointers
                 and appropriate inter-structure dependencies, can
                 increase the accuracy of the AVF computation by 40\% or
                 more. We argue that this is no different than modeling
                 the IPC (instructions per cycle) of a microprocessor
                 pipeline. A less detailed performance model will
                 provide less accurate IPCs. AVFs are no different.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {and Fault-Tolerance; architectural vulnerability
                 factors; architecturally correct execution analysis;
                 Computational modeling; Hardware; hardware structures;
                 High performance computing; instructions per cycle;
                 inter-structure dependencies; Microarchitecture;
                 microprocessor pipeline; Microprocessors; Performance
                 analysis; Performance and Reliability; performance
                 evaluation; performance models; Pipelines; Protection;
                 radiation-induced transient faults; read pointers;
                 Reliability; Target tracking; Testing; Testing and
                 Fault-Tolerance; user-visible errors; write pointers},
}

@Article{Cho:2008:CAL,
  author =       {S. Cho and R. Melhem},
  title =        {Corollaries to {Amdahl's Law} for Energy},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {1},
  pages =        {25--28},
  month =        jan,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2007.18},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {This paper studies the important interaction between
                 parallelization and energy consumption in a
                 parallelizable application. Given the ratio of serial
                 and parallel portion in an application and the number
                 of processors, we first derive the optimal frequencies
                 allocated to the serial and parallel regions in the
                 application to minimize the total energy consumption,
                 while the execution time is preserved (i.e., speedup =
                 1). We show that dynamic energy improvement due to
                 parallelization has a function rising faster with the
                 increasing number of processors than the speed
                 improvement function given by the well-known Amdahl's
                 Law. Furthermore, we determine the conditions under
                 which one can obtain both energy and speed improvement,
                 as well as the amount of improvement. The formulas we
                 obtain capture the fundamental relationship between
                 parallelization, speedup, and energy consumption and
                 can be directly utilized in energy aware processor
                 resource management. Our results form a basis for
                 several interesting research directions in the area of
                 power and energy aware parallel processing.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {Amdahl's Law; Application software; Computer science;
                 Concurrent computing; dynamic energy improvement;
                 energy aware processor resource management; Energy
                 capture; energy consumption; Energy consumption; energy
                 consumption; Energy management; Equations; Hardware;
                 Parallel Architectures; parallel processing; Parallel
                 processing; parallelization; Power Management; Radio
                 spectrum management; Resource management},
}

@Article{Balfour:2008:EEP,
  author =       "J. Balfour and W. Dally and D. Black-Schaffer and V.
                 Parikh and J. Park",
  title =        "An Energy-Efficient Processor Architecture for
                 Embedded Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "1",
  pages =        "29--32",
  month =        jan,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present an efficient programmable architecture for
                 compute-intensive embedded applications. The processor
                 architecture uses instruction registers to reduce the
                 cost of delivering instructions, and a hierarchical and
                 distributed data register organization to deliver data.
                 Instruction registers capture instruction reuse and
                 locality in inexpensive storage structures that are
                 located near to the functional units. The data register
                 organization captures reuse and locality in different
                 levels of the hierarchy to reduce the cost of
                 delivering data. Exposed communication resources
                 eliminate pipeline registers and control logic, and
                 allow the compiler to schedule efficient instruction
                 and data movement. The architecture keeps a significant
                 fraction of instruction and data bandwidth local to the
                 functional units, which reduces the cost of supplying
                 instructions and data to large numbers of functional
                 units. This architecture achieves an energy efficiency
                 that is 23x greater than an embedded RISC processor.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Communication system control; compute-intensive
                 embedded applications; Computer applications; computer
                 architecture; Computer architecture; Costs; data
                 movement; distributed data register organization;
                 Embedded computing; embedded RISC processor; Embedded
                 system; embedded systems; Energy efficiency;
                 energy-efficient processor architecture; hierarchical
                 organization; inexpensive storage structures;
                 instruction registers; instruction sets; Logic; Mobile
                 processors; pipeline processing; pipeline registers;
                 Pipelines; Registers",
}

@Article{Anonymous:2008:FC,
  author =       {Anonymous},
  title =        {[Front cover]},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {2},
  pages =        {c1--c1},
  month =        jul,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2008.15},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {Presents the front cover for this issue of the
                 publication.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

@Article{Anonymous:2008:EBC,
  author =       {Anonymous},
  title =        {Editorial Board [Cover2]},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {2},
  pages =        {c2--c2},
  month =        jul,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2008.16},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {Provides a listing of current society officers.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

@Article{Pao:2008:PAM,
  author =       "D. Pao and W. Lin and B. Liu",
  title =        "Pipelined Architecture for Multi-String Matching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "33--36",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This letter presents a new oblivious routing algorithm
                 for 3D mesh networks called randomized
                 partially-minimal (RPM) routing that provably achieves
                 optimal worst- case throughput for 3D meshes when the
                 network radix k is even and within a factor of 1/k2 of
                 optimal when k is odd. Although this optimality result
                 has been achieved with the minimal routing algorithm
                 OITURN for the 2D case, the worst-case throughput of
                 OITURN degrades tremendously in higher dimensions.
                 Other existing routing algorithms suffer from either
                 poor worst-case throughput (DOR, ROMM) or poor latency
                 (VAL). RPM on the other hand achieves near optimal
                 worst-case and good average-case throughput as well as
                 good latency performance.",
  remark =       "NOTE(review): this abstract appears to duplicate that
                 of \cite{Ramanujam:2008:RPM} and does not match the
                 article title, which concerns multi-string matching;
                 verify against the published paper.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D mesh networks; Automata; computer architecture;
                 Computer architecture; Computer science; Costs;
                 deterministic finite automaton; Hardware; Intrusion
                 detection; network intrusion detection; network radix;
                 OITURN; Partial response channels; pipelined
                 processing; Pipelines; randomized partially-minimal
                 routing; string matching; Table lookup;
                 three-dimensional mesh networks; Throughput",
}

@Article{Ramanujam:2008:RPM,
  author =       "R. Sunkam Ramanujam and B. Lin",
  title =        "Randomized Partially-Minimal Routing on
                 Three-Dimensional Mesh Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "37--40",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This letter presents a new oblivious routing algorithm
                 for 3D mesh networks called Randomized
                 Partially-Minimal (RPM) routing that provably achieves
                 optimal worst-case throughput for 3D meshes when the
                 network radix k is even and within a factor of 1/k2 of
                 optimal when k is odd. Although this optimality result
                 has been achieved with the minimal routing algorithm
                 O1TURN [9] for the 2D case, the worst-case throughput
                 of O1TURN degrades tremendously in higher dimensions.
                 Other existing routing algorithms suffer from either
                 poor worst-case throughput (DOR [10], ROMM [8]) or poor
                 latency (VAL [14]). RPM on the other hand achieves near
                 optimal worst-case and good average-case throughput as
                 well as good latency performance.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Degradation; Delay; Emerging technologies; Fabrics;
                 Interconnection architectures; Mesh networks; Network
                 communications; Network topology; On-chip
                 interconnection networks; Packet-switching networks;
                 Routing; Silicon; Technological innovation;
                 Telecommunication traffic; Throughput",
}

@Article{Black-Schaffer:2008:HIR,
  author =       {D. Black-Schaffer and J. Balfour and W. Dally and V.
                 Parikh and J. Park},
  title =        {Hierarchical Instruction Register Organization},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {2},
  pages =        {41--44},
  month =        jul,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2008.7},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {This paper analyzes a range of architectures for
                 efficient delivery of VLIW instructions for embedded
                 media kernels. The analysis takes an efficient filter
                 cache as a baseline and examines the benefits from (1)
                 removing the tag overhead, (2) distributing the
                 storage, (3) adding indirection, (4) adding efficient
                 NOP generation, and (5) sharing instruction memory. The
                 result is a hierarchical instruction register
                 organization that provides a 56\% energy and 40\% area
                 savings over an already efficient filter cache.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {cache storage; Cache storage; Computer aided
                 instruction; Computer architecture; Computer integrated
                 manufacturing; distributed shared memory systems;
                 Embedded computing; embedded media kernel; embedded
                 processor architecture; embedded systems; filter cache;
                 Filters; hierarchical instruction register
                 organization; Instruction fetch; instruction memory
                 sharing; instruction sets; Kernel; Laboratories;
                 Low-power design; NOP generation; parallel
                 architectures; Registers; RISC/CISC; VLIW; VLIW
                 architectures; VLIW instruction delivery},
}

@Article{Lee:2008:PDD,
  author =       {J. Lee and X. Xiao},
  title =        {A Parallel Deadlock Detection Algorithm with {$ O(1)
                 $} Overall Run-time Complexity},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {7},
  number =       {2},
  pages =        {45--48},
  month =        jul,
  year =         {2008},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2008.4},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Fri Jun 21 05:49:19 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {This article proposes a novel parallel,
                 hardware-oriented deadlock detection algorithm for
                 multiprocessor system-on-chips. The proposed algorithm
                 takes full advantage of hardware parallelism in
                 computation and maintains information needed by
                 deadlock detection through classifying all resource
                 allocation events and performing class specific
                 operations, which together make the overall run-time
                 complexity of the new method O(1). We implement the
                 proposed algorithm in Verilog HDL and demonstrate in
                 the simulation that each algorithm invocation takes at
                 most four clock cycles in hardware.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {Algorithms implemented in hardware; clock cycle;
                 Computational modeling; Concurrent computing;
                 Deadlocks; Detection algorithms; Event detection;
                 hardware description languages; Hardware design
                 languages; hardware-oriented deadlock detection;
                 Multiprocessing systems; multiprocessing systems;
                 multiprocessor system-on-chips; operating systems
                 (computers); parallel deadlock detection; Parallel
                 processing; Real-time and embedded systems; resource
                 allocation; Resource management; run-time complexity;
                 Runtime; System recovery; system-on-chip; Verilog HDL},
}

%%% Fix: title "Load--Balanced" (TeX en-dash) -> "Load-Balanced"; a hyphenated
%%% compound modifier takes a single hyphen, and this entry's own keywords
%%% field spells it "load-balanced".
%%% NOTE(review): surnames Gomez/Lopez likely carry accents in the published
%%% byline (G{\'o}mez, L{\'o}pez) -- verify against IEEE Xplore before changing.
%%% Commented on by Antelo:2009:CBF (see note field).
@Article{GomezRequena:2008:BFT,
  author =       "C. {Gomez Requena} and F. Gilabert Villamon and M.
                 Gomez and P. Lopez and J. Duato",
  title =        "Beyond Fat-tree: Unidirectional Load-Balanced
                 Multistage Interconnection Network",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "49--52",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  note =         "See comment \cite{Antelo:2009:CBF}.",
  abstract =     "The fat-tree is one of the most widely-used topologies
                 by interconnection network manufacturers. Recently, it
                 has been demonstrated that a deterministic routing
                 algorithm that optimally balances the network traffic
                 can not only achieve almost the same performance than
                 an adaptive routing algorithm but also outperforms it.
                 On the other hand, fat-trees require a high number of
                 switches with a non-negligible wiring complexity. In
                 this paper, we propose replacing the fat-tree by a
                 unidirectional multistage interconnection network
                 (UMIN) that uses a traffic balancing deterministic
                 routing algorithm. As a consequence, switch hardware is
                 almost reduced to the half, decreasing, in this way,
                 the power consumption, the arbitration complexity, the
                 switch size itself, and the network cost. Preliminary
                 evaluation results show that the UMIN with the load
                 balancing scheme obtains lower latency than fat-tree
                 for low and medium traffic loads. Furthermore, in
                 networks with a high number of stages or with high
                 radix switches, it obtains the same, or even higher,
                 throughput than fat-tree.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adaptive routing algorithm; Butterfly Network;
                 computational complexity; Cost-efficiency; Costs;
                 Deterministic Routing; Energy consumption; Fat-trees;
                 Hardware; interconnection network manufacturers;
                 Manufacturing; Multiprocessor interconnection networks;
                 Multistage Interconnection Networks; Network
                 Architecture and Design; Network topology; network
                 traffic; nonnegligible wiring complexity; power
                 consumption; radix switches; Routing; Switches;
                 telecommunication network routing; telecommunication
                 switching; Telecommunication traffic; telecommunication
                 traffic; Traffic Balancing; traffic balancing
                 deterministic routing algorithm; trees (mathematics);
                 unidirectional load-balanced multistage interconnection
                 network; Wiring",
}

%%% Journal article, v7 n2 (July 2008), pp. 53--56; also indexed in the
%%% companion multithreading.bib (see bibsource).
@Article{Li:2008:TAN,
  author =       "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun",
  title =        "Transaction-Aware Network-on-Chip Resource
                 Reservation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "53--56",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Performance and scalability are critically-important
                 for on-chip interconnect in many-core
                 chip-multiprocessor systems. Packet-switched
                 interconnect fabric, widely viewed as the de facto
                 on-chip data communication backplane in the many-core
                 era, offers high throughput and excellent scalability.
                 However, these benefits come at the price of router
                 latency due to run-time multi-hop data buffering and
                 resource arbitration. The network accounts for a
                 majority of on-chip data transaction latency. In this
                 work, we propose dynamic in-network resource
                 reservation techniques to optimize run-time on-chip
                 data transactions. This idea is motivated by the need
                 to preserve existing abstraction and general-purpose
                 network performance while optimizing for
                 frequently-occurring network events such as data
                 transactions. Experimental studies using multithreaded
                 benchmarks demonstrate that the proposed techniques can
                 reduce on-chip data access latency by 28.4\% on average
                 in a 16-node system and 29.2\% on average in a 36-node
                 system.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Backplanes; buffer storage; Computer buffers; data
                 communication; Data communication; de facto on-chip
                 data communication backplane; Delay; dynamic in-network
                 resource reservation techniques; Fabrics;
                 frequently-occurring network events; Interconnection
                 architectures; Interconnections (Subsystems); many-core
                 chip-multiprocessor systems; multiprocessor
                 interconnection networks; Network-on-a-chip; on-chip
                 data transaction latency; On-chip interconnection
                 networks; packet switching; packet-switched
                 interconnect fabric; Parallel Architectures; resource
                 allocation; router latency; run-time multihop data
                 buffering; Runtime; Scalability; System-on-a-chip;
                 telecommunication network routing; Throughput;
                 transaction-aware network-on-chip resource
                 reservation",
}

%%% Journal article, v7 n2 (July 2008), pp. 57--60; also indexed in the
%%% companion multithreading.bib (see bibsource). "{L3}" in the title is
%%% braced to protect the acronym from style recasing.
@Article{Fide:2008:PUS,
  author =       "S. Fide and S. Jenks",
  title =        "Proactive Use of Shared {L3} Caches to Enhance Cache
                 Communications in Multi-Core Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "57--60",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "The software and hardware techniques to exploit the
                 potential of multi-core processors are falling behind,
                 even though the number of cores and cache levels per
                 chip is increasing rapidly. There is no explicit
                 communications support available, and hence inter-core
                 communications depend on cache coherence protocols,
                 resulting in demand-based cache line transfers with
                 their inherent latency and overhead. In this paper, we
                 present software controlled eviction (SCE) to improve
                 the performance of multithreaded applications running
                 on multi-core processors by moving shared data to
                 shared cache levels before it is demanded from remote
                 private caches. Simulation results show that SCE offers
                 significant performance improvement (8-28\%) and
                 reduces L3 cache misses by 88-98\%.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache coherence protocol; cache communication; cache
                 storage; Concurrent computing; Control systems;
                 Degradation; Delay; demand-based cache line transfer;
                 Hardware; intercore communications; microprocessor
                 chips; Multi-core/single-chip multiprocessors;
                 multi-threading; Multicore processing; multicore
                 processors; multithreaded application; Parallel
                 processing; Protocols; shared L3 cache; shared memory
                 systems; software controlled eviction; Software
                 performance; Support for multi-threaded execution",
}

%%% Fix: abstract typo "current NoCS" -> "current NoCs"; the acronym is
%%% introduced as "Network-on-chips (NoCs)" in the first sentence of this
%%% same abstract, and "NoCs" is used elsewhere in it.
@Article{Walter:2008:BBE,
  author =       "I. Walter and I. Cidon and A. Kolodny",
  title =        "{BENoC}: A Bus-Enhanced Network on-Chip for a Power
                 Efficient {CMP}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "61--64",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Network-on-chips (NoCs) outperform buses in terms of
                 scalability, parallelism and system modularity and
                 therefore are considered as the main interconnect
                 infrastructure in future chip multi-processor (CMP).
                 However, while NoCs are very efficient for delivering
                 high throughput point-to-point data from sources to
                 destinations, their multi-hop operation is too slow for
                 latency sensitive signals. In addition, current NoCs
                 are inefficient for broadcast operations and
                 centralized control of CMP resources. Consequently,
                 state-of-the-art NoCs may not facilitate the needs of
                 future CMP systems. In this paper, the benefit of
                 adding a low latency, customized shared bus as an
                 internal part of the NoC architecture is explored.
                 BENoC (bus-enhanced network on-chip) possesses two main
                 advantages: First, the bus is inherently capable of
                 performing broadcast transmission in an efficient
                 manner. Second, the bus has lower and more predictable
                 propagation latency. In order to demonstrate the
                 potential benefit of the proposed architecture, an
                 analytical comparison of the power saving in BENoC
                 versus a standard NoC providing similar services is
                 presented. Then, simulation is used to evaluate BENoC
                 in a dynamic non-uniform cache access (DNUCA)
                 multiprocessor system.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "broadcast transmission; Broadcasting; bus-enhanced
                 network-on-chip; Centralized control; chip
                 multiprocessor; Delay; dynamic nonuniform cache access;
                 integrated circuit interconnections; interconnect
                 infrastructure; Interconnection architectures;
                 low-power electronics; microprocessor chips;
                 multiprocessing systems; Multiprocessing systems;
                 Multiprocessor interconnection networks;
                 Network-on-a-chip; network-on-chip; NoC; On-chip
                 interconnection networks; power efficient CMP; Power
                 system interconnection; propagation latency;
                 Scalability; system buses; System-on-a-chip;
                 Throughput",
}

%%% Journal article, v7 n2 (July 2008), pp. 65--68. "{DDMR}" in the title is
%%% braced to protect the acronym from style recasing.
@Article{Golander:2008:DDS,
  author =       "A. Golander and S. Weiss and R. Ronen",
  title =        "{DDMR}: Dynamic and Scalable Dual Modular Redundancy
                 with Short Validation Intervals",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "65--68",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "DMR (dual modular redundancy) was suggested for
                 increasing reliability. Classical DMR consists of pairs
                 of cores that check each other and are pre-connected
                 during manufacturing by dedicated links. In this paper
                 we introduce the dynamic dual modular redundancy (DDMR)
                 architecture. DDMR supports run-time scheduling of
                 redundant threads, which has significant benefits
                 relative to static binding. To allow dynamic pairing,
                 DDMR replaces the special links with a novel ring
                 architecture. DDMR uses short instruction sequences for
                 validation, smaller than the processor reorder buffer.
                 Such short sequences reduce latencies in parallel
                 programs and save resources needed to buffer
                 uncommitted data. DDMR scales with the number of cores
                 and may be used in large multicore architectures.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "buffer storage; DDMR; Delay; dynamic dual modular
                 redundancy; Job shop scheduling; Joining processes;
                 Manufacturing; Multi-core/single-chip multiprocessors;
                 multicore architectures; Multicore processing; parallel
                 architectures; parallel programs; processor reorder
                 buffer; processor scheduling; Processor scheduling;
                 Proposals; Redundancy; Redundant design; ring
                 architecture; run-time scheduling; scalable dual
                 modular redundancy; short validation intervals;
                 Transistors",
}

%%% Back-matter item (author instructions), v7 n2 (July 2008); "c3" page
%%% numbering denotes inside back cover.
@Article{Anonymous:2008:IA,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "c3--c3",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides instructions and guidelines to prospective
                 authors who wish to submit manuscripts.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Back-matter item (society officer listing), v7 n2 (July 2008); "c4"
%%% page numbering denotes outside back cover.
@Article{Anonymous:2008:ICS,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover 4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "7",
  number =       "2",
  pages =        "c4--c4",
  month =        jul,
  year =         "2008",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 05:49:19 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Journal article, v8 n1 (Jan/Jun 2009), pp. 1--4; carries extra
%%% Web-of-Science-derived fields (da, doc-delivery-number, times-cited, ...).
%%% NOTE(review): ORCID-numbers and researcherid-numbers name "Lin, Binshan",
%%% but the author field says "Bill Lin" (UCSD per affiliation) -- this looks
%%% like a wrong-person identifier match; verify before relying on it.
@Article{Ramanujam:2009:WRR,
  author =       "Rohit Sunkam Ramanujam and Bill Lin",
  title =        "Weighted Random Routing on Torus Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we introduce a new closed-form
                 oblivious routing algorithm called W2TURN that is
                 worst-case throughput optimal for 2D-torus networks.
                 W2TURN is based on a weighted random selection of paths
                 that contain at most two turns. In terms of average hop
                 count, W2TURN outperforms the best previously known
                 closed-form worst-case throughput optimal routing
                 algorithm called IVAL [7]. In addition, we present a
                 new optimal weighted random routing algorithm for rings
                 called WRD.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ramanujam, RS (Reprint Author), Univ Calif San Diego,
                 San Diego, CA 92103 USA. Ramanujam, Rohit Sunkam; Lin,
                 Bill, Univ Calif San Diego, San Diego, CA 92103 USA.",
  author-email = "rsunkamr@ucsd.edu billlin@ucsd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "2D-torus networks; Algorithm design and analysis;
                 closed-form oblivious routing algorithm; Data
                 communications; Delay; Interconnection network;
                 internetworking; IVAL; latency; Measurement;
                 Multiprocessor interconnection networks;
                 Network-on-a-chip; oblivious routing; Oblivious
                 Routing; On-chip interconnection networks; optimal
                 weighted random routing algorithm; Routing; Runtime;
                 System recovery; telecommunication network routing;
                 throughput; Throughput; torus network; Torus Network;
                 W2TURN; weighted random path selection",
  number-of-cited-references = "8",
  ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
  research-areas = "Computer Science",
  researcherid-numbers = "Lin, Binshan/A-9772-2009",
  times-cited =  "2",
  unique-id =    "Ramanujam:2009:WRR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, v8 n1 (Jan/Jun 2009), pp. 5--8; also indexed in the
%%% companion multithreading.bib (see bibsource).
%%% NOTE(review): affiliation places Hewlett Packard Labs in "Mississauga,
%%% ON, Canada" -- looks like a Web-of-Science address artifact; verify.
%%% NOTE(review): abstract reads "Dividing each memory modules" -- left
%%% as transcribed; check against the published abstract before correcting.
@Article{Ahn:2009:MDE,
  author =       "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber
                 and Norman P. Jouppi",
  title =        "Multicore {DIMM}: an Energy Efficient Memory Module
                 with Independently Controlled {DRAMs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2008.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Demand for memory capacity and bandwidth keeps
                 increasing rapidly in modern computer systems, and
                 memory power consumption is becoming a considerable
                 portion of the system power budget. However, the
                 current DDR DIMM standard is not well suited to
                 effectively serve CMP memory requests from both a power
                 and performance perspective. We propose a new memory
                 module called a Multicore DIMM, where DRAM chips are
                 grouped into multiple virtual memory devices, each of
                 which has its own data path and receives separate
                 commands (address and control signals). The Multicore
                 DIMM is designed to improve the energy efficiency of
                 memory systems with small impact on system performance.
                 Dividing each memory modules into 4 virtual memory
                 devices brings a simultaneous 22\%, 7.6\%, and 18\%
                 improvement in memory power, IPC, and system
                 energy-delay product respectively on a set of
                 multithreaded applications and consolidated
                 workloads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ahn, JH (Reprint Author), Hewlett Packard Labs,
                 Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber,
                 Robert S.; Jouppi, Norman P., Hewlett Packard Labs,
                 Mississauga, ON, Canada. Leverich, Jacob, Stanford
                 Univ, Stanford, CA 94305 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; CMP memory requests; Control systems; DDR
                 DIMM standard; DRAM; DRAM chips; Energy consumption;
                 Energy efficiency; energy efficiency; energy efficient
                 memory module; Energy-aware systems; Error correction
                 codes; independently controlled DRAM; Jacobian
                 matrices; memory capacity; memory module; memory power
                 consumption; Memory Structures; memory system;
                 microprocessor chips; Multicore; multicore DIMM;
                 Multicore processing; Proposals; Random access memory;
                 System performance; system power budget; virtual memory
                 devices",
  number-of-cited-references = "16",
  ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
  research-areas = "Computer Science",
  researcherid-numbers = "Ahn, Jung Ho/D-1298-2013",
  times-cited =  "26",
  unique-id =    "Ahn:2009:MDE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, v8 n1 (Jan/Jun 2009), pp. 9--12; includes funding
%%% acknowledgement fields from the Web-of-Science record.
@Article{Wang:2009:PST,
  author =       "Po-Han Wang and Yen-Ming Chen and Chia-Lin Yang and
                 Yu-Jung Cheng",
  title =        "A Predictive Shutdown Technique for {GPU} Shader
                 Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As technology continues to shrink, reducing leakage is
                 critical to achieve energy efficiency. Previous works
                 on low-power GPU (Graphics Processing Unit) focus on
                 techniques for dynamic power reduction, such as DVFS
                 (Dynamic Voltage/Frequency Scaling) and clock gating.
                 In this paper, we explore the potential of adopting
                 architecture-level power gating techniques for leakage
                 reduction on GPU. In particular, we focus on the most
                 power-hungry components, shader processors. We observe
                 that, due to different scene complexity, the required
                 shader resources to satisfy the target frame rate
                 actually vary across frames. Therefore, we propose the
                 Predictive Shader Shutdown technique to exploit
                 workload variation across frames for leakage reduction
                 on shader processors. The experimental results show
                 that Predictive Shader Shutdown achieves up to 46\%
                 leakage reduction on shader processors with negligible
                 performance degradation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, PH (Reprint Author), Natl Taiwan Univ, Dept Comp
                 Sci \& Informat Engn, Taipei 10764, Taiwan. Wang,
                 Po-Han; Chen, Yen-Ming; Yang, Chia-Lin, Natl Taiwan
                 Univ, Dept Comp Sci \& Informat Engn, Taipei 10764,
                 Taiwan. Cheng, Yu-Jung, Natl Taiwan Univ, Grad Inst
                 Networking \& Multimedia, Taipei 10764, Taiwan.",
  author-email = "r96002@csie.ntu.edu.tw r95125@csie.ntu.edu.tw
                 yangc@csie.ntu.edu.tw d96944002@ntu.edu.tw",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Institute for Information Industry of
                 Taiwan [97-FS-C03]; National Taiwan University
                 [97R0062-05]",
  funding-text = "This work was partially supported by the Institute for
                 Information Industry of Taiwan under project No.
                 97-FS-C03, and by the Excellent Research Projects of
                 National Taiwan University, 97R0062-05.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecture-level power gating techniques; Central
                 Processing Unit; Circuits; clock gating; Clocks;
                 computer architecture; computer graphic equipment;
                 Computer science; coprocessors; Degradation; dynamic
                 power reduction; Dynamic voltage scaling; dynamic
                 voltage-frequency scaling; Energy efficiency;
                 Energy-aware systems; Frequency; GPU; GPU shader
                 processors; Graphics; graphics processing unit; Layout;
                 leakage; Low-power design; power aware computing; power
                 gating; predictive shader shutdown technique",
  number-of-cited-references = "15",
  ORCID-numbers = "YANG, CHIA-LIN/0000-0003-0091-5027",
  research-areas = "Computer Science",
  times-cited =  "10",
  unique-id =    "Wang:2009:PST",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Journal article, v8 n1 (Jan/Jun 2009), pp. 13--16; also indexed in the
%%% companion multithreading.bib (see bibsource). "{XML}" and "{ADL}" in the
%%% title are braced to protect the acronyms from style recasing.
@Article{Barnes:2009:XBA,
  author =       "Christopher Barnes and Pranav Vaidya and Jaehwan John
                 Lee",
  title =        "An {XML}-Based {ADL} Framework for Automatic
                 Generation of Multithreaded Computer Architecture
                 Simulators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Computer architecture simulation has always played a
                 pivotal role in continuous innovation of computers.
                 However, constructing or modifying a high quality
                 simulator is time consuming and error-prone. Thus,
                 often Architecture Description Languages (ADLs) are
                 used to provide an abstraction layer for describing the
                 computer architecture and automatically generating
                 corresponding simulators. Along the line of such
                 research, we present a novel XML-based ADL, its
                 compiler, and a generation methodology to automatically
                 generate multithreaded simulators for computer
                 architecture. We utilize the industry-standard
                 extensible markup language XML to describe the
                 functionality and architecture of a modeled processor.
                 Our ADL framework allows users to easily and quickly
                 modify the structure, register set, and execution of a
                 modeled processor. To prove its validity, we have
                 generated several multithreaded simulators with
                 different configurations based on the MIPS five-stage
                 processor, and successfully tested with two programs.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "IUPUI RSFG",
  funding-text = "This research was funded by the IUPUI RSFG grant.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstraction layer; Architecture description languages;
                 automatic generation; C.0.d Modeling of computer
                 architecture; C.1.1.b Pipeline processors;
                 Computational modeling; computer architecture; Computer
                 architecture; Computer simulation; Concurrent
                 computing; extensible markup language-architecture
                 description language; Kernel; MIPS five-stage
                 processor; Modeling of computer architecture;
                 multi-threading; multithreaded computer architecture
                 simulator; Object oriented modeling; Pipeline
                 processors; Pipelines; program compilers; program
                 verification; Testing; validity testing; XML; XML-based
                 ADL framework",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Barnes:2009:XBA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Luque:2009:CAC,
  author =       "Carlos Luque and Miquel Moreto and Francisco J.
                 Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu
                 and Mateo Valero",
  title =        "{CPU} Accounting in {CMP} Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Chip-MultiProcessors (CMP) introduce complexities when
                 accounting CPU utilization to processes because the
                 progress done by a process during an interval of time
                 highly depends on the activity of the other processes
                 it is co-scheduled with. We propose a new hardware
                 accounting mechanism to improve the accuracy when
                 measuring the CPU utilization in CMPs and compare it
                 with the previous accounting mechanisms. Our results
                 show that currently known mechanisms could lead to a
                 12\% average error when it comes to CPU utilization
                 accounting. Our proposal reduces this error to less
                 than 1\% in a modeled 4-core processor system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Luque, C (Reprint Author), Univ Politecn Cataluna,
                 E-08028 Barcelona, Spain. Luque, Carlos; Moreto,
                 Miquel; Valero, Mateo, Univ Politecn Cataluna, E-08028
                 Barcelona, Spain. Cazorla, Francisco J.; Valero, Mateo,
                 Barcelona Supercomp Ctr, Barcelona, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Ministry of Science and Technology of Spain
                 [TIN-2007-60625, BES-2008-003683, AP-2005-3318]; HiPEAC
                 Network of Excellence [IST-004408]; IBM Research; IBM
                 Deep Computing organizations",
  funding-text = "This work has been supported by the Ministry of
                 Science and Technology of Spain under contract
                 TIN-2007-60625 and grants BES-2008-003683 and
                 AP-2005-3318, by the HiPEAC Network of Excellence
                 (IST-004408) and a Collaboration Agreement between IBM
                 and BSC with funds from IBM Research and IBM Deep
                 Computing organizations. The authors would like to
                 thank Pradip Bose and Chen-Yong Cher from IBM for their
                 technical support.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "4-core processor system; Bandwidth; Cache memory;
                 chip-multiprocessor architecture; Clocks; CMP processor
                 system; CPU utilization accounting; data center;
                 General; Hardware; hardware accounting mechanism;
                 Hardware/software interfaces; Kernel; microprocessor
                 chips; Multi-core/single-chip multiprocessors;
                 multiprocessing systems; operating system task
                 scheduling; Operating systems; process scheduling;
                 processor scheduling; Proposals; resource allocation;
                 Semiconductor device measurement; Switches",
  number-of-cited-references = "11",
  oa =           "Green Published",
  ORCID-numbers = "Moreto Planas, Miquel/0000-0002-9848-8758 Cazorla,
                 Francisco/0000-0002-3344-376X Luque,
                 Carlos/0000-0003-0442-0785 Valero,
                 Mateo/0000-0003-2917-2482 Gioiosa,
                 Roberto/0000-0001-9430-2656",
  research-areas = "Computer Science",
  researcherid-numbers = "Moreto Planas, Miquel/C-1823-2016 Cazorla,
                 Francisco/D-7261-2016 Luque, Carlos/E-2110-2019 Valero,
                 Mateo/L-5709-2014",
  times-cited =  "5",
  unique-id =    "Luque:2009:CAC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Soteriou:2009:HTD,
  author =       "Vassos Soteriou and Rohit Sunkam Ramanujam and Bill
                 Lin and Li-Shiuan Peh",
  title =        "A High-Throughput Distributed Shared-Buffer {NoC}
                 Router",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Microarchitectural configurations of buffers in
                 routers have a significant impact on the overall
                 performance of an on-chip network (NoC). This buffering
                 can be at the inputs or the outputs of a router,
                 corresponding to an input-buffered router (IBR) or an
                 output-buffered router (OBR). OBRs are attractive
                 because they have higher throughput and lower queuing
                 delays under high loads than IBRs. However, a direct
                 implementation of OBRs requires a router speedup equal
                 to the number of ports, making such a design
                 prohibitive given the aggressive clocking and power
                 budgets of most NoC applications. In this letter, we
                 propose a new router design that aims to emulate an OBR
                 practically based on a distributed shared-buffer (DSB)
                 router architecture. We introduce innovations to
                 address the unique constraints of NoCs, including
                 efficient pipelining and novel flow control. Our DSB
                 design can achieve significantly higher bandwidth at
                 saturation, with an improvement of up to 20\% when
                 compared to a state-of-the-art pipelined IBR with the
                 same amount of buffering, and our proposed
                 microarchitecture can achieve up to 94\% of the ideal
                 saturation throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ramanujam, Rohit Sunkam; Lin, Bill, Univ Calif San
                 Diego, San Diego, CA 92103 USA. Peh, Li-Shiuan,
                 Princeton Univ, Princeton, NJ 08544 USA.",
  author-email = "vassos.soteriou@cut.ac.cy rsunkamr@ucsd.edu
                 billlin@ucsd.edu peh@princeton.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; buffer circuits; Clocks; Computer
                 architecture; configuration management; Delay;
                 distributed shared-buffer; Interconnection
                 architectures; Internet; microarchitectural
                 configurations; Microarchitecture; network routing;
                 Network-on-a-chip; network-on-chip; NoC router; On-chip
                 interconnection networks; output-buffered router;
                 Pipeline processing; router architecture; Router
                 micro-architecture; Technological innovation;
                 Throughput",
  keywords-plus = "ARCHITECTURE",
  number-of-cited-references = "16",
  ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X Soteriou,
                 Vassos/0000-0002-2818-0459",
  research-areas = "Computer Science",
  researcherid-numbers = "Lin, Binshan/A-9772-2009 Soteriou,
                 Vassos/H-4603-2014",
  times-cited =  "15",
  unique-id =    "Soteriou:2009:HTD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Guz:2009:MCV,
  author =       "Zvika Guz and Evgeny Bolotin and Idit Keidar and
                 Avinoam Kolodny and Avi Mendelson and Uri C. Weiser",
  title =        "Many-Core vs. Many-Thread Machines: Stay Away From the
                 Valley",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We study the tradeoffs between Many-Core machines like
                 Intel's Larrabee and Many-Thread machines like Nvidia
                 and AMD GPGPUs. We define a unified model describing a
                 superposition of the two architectures, and use it to
                 identify operation zones for which each machine is more
                 suitable. Moreover, we identify an intermediate zone in
                 which both machines deliver inferior performance. We
                 study the shape of this ``performance valley'' and
                 provide insights on how it can be avoided.",
  acknowledgement = ack-nhfb,
  affiliation =  "Guz, Z (Reprint Author), Technion Israel Inst Technol,
                 EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar,
                 Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel
                 Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin,
                 Evgeny, Intel Corp, Santa Clara, CA 95051 USA.
                 Mendelson, Avi, Microsoft Corp, Redmond, WA 98052
                 USA.",
  author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com
                 idish@ee.technion.ac.il kolodny@ee.technion.ac.il
                 avim@microsoft.com uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Semiconductors Research Corporation (SRC);
                 Intel; Israeli Ministry of Science Knowledge Center on
                 Chip MultiProcessors",
  funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner.
                 This work was partially supported by Semiconductors
                 Research Corporation (SRC), Intel, and the Israeli
                 Ministry of Science Knowledge Center on Chip
                 MultiProcessors.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AMD GPGPU; architecture superposition; Bandwidth; Chip
                 Multiprocessors; Computer Systems; coprocessors; Delay;
                  Engines; Equations; GPGPU; Graphics; Intel's
                 Larrabee; many-core machines; many-thread machines;
                 Multi-core/single-chip multiprocessors;
                 multi-threading; multiprocessing systems; Nvidia GPGPU;
                 Parallel Architectures; parallel architectures;
                 Parallel processing; performance valley; Processor
                 Architectures; Shape",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "27",
  unique-id =    "Guz:2009:MCV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Desai:2009:AIC,
  author =       "Aniruddha Desai and Jugdutt Singh",
  title =        "Architecture Independent Characterization of Embedded
                 {Java} Workloads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/java2000.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "This paper presents architecture independent
                 characterization of embedded Java workloads based on
                 the industry standard GrinderBench benchmark which
                 includes different classes of real world embedded Java
                 applications. This work is based on a custom built
                 embedded Java Virtual Machine (JVM) simulator
                 specifically designed for embedded JVM modeling and
                 embodies domain specific details such as thread
                 scheduling, algorithms used for native CLDC APIs and
                 runtime data structures optimized for use in embedded
                 systems. The results presented include dynamic
                 execution characteristics, dynamic bytecode instruction
                 mix, application and API workload distribution, Object
                 allocation statistics, instruction-set coverage, memory
                 usage statistics and method code and stack frame
                 characteristics.",
  acknowledgement = ack-nhfb,
  affiliation =  "Desai, A (Reprint Author), La Trobe Univ, Bundoora,
                 Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt,
                 La Trobe Univ, Bundoora, Vic 3086, Australia.",
  author-email = "desai@ieee.org",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; application program
                 interfaces; architecture independent characterization;
                 CLDC API; custom built embedded Java virtual machine
                 simulator; data structures; Data structures; Design
                 optimization; dynamic bytecode instruction mix; dynamic
                 execution characteristics; embedded Java workload;
                  Embedded Systems; embedded systems;
                 industry standard GrinderBench benchmark; instruction
                 sets; instruction-set coverage; Java; Java bytecode;
                 Job shop scheduling; JVM; memory usage statistics;
                 method code characteristics; multi-threading; object
                 allocation statistics; Runtime; runtime data structure;
                 scheduling; Scheduling algorithm; stack frame
                 characteristics; Statistical distributions; storage
                 allocation; thread scheduling; virtual machines;
                 Virtual machining; Workload Characterization",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Desai:2009:AIC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Antelo:2009:CBF,
  author =       "Elisardo Antelo",
  title =        "A Comment on {``Beyond Fat-tree: Unidirectional
                 Load-Balanced Multistage Interconnection Network''}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "33--34",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  note =         "See \cite{GomezRequena:2008:BFT}.",
  abstract =     "A recent work proposed to simplify fat-trees with
                 adaptive routing by means of a load-balancing
                 deterministic routing algorithm. The resultant network
                 has performance figures comparable to the more complex
                 adaptive routing fat-trees when packets need to be
                 delivered in order. In a second work by the same
                 authors published in IEEE CAL, they propose to simplify
                 the fat-tree to a unidirectional multistage
                 interconnection network (UMIN), using the same
                 load-balancing deterministic routing algorithm. They
                 show that comparable performance figures are achieved
                 with much lower network complexity. In this comment we
                 show that the proposed load-balancing deterministic
                 routing is in fact the routing scheme used by the
                 butterfly network. Moreover we show that the properties
                 of the simplified UMIN network proposed by them are
                 intrinsic to the standard butterfly and other existing
                  UMINs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Antelo, E (Reprint Author), Univ Santiago de
                 Compostela, Dept Elect \& Comp Sci, Santiago De
                 Compostela, Spain. Univ Santiago de Compostela, Dept
                 Elect \& Comp Sci, Santiago De Compostela, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GC",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adaptive routing fat-trees; Bismuth; butterfly
                 network; Computer science; deterministic algorithms;
                 fat-tree; hypercube networks; Interconnection networks;
                 Interconnections (Subsystems); load balancing
                 deterministic routing algorithm; Logic functions;
                 Multiprocessor interconnection networks; Multistage
                 Interconnection networks; network complexity; Network
                 topology; packets; resource allocation; Routing;
                 Switches; Technological innovation; Topology;
                 unidirectional load-balanced multistage interconnection
                 network; unidirectional multistage interconnection
                 network",
  number-of-cited-references = "7",
  ORCID-numbers = "Antelo, Elisardo/0000-0003-3743-3689",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Antelo:2009:CBF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2009:Aa,
  author =       "Anonymous",
  title =        "[Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "35--35",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.38",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:AIC,
  author =       "Anonymous",
  title =        "Ad --- {IEEE Computer Society Digital Library}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "36--36",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.39",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:EBCa,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.41",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:FCa,
  author =       "Anonymous",
  title =        "[Front cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.40",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:IAa,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.42",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.43",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Gaudiot:2009:INE,
  author =       "Jean-Luc Gaudiot",
  title =        "Introducing the New {Editor-in-Chief} of
                 {{\booktitle{IEEE Computer Architecture Letters}}}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "37--38",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.60",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Gaudiot:2009:INE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Skadron:2009:LE,
  author =       "K. Skadron",
  title =        "Letter from the {Editor}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "39--39",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.61",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% NOTE: this entry appears to duplicate Skadron:2009:LE (same DOI
%%% 10.1109/L-CA.2009.61, volume, number, and pages 39--39); retained
%%% pending confirmation against the publisher record.
@Article{Skadron:2009:U,
  author =       "Kevin Skadron",
  title =        "Untitled",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "39--39",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.61",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Skadron:2009:U",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Xin:2009:ELI,
  author =       "Jing Xin and Russ Joseph",
  title =        "Exploiting Locality to Improve Circuit-level Timing
                 Speculation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "40--43",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.50",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Circuit-level timing speculation has been proposed as
                 a technique to reduce dependence on design margins,
                 eliminating power and performance overheads. Recent
                 work has proposed microarchitectural methods to
                 dynamically detect and recover from timing errors in
                 processor logic. This work has not evaluated or
                 exploited the disparity of error rates at the level of
                 static instructions. In this paper, we demonstrate
                 pronounced locality in error rates at the level of
                 static instructions. We propose timing error prediction
                 to dynamically anticipate timing errors at the
                 instruction-level and reduce the costly recovery
                 penalty. This allows us to achieve 43.6\% power savings
                 when compared to a baseline policy and incurs only
                 6.9\% performance penalty.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xin, J (Reprint Author), Northwestern Univ, Evanston,
                 IL 60208 USA. Xin, Jing; Joseph, Russ, Northwestern
                 Univ, Evanston, IL 60208 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0644332, CNS-0720820]",
  funding-text = "Manuscript submitted: 17-Sep-2009. Manuscript
                 accepted: 08-Oct-2009. Final manuscript received:
                 15-Oct-2009. We thank the anonymous reviewers for their
                 constructive feedback. This work was supported by NSF
                 awards CAREER CCF-0644332 and CNS-0720820.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Circuit faults; circuit reliability; circuit-level
                 timing speculation; Costs; Delay; Dynamic voltage
                 scaling; Error analysis; Error locality; Frequency;
                 Hardware; instruction sets; Logic; logic design;
                 low-power design; Low-power design; microarchitectural
                 methods; microprocessor chips; Pipelines; power
                 elimination; processor logic; reliability; Reliability;
                 static instruction level; Testing and Fault-Tolerance;
                 Timing; timing error prediction; timing speculation",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Xin:2009:ELI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Sudarsanam:2009:PPD,
  author =       "Arvind Sudarsanam and Ramachandra Kallam and Aravind
                 Dasu",
  title =        "{PRR--PRR} Dynamic Relocation",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "44--47",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.49",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Partial bitstream relocation (PBR) on FPGAs has been
                 gaining attention in recent years as a potentially
                 promising technique to scale parallelism of accelerator
                 architectures at run time, enhance fault tolerance,
                 etc. PBR techniques to date have focused on reading
                 inactive bitstreams stored in memory, on-chip or
                 off-chip, whose contents are generated for a specific
                 partial reconfiguration region (PRR) and modified on
                 demand for configuration into a PRR at a different
                 location. As an alternative, we propose a PRR-PRR
                 relocation technique to generate source and destination
                 addresses, read the bitstream from an active PRR
                 (source) in a non-intrusive manner, and write it to
                 destination PRR. We describe two options of realizing
                 this on Xilinx Virtex 4 FPGAs: (a) hardware-based
                 accelerated relocation circuit (ARC) and (b) a software
                 solution executed on Microblaze. A comparative
                 performance analysis to highlight the speed-up obtained
                 using ARC is presented. For real test cases,
                  performance of our implementations is compared to
                 estimated performances of two state of the art
                 methods.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sudarsanam, A (Reprint Author), Utah State Univ, Dept
                 Elect \& Comp Engn, Logan, UT 84321 USA. Sudarsanam,
                 Arvind; Kallam, Ramachandra; Dasu, Aravind, Utah State
                 Univ, Dept Elect \& Comp Engn, Logan, UT 84321 USA.",
  author-email = "arvind.sudarsanam@aggiemail.usu.edu
                 ramachandra.kallam@aggiemail.usu.edu
                 dasu@engineering.usu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NASA; Micron Research Center",
  funding-text = "Manuscript submitted: 03-Aug-2009. Manuscript
                 accepted: 16-Sep-2009. Final manuscript received:
                 24-Sep-2009. This work was supported by NASA and Micron
                 Research Center.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; Accelerator architectures; accelerator
                 architectures; Bioreactors; Circuits; destination
                 address; Emerging technologies; Fault tolerance; fault
                 tolerance; field programmable gate arrays; Field
                 programmable gate arrays; Filters; FPGAs; Hardware;
                 hardware-based accelerated relocation circuit; parallel
                 architecture; parallel architectures; Parallel
                 processing; partial bitstream relocation; Partial
                 dynamic reconfiguration; Partial dynamic relocation;
                 partial reconfiguration region; PBR techniques;
                 Performance analysis; Performance Analysis and Design
                 Aids; PRR-PRR dynamic relocation technique; PRR-PRR
                 relocation technique; Reconfigurable computing;
                 Reconfigurable hardware; source address; Xilinx Virtex
                 4 FPGA",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Sudarsanam:2009:PPD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Leverich:2009:PMD,
  author =       "Jacob Leverich and Matteo Monchiero and Vanish Talwar
                 and Partha Ranganathan and Christos Kozyrakis",
  title =        "Power Management of Datacenter Workloads Using
                 Per-Core Power Gating",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "48--51",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.46",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "While modern processors offer a wide spectrum of
                 software-controlled power modes, most datacenters only
                 rely on Dynamic Voltage and Frequency Scaling (DVFS,
                 a.k.a. P-states) to achieve energy efficiency. This
                 paper argues that, in the case of datacenter workloads,
                 DVFS is not the only option for processor power
                 management. We make the case for per-core power gating
                 (PCPG) as an additional power management knob for
                 multi-core processors. PCPG is the ability to cut the
                 voltage supply to selected cores, thus reducing to
                 almost zero the leakage power for the gated cores.
                 Using a testbed based on a commercial 4-core chip and a
                 set of real-world application traces from enterprise
                 environments, we have evaluated the potential of PCPG.
                 We show that PCPG can significantly reduce a
                 processor's energy consumption (up to 40\%) without
                 significant performance overheads. When compared to
                 DVFS, PCPG is highly effective saving up to 30\% more
                 energy than DVFS. When DVFS and PCPG operate together
                 they can save up to almost 60\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Leverich, J (Reprint Author), Hewlett Packard Labs,
                 Mississauga, ON, Canada. Leverich, Jacob; Monchiero,
                 Matteo; Talwar, Vanish; Ranganathan, Partha, Hewlett
                 Packard Labs, Mississauga, ON, Canada. Leverich, Jacob;
                 Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
                 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; computer centres; Costs; data
                 center workloads; dynamic voltage and frequency
                 scaling; Dynamic voltage scaling; Energy consumption;
                 energy efficiency; Energy management; Energy-aware
                 systems; enterprise environments; Frequency;
                 integration and modeling; Jacobian matrices; leakage
                 power; microprocessor chips; Multicore processing;
                 multicore processors; per-core power gating; power
                 consumption; Power supplies; processor energy
                 consumption; processor power management;
                 software-controlled power modes; System architectures;
                 Testing",
  number-of-cited-references = "10",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "43",
  unique-id =    "Leverich:2009:PMD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Musoll:2009:PVA,
  author =       "Enric Musoll",
  title =        "A Process-Variation Aware Technique for Tile-Based,
                 Massive Multicore Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "52--55",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.48",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Process variations in advanced nodes introduce
                 significant core-to-core performance differences in
                 single-chip multicore architectures. Isolating each
                 core with its own frequency and voltage island helps
                 improving the performance of the multi-core
                 architecture by operating at the highest frequency
                 possible rather than operating all the cores at the
                 frequency of the slowest core. However, inter-core
                 communication suffers from additional
                 cross-clock-domain latencies that can offset the
                 performance benefits. This work proposes the concept of
                 the configurable, variable-size frequency and voltage
                 domain, and it is described in the context of a
                 tile-based, massive multi-core architecture.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Clocks; computer architecture; Context;
                 cross-clock-domain latency; Delay; Frequency; intercore
                 communication; massive multi-core; massive multicore
                 processors; Multi-core/single-chip multiprocessors;
                 multicore architecture; Multicore processing;
                 Network-on-a-chip; network-on-chip; On-chip
                 interconnection networks; Performance gain; Process
                 design; process-variation aware architecture;
                 process-variation aware technique; Runtime; single-chip
                  multicore architectures; tile-based architecture;
                 tile-based multicore processors; variable-size
                 frequency domain; Voltage; voltage domain",
  number-of-cited-references = "5",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Musoll:2009:PVA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Baldassin:2009:CEC,
  author =       "Alexandro Baldassin and Felipe Klein and Guido Araujo
                 and Rodolfo Azevedo and Paulo Centoducatte",
  title =        "Characterizing the Energy Consumption of Software
                 Transactional Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "56--59",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.47",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The well-known drawbacks imposed by lock-based
                 synchronization have forced researchers to devise new
                 alternatives for concurrent execution, of which
                 transactional memory is a promising one. Extensive
                 research has been carried out on Software Transaction
                 Memory (STM), most of all concentrated on program
                 performance, leaving unattended other metrics of great
                 importance like energy consumption. This letter
                 presents a thorough evaluation of energy consumption in
                 a state-of-the-art STM. We show that energy and
                 performance results do not always follow the same trend
                 and, therefore, it might be appropriate to consider
                 different strategies depending on the focus of the
                 optimization. We also introduce a novel strategy based
                 on dynamic voltage and frequency scaling for contention
                 managers, revealing important energy and energy-delay
                 product improvements in high-contended scenarios. This
                 work is a first study towards a better understanding of
                 the energy consumption behavior of STM systems, and
                 could prompt STM designers to research new
                 optimizations in this area, paving the way for an
                 energy-aware transactional memory.",
  acknowledgement = ack-nhfb,
  affiliation =  "Baldassin, A (Reprint Author), Univ Estadual Campinas,
                 Inst Comp, Campinas, SP, Brazil. Baldassin, Alexandro;
                 Klein, Felipe; Araujo, Guido; Azevedo, Rodolfo;
                 Centoducatte, Paulo, Univ Estadual Campinas, Inst Comp,
                 Campinas, SP, Brazil.",
  author-email = "alebal@ic.unicamp.br klein@ic.unicamp.br
                 guido@ic.unicamp.br rodolfo@ic.unicamp.br
                 ducatte@ic.unicamp.br",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "FAPESP [2005/02565-9]",
  funding-text = "Manuscript submitted: 02-Jul-2009. Manuscript
                 accepted: 23-Jul-2009. Final manuscript received:
                 05-Aug-2009. This work was supported in part by FAPESP
                 (2005/02565-9).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Concurrent computing; Concurrent Programming; Content
                 management; Costs; Dynamic voltage scaling; Energy
                 Consumption; Energy consumption; energy consumption;
                 Energy management; Energy-aware systems; energy-delay
                 product improvements; frequency scaling; Frequency
                 synchronization; Hardware; lock-based synchronization;
                 Measurement techniques; Memory management;
                  multiprocessing systems; Multiprocessor Systems;
                  multiprocessor systems; Parallel Architectures;
                  parallel architectures; Power
                 Management; Software performance; software
                 transactional memory; synchronisation; transaction
                 processing; Transactional Memory",
  number-of-cited-references = "13",
  ORCID-numbers = "Azevedo, Rodolfo/0000-0002-8803-0401",
  research-areas = "Computer Science",
  researcherid-numbers = "Azevedo, Rodolfo/F-3008-2012",
  times-cited =  "3",
  unique-id =    "Baldassin:2009:CEC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Balfour:2009:ORE,
  author =       "James Balfour and R. Curtis Harting and William J.
                 Dally",
  title =        "Operand Registers and Explicit Operand Forwarding",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "60--63",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.45",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Operand register files are small, inexpensive register
                 files that are integrated with function units in the
                 execute stage of the pipeline, effectively extending
                 the pipeline operand registers into register files.
                 Explicit operand forwarding lets software
                 opportunistically orchestrate the routing of operands
                 through the forwarding network to avoid writing
                 ephemeral values to registers. Both mechanisms let
                 software capture short-term reuse and locality close to
                 the function units, improving energy efficiency by
                 allowing a significant fraction of operands to be
                 delivered from inexpensive registers that are
                 integrated with the function units. An evaluation shows
                 that capturing operand bandwidth close to the function
                 units allows operand registers to reduce the energy
                 consumed in the register files and forwarding network
                 of an embedded processor by 61\%, and allows explicit
                 forwarding to reduce the energy consumed by 26\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Balfour, J (Reprint Author), Stanford Univ, Comp Syst
                 Lab, Stanford, CA 94305 USA. Balfour, James; Harting,
                 R. Curtis; Dally, William J., Stanford Univ, Comp Syst
                 Lab, Stanford, CA 94305 USA.",
  author-email = "jbalfour@cva.stanford.edu dally@cva.stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Code generation; Computer aided
                 instruction; Computer System Implementation; Computer
                  Systems Organization; embedded processor; Energy capture;
                 energy consumption; energy efficient register
                 organization; explicit operand forwarding; explicit
                 operand forwarding network; Fixed-point arithmetic;
                 impact of technology trends; Impact of VLSI on system
                 design; Laboratories; Logic; low-power programmable
                 processors; Memory hierarchy; microprocessor chips;
                 operand bandwidth; operand register files; operand
                 registers; Optimization; Physically aware
                 micro-architecture: power; Pipelines; Real-time and
                 embedded systems; Registers; Routing; software
                 reusability; thermal; VLSI Systems; Writing",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Balfour:2009:ORE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Chiou:2009:AFF,
  author =       "Derek Chiou and Hari Angepat and Nikhil A. Patil and
                 Dam Sunwoo",
  title =        "Accurate Functional-First Multicore Simulators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "64--67",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.44",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Fast and accurate simulation of multicore systems
                 requires a parallelized simulator. This paper describes
                 a novel method to build parallelizable and
                 cycle-accurate-capable functional-first simulators of
                 multicore targets.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chiou, D (Reprint Author), Univ Texas Austin, Dept
                 Elect \& Comp Engn, Austin, TX 78712 USA. Chiou, Derek;
                 Angepat, Hari; Patil, Nikhil A.; Sunwoo, Dam, Univ
                 Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712
                 USA.",
  author-email = "derek@ece.utexas.edu angepat@ece.utexas.edu
                 npatil@ece.utexas.edu sunwoo@ece.utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "V17GD",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [0615352,
                 0747438]",
  funding-text = "This material is based upon work supported by the
                 National Science Foundation under Grants No. 0615352
                 and No. 0747438 and gifts from Intel and IBM. We thank
                 the anonymous reviewers for their comments.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "circuit simulation; Computational modeling; Computer
                 simulation; field programmable gate arrays;
                 FPGA-accelerated simulation technologies;
                 functional-first multicore simulators; Instruction
                 sets; integration and modeling; Microarchitecture;
                 Modeling and Visualization; Modeling of computer
                 architecture; Modeling techniques;
                 Multi-core/single-chip multiprocessors; Multicore
                 processing; multicore system simulation; Parallel;
                 Parallel Architectures; parallelized simulator;
                 Performance Analysis and Design Aids; Predictive
                 models; Simulation; Software prototyping; System
                 architectures; Timing; Virtual machining; Virtual
                 prototyping",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Chiou:2009:AFF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2009:Ab,
  author =       "Anonymous",
  title =        "[Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "68--68",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.52",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Ac,
  author =       "Anonymous",
  title =        "[Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "69--69",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.53",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Ad,
  author =       "Anonymous",
  title =        "[Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "70--70",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.55",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Ae,
  author =       "Anonymous",
  title =        "[Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "71--71",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.54",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:Af,
  author =       "Anonymous",
  title =        "[Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "72--72",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.51",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:EBCb,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.57",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:FCb,
  author =       "Anonymous",
  title =        "[Front cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c1--c1",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.56",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:IAb,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.58",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2009:ICSb,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "8",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2009",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2009.59",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Patil:2010:URT,
  author =       "Shruti Patil and David J. Lilja",
  title =        "Using Resampling Techniques to Compute Confidence
                 Intervals for the Harmonic Mean of Rate-Based
                 Performance Metrics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Rate-based metrics such as floating point operations
                 per second, instructions per cycle and so forth are
                 commonly used to measure computer performance. In
                 addition to the average or mean performance of the
                 metric, indicating the precision of the mean using
                 confidence intervals helps to make informed decisions
                 and comparisons with the data. In this paper, we
                 discuss the determination of confidence intervals for
                 the harmonic mean of rate-based metrics using two
                  statistical resampling techniques, Jackknife and
                 Bootstrap. We show using Monte Carlo simulations that
                 resampling indeed works as expected, and can be used
                 for generating confidence intervals for harmonic
                 mean.",
  acknowledgement = ack-nhfb,
  affiliation =  "Patil, S (Reprint Author), Univ Minnesota Twin Cities,
                 Dept Elect \& Comp Engn, St Paul, MN USA. Patil,
                 Shruti; Lilja, David J., Univ Minnesota Twin Cities,
                 Dept Elect \& Comp Engn, St Paul, MN USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-0541162]",
  funding-text = "This work was supported in part by the National
                 Science Foundation grant no. CCF-0541162. Any opinions,
                 findings and conclusions or recommendations expressed
                 in this material are those of the authors and do not
                 necessarily reflect the views of the NSF. The authors
                 also thank the University of Minnesota Statistical
                 Consulting Service for their helpful insights.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arithmetic; bootstrap; bootstrap technique; Cities and
                 towns; Computer errors; Computer performance; computer
                 performance measurement; Confidence intervals;
                 confidence intervals; Electric variables measurement;
                 Equations; floating point operations; Harmonic
                 analysis; harmonic mean; jackknife; jackknife
                 technique; Monte Carlo methods; Monte Carlo
                 simulations; Nonparametric statistics; Performance
                 analysis; performance evaluation; Performance of
                 Systems; Probability distribution; rate-based
                 performance metrics; resampling; statistical analysis;
                 statistical resampling techniques; Statistics",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Patil:2010:URT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Seznec:2010:PCM,
  author =       "Andre Seznec",
  title =        "A Phase Change Memory as a Secure Main Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/prng.bib",
  abstract =     "Phase change memory (PCM) technology appears as more
                 scalable than DRAM technology. As PCM exhibits access
                 time slightly longer but in the same range as DRAMs,
                 several recent studies have proposed to use PCMs for
                 designing main memory systems. Unfortunately PCM
                 technology suffers from a limited write endurance;
                  typically each memory cell can only be written a large
                  but still limited number of times ($10^7$ to $10^9$
                  writes are reported for current technology). Till now,
                 research proposals have essentially focused their
                 attention on designing memory systems that will survive
                 to the average behavior of conventional applications.
                 However PCM memory systems should be designed to
                 survive worst-case applications, i.e., malicious
                 attacks targeting the physical destruction of the
                 memory through overwriting a limited number of memory
                 cells.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seznec, A (Reprint Author), INRIA Rennes Bretagne
                 Atlantique, Ctr Rech, Campus Beaulieu, F-35042 Rennes,
                 France. INRIA Rennes Bretagne Atlantique, Ctr Rech,
                 F-35042 Rennes, France.",
  author-email = "seznec@irisa.fr",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "European Commission [27648]",
  funding-text = "This work was partially supported by the European
                 Commission in the context of the SARC integrated
                 project \#27648 (FP6).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application software; DRAM technology; Energy
                 consumption; memory cells; Memory Structures; PCM
                 memory systems; Phase change materials; phase change
                 memories; phase change memory; Phase change memory;
                 Physics computing; Proposals; Random access memory;
                 Random number generation; Random processes;
                 Scalability; secure PCM-based main memory;
                 Semiconductor Memories",
  keywords-plus = "TECHNOLOGY",
  number-of-cited-references = "8",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "17",
  unique-id =    "Seznec:2010:PCM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Park:2010:EIP,
  author =       "Seon-yeong Park and Euiseong Seo and Ji-Yong Shin and
                 Seungryoul Maeng and Joonwon Lee",
  title =        "Exploiting Internal Parallelism of Flash-based
                 {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "For the last few years, the major driving force behind
                 the rapid performance improvement of SSDs has been the
                 increment of parallel bus channels between a flash
                 controller and flash memory packages inside the
                 solid-state drives (SSDs). However, there are other
                 internal parallelisms inside SSDs yet to be explored.
                 In order to improve performance further by utilizing
                 the parallelism, this paper suggests request
                 rescheduling and dynamic write request mapping.
                 Simulation results with real workloads have shown that
                 the suggested schemes improve the performance of the
                 SSDs by up to 15\% without any additional hardware
                 support.",
  acknowledgement = ack-nhfb,
  affiliation =  "Park, SY (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Taejon, South Korea. Park, Seon-yeong; Shin,
                 Ji-Yong; Maeng, Seungryoul, Korea Adv Inst Sci \&
                 Technol, Taejon, South Korea. Seo, Euiseong, Ulsan Natl
                 Inst Sci \& Technol, Ulsan, South Korea. Lee, Joonwon,
                 Sungkyunkwan Univ, Seoul, South Korea.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Korea government(MEST) [2009-0080381]",
  funding-text = "This work was supported by the Korea Science and
                 Engineering Foundation (KOSEF) grant funded by the
                 Korea government (MEST), (No. 2009-080381)",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Delay; Drives; exploiting internal parallelism; flash
                 based SSD; flash controller; flash memories; Flash
                 memory; flash memory packages; Force control; Hard
                 disks; I/O scheduling; Input/Output Devices; Packaging;
                 parallel bus channels; parallel processing; Parallel
                 systems; parallelism; pipeline processing; Pipeline
                 processing; Secondary storage; Simulation; Solid state
                 circuits; solid state drives; Solid-State Drives
                 (SSDs); Space technology; Storage Management; system
                 buses; Throughput",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
  times-cited =  "35",
  unique-id =    "Park:2010:EIP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Subramoni:2010:ISI,
  author =       "Hari Subramoni and Fabrizio Petrini and Virat Agarwal
                 and Davide Pasetto",
  title =        "Intra-Socket and Inter-Socket Communication in
                 Multi-core Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The increasing computational and communication demands
                 of the scientific and industrial communities require a
                 clear understanding of the performance trade-offs
                 involved in multi-core computing platforms. Such
                 analysis can help application and toolkit developers in
                 designing better, topology aware, communication
                 primitives intended to suit the needs of various high
                 end computing applications. In this paper, we take on
                 the challenge of designing and implementing a portable
                 intra-core communication framework for streaming
                 computing and evaluate its performance on some popular
                 multi-core architectures developed by Intel, AMD and
                 Sun. Our experimental results, obtained on the Intel
                 Nehalem, AMD Opteron and Sun Niagara 2 platforms, show
                 that we are able to achieve an intra-socket small
                 message latency between 120 and 271 nanoseconds, while
                 the inter-socket small message latency is between 218
                 and 320 nanoseconds. The maximum intra-socket
                 communication bandwidth ranges from 0.179 (Sun Niagara
                 2) to 6.5 (Intel Nehalem) Gbytes/second. We were also
                 able to obtain an inter-socket communication
                 performance of 1.2 and 6.6 Gbytes/second on the AMD
                 Opteron and Intel Nehalem, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Subramoni, H (Reprint Author), IBM TJ Watson, Yorktown
                 Hts, NY 10598 USA. Subramoni, Hari; Petrini, Fabrizio;
                 Agarwal, Virat, IBM TJ Watson, Yorktown Hts, NY 10598
                 USA. Pasetto, Davide, IBM Computat Sci Ctr, Dublin,
                 Ireland. Subramoni, Hari, Ohio State Univ, Columbus, OH
                 43210 USA.",
  author-email = "subramon@cse.ohio-state.edu fpetrin@us.ibm.com
                 viratagarwal@us.ibm.com pasetto\_davide@ie.ibm.com",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AMD Opteron; Bandwidth; Communication industry;
                 communication primitives; Communication Protocols;
                 Computer applications; Computer architecture; Computer
                 industry; Delay; General; Hardware; High Performance
                 Computing; industrial communities; Intel Nehalem;
                 intersocket communication; Intrasocket communication;
                 multicore architectures; Multicore Processors;
                 multicore systems; multiprocessing systems; parallel
                 architectures; Performance of Systems; Portable
                 computers; streaming computing; Sun; toolkit
                 developers; Topology; topology aware",
  keywords-plus = "NETWORK",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Subramoni:2010:ISI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hoang:2010:CAN,
  author =       "Giang Hoang and Chang Bae and John Lange and Lide
                 Zhang and Peter Dinda and Russ Joseph",
  title =        "A Case for Alternative Nested Paging Models for
                 Virtualized Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/hash.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Address translation often emerges as a critical
                 performance bottleneck for virtualized systems and has
                 recently been the impetus for hardware paging
                 mechanisms. These mechanisms apply similar translation
                 models for both guest and host address translations. We
                 make an important observation that the model employed
                 to translate from guest physical addresses (GPAs) to
                 host physical addresses (HPAs) is in fact orthogonal to
                 the model used to translate guest virtual addresses
                 (GVAs) to GPAs. Changing this model requires VMM
                 cooperation, but has no implications for guest OS
                 compatibility. As an example, we consider a hashed page
                  table approach for GPA $\rightarrow$ HPA translation.
                  Nested paging, widely considered the most promising
                  approach, uses unhashed multi-level forward page tables
                  for both GVA $\rightarrow$ GPA and GPA $\rightarrow$
                  HPA translations, resulting in a potential $O(n^2)$
                  page walk cost on a TLB miss, for $n$-level page
                  tables. In contrast, the hashed page table approach
                  results in an expected $O(n)$ cost. Our
                 simulation results show that when a hashed page table
                 is used in the nested level, the performance of the
                 memory system is not worse, and sometimes even better
                 than a nested forward-mapped page table due to reduced
                 page walks and cache pressure. This showcases the
                 potential for alternative paging mechanisms.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hoang, GA (Reprint Author), Northwestern Univ,
                 Evanston, IL 60208 USA. Hoang, Giang; Bae, Chang;
                 Lange, John; Dinda, Peter; Joseph, Russ, Northwestern
                 Univ, Evanston, IL 60208 USA. Zhang, Lide, Univ
                 Michigan, Ann Arbor, MI 48109 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address translation; Computer Architecture; Computer
                 architecture; Computer Architecture; Computer displays;
                 Control systems; Costs; Emerging technologies; file
                 organisation; guest physical addresses; guest virtual
                 addresses; Hardware; hardware paging mechanisms;
                 Hardware/software interfaces; host physical addresses;
                 Instruction sets; Nested Paging; nested paging models;
                 Operating systems; OS compatibility; paged storage;
                 Platform virtualization; Software performance; storage
                 allocation; unhashed multilevel forward page tables;
                 virtual machine monitors; Virtual machine monitors;
                 virtual machines; Virtual Memory; Virtualization;
                 virtualized systems; VMM cooperation",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  researcherid-numbers = "Joseph, Russell/B-7230-2009 Dinda,
                 Peter/B-7142-2009",
  times-cited =  "5",
  unique-id =    "Hoang:2010:CAN",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Krimer:2010:SNT,
  author =       "Evgeni Krimer and Robert Pawlowski and Mattan Erez and
                 Patrick Chiang",
  title =        "{Synctium}: a Near-Threshold Stream Processor for
                 Energy-Constrained Parallel Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "While Moore's law scaling continues to double
                 transistor density every technology generation, supply
                 voltage reduction has essentially stopped, increasing
                 both power density and total energy consumed in
                 conventional microprocessors. Therefore, future
                 processors will require an architecture that can: (a)
                 take advantage of the massive amount of transistors
                 that will be available; and (b) operate these
                 transistors in the near-threshold supply domain,
                 thereby achieving near optimal energy/computation by
                 balancing the leakage and dynamic energy consumption.
                 Unfortunately, this optimality is typically achieved
                 while running at very low frequencies (i.e.,
                 0.1--10MHz) and with only one computation executing per
                 cycle, such that performance is limited. Further,
                 near-threshold designs suffer from severe process
                 variability that can introduce extremely large delay
                 variations. In this paper, we propose a near
                 energy-optimal, stream processor family that relies on
                 massively parallel, near-threshold VLSI circuits and
                 interconnect, incorporating cooperative
                 circuit/architecture techniques to tolerate the
                 expected large delay variations. Initial estimations
                 from circuit simulations show that it is possible to
                 achieve greater than 1 Giga-Operations per second
                 (1GOP/s) with less than 1mW total power consumption,
                 enabling a new class of energy-constrained,
                 high-throughput computing applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Krimer, E (Reprint Author), UT Austin, ECE, Austin, TX
                 USA. Krimer, Evgeni; Erez, Mattan, UT Austin, ECE,
                 Austin, TX USA. Pawlowski, Robert; Chiang, Patrick,
                 Oregon State Univ, EECS, Corvallis, OR 97331 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Circuits; Computer architecture; conventional
                 microprocessors; Delay; double transistor density;
                 dynamic energy consumption; energy constrained parallel
                 applications; Energy consumption; etc.; Frequency;
                 impact of technology trends; Low-power design;
                 Microprocessors; Mobile processors; Moore's Law; near
                 threshold stream processor; optimisation; parallel
                 programming; Physically aware micro-architecture:
                 power; pipeline processing; Power generation; SIMD
                 processors; supply voltage reduction; Synctium;
                 thermal; Very large scale integration; VLSI circuits;
                 Voltage",
  keywords-plus = "CIRCUITS; TOLERANCE; CMOS",
  number-of-cited-references = "19",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "22",
  unique-id =    "Krimer:2010:SNT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hilton:2010:SDE,
  author =       "Andrew Hilton and Amir Roth",
  title =        "{SMT-Directory}: Efficient Load-Load Ordering for
                 {SMT}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Memory models like SC, TSO, and PC enforce load-load
                 ordering, requiring that loads from any single thread
                 appear to occur in program order to all other threads.
                 Out-of-order execution can violate load-load ordering.
                 Conventional multi-processors with out-of-order cores
                 detect load-load ordering violations by snooping an
                 age-ordered load queue on cache invalidations or
                  evictions---events that act as proxies for the completion
                 of remote stores. This mechanism becomes less efficient
                 in an SMT processor, as every completing store must
                 search the loads queue segments of all other threads.
                 This inefficiency exists because store completions from
                 other threads in the same core are not filtered by the
                 cache and coherence protocol: thread 0 observes all of
                 thread 1's stores, not only the first store to every
                 cache line. SMT-Directory eliminates this overhead by
                 implementing the filtering traditionally provided by
                 the cache in the cache itself. SMT-Directory adds a
                 per-thread ``read'' bit to every data cache line. When
                 a load executes, it sets the bit corresponding to its
                  thread. When a store completes and writes to the cache,
                 it checks the SMT-Directory bits of its cache line and
                 searches the load queue segments only of those threads
                 whose bits are set. As a result, local store
                 completions trigger searches only for data that is
                 actually shared.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hilton, A (Reprint Author), Univ Penn, Philadelphia,
                 PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn,
                 Philadelphia, PA 19104 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0541292]",
  funding-text = "We thank Arun Raghavan for the address traces and Milo
                 Martin for comments on early versions of this work. The
                 anonymous reviewers provided valuable feedback. This
                 work was supported by NSF award CCF-0541292.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "age-ordered load queue; Buffer storage; cache
                 invalidations; cache protocol; cache storage; coherence
                 protocol; consistency models; data cache line;
                 directory; Filtering; Load modeling; load queue search;
                 load queue segments; load-load ordering; Memory
                 hierarchy; multi-threading; multiprocessing systems;
                 Multithreaded processors; Multithreading; Out of order;
                 Protocols; Read-write memory; Simultaneous
                 multithreading; SMT processor; Surface-mount
                 technology; Writing",
  keywords-plus = "CONSISTENCY",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Hilton:2010:SDE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Hammoud:2010:DPA,
  author =       "Mohammad Hammoud and Sangyeun Cho and Rami G. Melhem",
  title =        "A Dynamic Pressure-Aware Associative Placement
                 Strategy for Large Scale Chip Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes dynamic pressure-aware
                 associative placement (DPAP), a novel distributed cache
                 management scheme for large-scale chip multiprocessors.
                 Our work is motivated by the large non-uniform
                 distribution of memory accesses across cache sets in
                 different L2 banks. DPAP decouples the physical
                 locations of cache blocks from their addresses for the
                 sake of reducing misses caused by destructive
                 interferences. Temporal pressure at the on-chip
                  last-level cache is continuously collected at a group
                 (comprised of local cache sets) granularity, and
                 periodically recorded at the memory controller(s) to
                 guide the placement process. An incoming block is
                 consequently placed at a cache group that exhibits the
                 minimum pressure. Simulation results using a
                 full-system simulator demonstrate that DPAP outperforms
                 the baseline shared NUCA scheme by an average of 8.3\%
                 and by as much as 18.9\% for the benchmark programs we
                 examined. Furthermore, evaluations showed that DPAP
                 outperforms related cache designs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hammoud, M (Reprint Author), Univ Pittsburgh, Dept
                 Comp Sci, Pittsburgh, PA 15260 USA. Hammoud, Mohammad;
                 Cho, Sangyeun; Melhem, Rami G., Univ Pittsburgh, Dept
                 Comp Sci, Pittsburgh, PA 15260 USA.",
  author-email = "mhh@cs.pitt.edu cho@cs.pitt.edu melhem@cs.pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0952273]",
  funding-text = "This work was supported in part by NSF grant
                 CCF-0952273.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Aggregate Cache Sets; Aggregates; Associative
                 Placement; cache storage; Chip Multiprocessors;
                 Computer architecture; Computer science; destructive
                 interferences; distributed cache management; DPAP;
                 dynamic pressure aware associative placement strategy;
                 Interference; large scale chip multiprocessors;
                 Large-scale systems; Local Cache Sets; memory access
                 distribution; memory controllers; microprocessor chips;
                 Network-on-a-chip; NUCA scheme; Pressure control;
                 Pressure-Aware Placement; Random access memory",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Hammoud:2010:DPA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2010:LUC,
  author =       "Hyungjun Kim and Paul V. Gratz",
  title =        "Leveraging Unused Cache Block Words to Reduce Power in
                 {CMP} Interconnect",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power is of paramount importance in modern computer
                 system design. In particular, the cache interconnect in
                 future CMP designs is projected to consume up to half
                 of the system power for cache fills and spills [8].
                 Despite the power consumed by spills and fills, a
                 significant percentage of each cache line is unused
                 prior to eviction from the cache. If unused cache block
                 words can be identified, this information can be used
                 to improve CMP interconnect power and energy
                 consumption. We propose a new method of CMP
                 interconnect packet composition, leveraging unused data
                 to reduce power. These methods are well suited to
                 interconnection networks with high-bandwidth wires, and
                 do not require expensive multi-ported memory systems.
                  Assuming perfect prediction, our techniques achieve an
                  average of {$\sim$}37\% savings in total dynamic link
                  power consumption. With our current best prediction
                  mechanism, our techniques reduce dynamic power
                  consumption by {$\sim$}23\% on average.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, H (Reprint Author), Texas A\&M Univ, Dept Elect
                 \& Comp Engn, College Stn, TX 77843 USA. Kim, Hyungjun;
                 Gratz, Paul V., Texas A\&M Univ, Dept Elect \& Comp
                 Engn, College Stn, TX 77843 USA.",
  author-email = "hyungjuk@tamu.edu pgratz@tamu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "731BP",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; cache fills; cache interconnect; Cache
                 memories; cache spills; cache storage; CMP
                 interconnect; computer system design; Delay; dynamic
                 power; Energy consumption; energy consumption; flit
                 encoding; integrated circuit design; Interconnection
                 architectures; Low-power design; memory system;
                 microprocessor chips; Multicore; Multiprocessor
                 interconnection networks; Network-on-a-chip; NoC; power
                 aware computing; Power engineering computing; power
                 reduction; Power system interconnection; Random access
                 memory; total dynamic link power consumption; unused
                 cache block words; Very large scale integration;
                 Wires",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2010:LUC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Front and back matter for IEEE CAL volume 9, number 1 (2010):
%%% editorial board (cover 2), front cover, author information
%%% (cover 3), and society page (cover 4).
@Article{Anonymous:2010:EBCa,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:FCa,
  author =       "Anonymous",
  title =        "[Front cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:IAa,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Editor-in-Chief's letter opening IEEE CAL volume 9, number 2.
%%% NOTE(review): entry Skadron:2010:U (below) records the same DOI,
%%% volume, number, and page range under the placeholder title
%%% ``Untitled''; probable duplicate from a second database source.
@Article{Skadron:2010:ELE,
  author =       "K. Skadron",
  title =        "Editorial: Letter from the {Editor-in-Chief}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "37--44",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Skadron:2010:U,
  author =       "Kevin Skadron",
  title =        "Untitled",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "37--44",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Skadron:2010:U",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
  xxnote =       "Check: apparent duplicate of entry Skadron:2010:ELE
                  (identical DOI, volume, number, and pages); this
                  record came from a second database and carries the
                  placeholder title ``Untitled''.",
}

%%% NOTE(review): removed a verbatim repeat of the keyword
%%% ``Performance Evaluation'' (it appeared twice with identical
%%% casing); the case-variant ``Performance evaluation'' is kept,
%%% matching this file's merged-keyword convention.
@Article{Iqbal:2010:POS,
  author =       "Syed Muhammad Zeeshan Iqbal and Yuchen Liang and Hakan
                  Grahn",
  title =        "{ParMiBench} --- An Open-Source Benchmark for Embedded
                  Multiprocessor Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "45--48",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/gnu.bib;
                  http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Multicore processors are the main computing platform
                  in laptops, desktop, and servers today, and are making
                  their way into the embedded systems market also. Using
                  benchmarks is a common approach to evaluate the
                  performance of a system. However, benchmarks for
                  embedded systems have so far been either targeted for a
                  uni-processor environment, e.g., MiBench, or have been
                  commercial, e.g., MultiBench by EEMBC. In this paper,
                  we propose and implement an open source benchmark,
                  ParMiBench, targeted for multiprocessor-based embedded
                  systems. ParMiBench consists of parallel
                  implementations of seven compute intensive algorithms
                  from the uni-processor benchmark suite MiBench. The
                  applications are selected from four domains: Automation
                  and Industry Control, Network, Office, and Security.",
  acknowledgement = ack-nhfb,
  affiliation =  "Iqbal, SMZ (Reprint Author), Blekinge Inst Technol,
                  Sch Comp, SE-37179 Karlskrona, Sweden. Iqbal, Syed
                  Muhammad Zeeshan; Liang, Yuchen; Grahn, Hakan, Blekinge
                  Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden.",
  author-email = "mzeeshan01@gmail.com yuchen9760@gmail.com
                  hakan.grahn@bth.se",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "benchmark testing; Benchmark testing; Concurrent
                  Programming; desktop; embedded multiprocessor system;
                  Embedded system; embedded system market; embedded
                  systems; intensive algorithm; laptop; Load management;
                  Multicore processing; multiprocessing systems;
                  Multiprocessor Systems; open-source benchmark; parallel
                  architectures; parallel implementation; ParMiBench;
                  Performance Evaluation; Performance evaluation;
                  Program processors; public domain software; Security;
                  uniprocessor benchmark suite",
  number-of-cited-references = "9",
  ORCID-numbers = "Grahn, Hakan/0000-0001-9947-1088",
  research-areas = "Computer Science",
  researcherid-numbers = "Grahn, Hakan/G-9720-2011",
  times-cited =  "32",
  unique-id =    "Iqbal:2010:POS",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% NOTE(review): fixed a stray space in the abstract:
%%% ``( near)'' --> ``(near)''.
@Article{Fang:2010:BRP,
  author =       "Zhen Fang and Erik G. Hallnor and Bin Li and Michael
                  Leddige and Donglai Dai and Seung Eun Lee and Srihari
                  Makineni and Ravi Iyer",
  title =        "{Boomerang}: Reducing Power Consumption of Response
                  Packets in {NoCs} with Minimal Performance Impact",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Most power reduction mechanisms for NoC channel
                  buffers rely on on-demand wakeup to transition from a
                  low-power state to the active state. Two drawbacks of
                  on-demand wakeup limit its effectiveness: (1)
                  performance impact caused by wakeup delays, and (2)
                  energy and area cost of sleep circuitry itself. What
                  makes the problem harder to solve is that solutions to
                  either problem tend to exacerbate the other. For
                  example, faster wakeup from a power-gated state
                  requires greater charge/discharge current for the sleep
                  transistors while using nimbler sleep transistors
                  implies long wakeup delays. As a result, powerdowns
                  have to be conservatively prescribed, missing many
                  power-saving opportunities. We propose Boomerang, a
                  novel power-saving method that overcomes the above
                  drawbacks. Specifically, based on the observation that
                  a response is always preceded by a request, we let the
                  request trigger wakeup of the buffer that is to be used
                  by its response in the (near) future, instead of using
                  on-demand wakeups. Hiding the wakeup delay completely,
                  Boomerang allows us to employ aggressive sleep policies
                  and use low-cost power gating circuits on response
                  buffers.",
  acknowledgement = ack-nhfb,
  affiliation =  "Fang, Z (Reprint Author), Intel Corp, Santa Clara, CA
                  95051 USA. Fang, Zhen; Hallnor, Erik G.; Li, Bin;
                  Leddige, Michael; Dai, Donglai; Makineni, Srihari;
                  Iyer, Ravi, Intel Corp, Santa Clara, CA 95051 USA. Lee,
                  Seung Eun, Seoul Natl Univ Sci \& Technol, Seoul, South
                  Korea.",
  author-email = "zhen.fang@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Boomerang; buffer circuits; charge-discharge current;
                  Delay; Interconnection networks; Leakage currents;
                  leakage power; low-cost power gating circuits;
                  low-power design; Mobile communication;
                  network-on-chip; nimbler sleep transistors; NoC channel
                  buffers; packet-switching networks; power aware
                  computing; power consumption reduction mechanism;
                  power-gated state; power-saving method; response
                  packets; Routing; Switches; System-on-a-chip;
                  Transistors; wakeup delay",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Fang:2010:BRP",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% NOTE(review): the affiliation field lists only Lyons and Brooks;
%%% co-authors Hempstead (Drexel, per the author-email field) and Wei
%%% appear to be missing from it --- confirm against the paper.
@Article{Lyons:2010:ASF,
  author =       "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
                  and David Brooks",
  title =        "The Accelerator Store framework for high-performance,
                  low-power accelerator-based systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "53--56",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware acceleration can increase performance and
                  reduce energy consumption. To maximize these benefits,
                  accelerator-based systems that emphasize computation on
                  accelerators (rather than on general purpose cores)
                  should be used. We introduce the ``accelerator store,''
                  a structure for sharing memory between accelerators in
                  these accelerator-based systems. The accelerator store
                  simplifies accelerator I/O and reduces area by mapping
                  memory to accelerators when needed at runtime.
                  Preliminary results demonstrate a 30\% system area
                  reduction with no energy overhead and less than 1\%
                  performance overhead in contrast to conventional DMA
                  schemes.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lyons, MJ (Reprint Author), Harvard Univ, Sch Engn \&
                  Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael J.;
                  Brooks, David, Harvard Univ, Sch Engn \& Appl Sci,
                  Cambridge, MA 02138 USA.",
  author-email = "mjlyons@eecs.harvard.edu mhempstead@coe.drexel.edu
                  guyeon@eecs.harvard.edu dbrooks@eecs.harvard.edu",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [IIS-0926148];
                  Gigascale Systems Research Center",
  funding-text = "This material is based upon work supported by the
                  National Science Foundation under Grant No.
                  IIS-0926148. The authors acknowledge the support of the
                  Gigascale Systems Research Center, one of six research
                  centers funded under the Focus Center Research Program
                  (FCRP), a Semiconductor Research Corporation entity.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; accelerator store framework; energy
                  consumption; General; hardware acceleration;
                  Heterogeneous (hybrid) systems; high-performance
                  low-power accelerator-based system; low-power
                  electronics; memory architecture; Memory management;
                  memory mapping; memory sharing; Program processors;
                  Random access memory; Real time systems; Real-time and
                  embedded systems; shared memory systems; storage
                  management; Throughput; Transform coding",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "13",
  unique-id =    "Lyons:2010:ASF",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% NOTE(review): the keywords field holds case/punctuation variants
%%% (``network on chip'' / ``Network on Chip'' / ``Network-on-Chip'' /
%%% ``network-on-chip''), presumably from merged IEEE and Web of
%%% Science keyword lists; retained as-is per this file's convention.
@Article{Manevich:2010:CAR,
  author =       "Ran Manevich and Israel Cidon and Avinoam Kolodny and
                  Isask'har Walter",
  title =        "Centralized Adaptive Routing for {NoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "57--60",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As the number of applications and programmable units
                  in CMPs and MPSoCs increases, the Network-on-Chip (NoC)
                  encounters diverse and time dependent traffic loads.
                  This trend motivates the introduction of NoC
                  load-balanced, adaptive routing mechanisms that achieve
                  higher throughput as compared with traditional
                  oblivious routing schemes that are perceived better
                  suited for hardware implementations. However, an
                  efficient adaptive routing scheme should base its
                  decisions on the global state of the system rather than
                  on local or regional congestion signals as is common in
                  current adaptive routing schemes. In this paper we
                  introduce a novel paradigm of NoC centralized adaptive
                  routing, and a specific design for mesh topology. Our
                  scheme continuously monitors the global traffic load in
                  the network and modifies the routing of packets to
                  improve load balancing accordingly. In our specific
                  mesh-based design, XY or YX routes are adaptively
                  selected for each source-destination pair. We show that
                  while our implementation is scalable and lightweight in
                  hardware costs, it outperforms distributed adaptive
                  routing schemes in terms of load balancing and
                  throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Manevich, R (Reprint Author), Technion Israel Inst
                  Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                  Manevich, Ran; Cidon, Israel; Kolodny, Avinoam; Walter,
                  Isask'har, Technion Israel Inst Technol, Dept Elect
                  Engn, IL-32000 Haifa, Israel.",
  author-email = "ranman@tx.technion.ac.il cidon@ee.technion.ac.il
                  kolodny@ee.technion.ac.il zigi@tx.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "adaptive routing; Adaptive systems; centralized
                  adaptive routing; Computer architecture; distributed
                  adaptive routing; global state; load balanced adaptive
                  routing; load balancing; Load control; Load management;
                  mesh based design; mesh topology; network on chip;
                  Network on Chip; network routing; Network-on-Chip;
                  network-on-chip; NoC; packet routing; programmable
                  unit; regional congestion signal; routing algorithms;
                  Routing protocols; Telecommunication traffic;
                  Throughput; time dependent traffic load",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Manevich:2010:CAR",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Letters article on the Fractally Consistent Model for verifiable
%%% memory-consistency design (v9n2, pp. 61--64); record includes
%%% Web of Science fields and an open-access tag (oa).
@Article{Zhang:2010:FCA,
  author =       "Meng Zhang and Alvin R. Lebeck and Daniel J. Sorin",
  title =        "Fractal Consistency: Architecting the Memory System to
                  Facilitate Verification",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "61--64",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "One of the most challenging problems in developing a
                  multicore processor is verifying that the design is
                  correct, and one of the most difficult aspects of
                  pre-silicon verification is verifying that the memory
                  system obeys the architecture's specified memory
                  consistency model. To simplify the process of
                  pre-silicon design verification, we propose a system
                  model called the Fractally Consistent Model (FCM). We
                  prove that systems that adhere to the FCM can be
                  verified to obey the memory consistency model in three
                  simple, scalable steps. The procedure for verifying FCM
                  systems contrasts sharply with the difficult,
                  non-scalable procedure required to verify non-FCM
                  systems. We show that FCM systems do not necessarily
                  sacrifice performance, compared to non-FCM systems,
                  despite being simpler to verify.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhang, M (Reprint Author), Duke Univ, Dept Elect \&
                  Comp Engn, Durham, NC 27706 USA. Zhang, Meng; Sorin,
                  Daniel J., Duke Univ, Dept Elect \& Comp Engn, Durham,
                  NC 27706 USA. Lebeck, Alvin R., Duke Univ, Dept Comp
                  Sci, Durham, NC 27706 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "731BX",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-0702434,
                  CCF-0811290]",
  funding-text = "This material is based upon work supported by the
                  National Science Foundation under grants CCF-0702434
                  and CCF-0811290.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arithmetic and Logic Structures; Coherence;
                  Computational modeling; Computer architecture; Computer
                  Reliability; Fault-Tolerance; FCM systems; Formal
                  verification; fractal consistent model; Fractals;
                  Hardware; Memory; memory architecture; Memory
                  Consistency; memory consistency model; Memory
                  hierarchy; memory system architecture;
                  Micro-architecture implementation considerations;
                  microprocessor chips; Multicore; multicore processor;
                  multiprocessing systems; Performance Analysis and
                  Design Aids; presilicon verification; Processor
                  Architectures; Protocols; Testing; Validation;
                  Verification",
  number-of-cited-references = "10",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Zhang:2010:FCA",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Remaining front and back matter for IEEE CAL volume 9, number 2
%%% (2010): advertisements, society notices, covers, and author
%%% information.
@Article{Anonymous:2010:AIT,
  author =       "Anonymous",
  title =        "Advertisement --- {{\booktitle{IEEE Transactions on
                  Computers}}} Celebrates 60 Years",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "65--65",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.29",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSb,
  author =       "Anonymous",
  title =        "2011 {IEEE Computer Society} Simulator Design
                  Competition",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "66--66",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ASS,
  author =       "Anonymous",
  title =        "Advertisement --- Special Student Offer",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "67--67",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.30",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ADY,
  author =       "Anonymous",
  title =        "Advertisement --- Distinguish Yourself With the
                  {CSDP}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "68--68",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:CPS,
  author =       "Anonymous",
  title =        "{Conference Proceedings Services (CPS)}
                  [advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "69--69",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSc,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} Jobs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "70--70",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.28",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ASC,
  author =       "Anonymous",
  title =        "Advertisement --- Stay Connected to the {IEEE Computer
                  Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "71--71",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.31",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ACS,
  author =       "Anonymous",
  title =        "Advertisement --- {Computer Society Digital Library}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "72--72",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:EBCb,
  author =       "Anonymous",
  title =        "Editorial Board [Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:FCb,
  author =       "Anonymous",
  title =        "[Front cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c1--c1",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:IAb,
  author =       "Anonymous",
  title =        "Information for authors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2010:ICSd,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Cover4]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "9",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2010",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2010.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Editorial letter from the Editor-in-Chief opening volume 10,
%%% number 1 (2011).  NOTE(review): entry Skadron:2011:U carries the
%%% same DOI, volume, number, and pages; the two records appear to
%%% describe the same item -- verify against the published issue.
@Article{Skadron:2011:ELE,
  author =       "K. Skadron",
  title =        "Editorial: Letter from the {Editor-in-Chief}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "1--3",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% NOTE(review): this Web of Science-derived record (placeholder title
%%% ``Untitled'') shares DOI 10.1109/L-CA.2011.13, volume 10, number 1,
%%% and pages 1--3 with entry Skadron:2011:ELE; the two records appear
%%% to be duplicates of the same editorial.  A remark field below flags
%%% this for the next database pass.
@Article{Skadron:2011:U,
  author =       "Kevin Skadron",
  title =        "Untitled",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "1--3",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  number-of-cited-references = "0",
  remark =       "Probable duplicate of entry Skadron:2011:ELE: both
                  records carry DOI 10.1109/L-CA.2011.13 and pages 1--3
                  of volume 10, number 1, but this one has placeholder
                  title ``Untitled''.",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Skadron:2011:U",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter: survey and analysis of fairness metrics for
%%% multi-threaded (SMT) processors (v10n1, pp. 4--7).
%%% Fix: restore the acute accent in ``Andr{\'e} Seznec'' using the
%%% brace-wrapped special-character form required by this file's
%%% ISO/ASCII codetable convention (see file header).  The raw Web of
%%% Science affiliation/author-email fields are left as delivered.
@Article{Vandierendonck:2011:FMM,
  author =       "Hans Vandierendonck and Andr{\'e} Seznec",
  title =        "Fairness Metrics for Multi-Threaded Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "4--7",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Multi-threaded processors execute multiple threads
                 concurrently in order to increase overall throughput.
                 It is well documented that multi-threading affects
                 per-thread performance but, more importantly, some
                 threads are affected more than others. This is
                 especially troublesome for multi-programmed workloads.
                 Fairness metrics measure whether all threads are
                 affected equally. However defining equal treatment is
                 not straightforward. Several fairness metrics for
                 multi-threaded processors have been utilized in the
                 literature, although there does not seem to be a
                 consensus on what metric does the best job of measuring
                 fairness. This paper reviews the prevalent fairness
                 metrics and analyzes their main properties. Each metric
                 strikes a different trade-off between fairness in the
                 strict sense and throughput. We categorize the metrics
                 with respect to this property. Based on experimental
                 data for SMT processors, we suggest using the minimum
                 fairness metric in order to balance fairness and
                 throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Vandierendonck, H (Reprint Author), Univ Ghent, Dept
                 Elect \& Informat Syst, Ghent, Belgium. Vandierendonck,
                 Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent,
                 Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.",
  author-email = "hans.vandierendonck@elis.ugent.be
                 Andre.Seznec@inria.fr",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Correlation; fairness; fairness metrics; Harmonic
                 analysis; Instruction sets; measurement; Measurement;
                 multi-programming; Multi-threaded processors;
                 multi-threading; multiprocessing systems;
                 multiprogrammed workloads; multithreaded processors;
                 Parallel Architectures; Performance of Systems;
                 quality-of-service; resource allocation; SMT
                 processors; software metrics; System-on-a-chip;
                 Throughput",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "13",
  unique-id =    "Vandierendonck:2011:FMM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter: energy-efficient data prefetching for embedded
%%% mobile systems, with an analytical energy-efficiency model
%%% (v10n1, pp. 8--11).
@Article{Tang:2011:PEM,
  author =       "Jie Tang and Shaoshan Liu and Zhimin Gu and Chen Liu
                 and Jean-Luc Gaudiot",
  title =        "Prefetching in Embedded Mobile Systems Can Be
                 Energy-Efficient",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "8--11",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Data prefetching has been a successful technique in
                 high-performance computing platforms. However, the
                 conventional wisdom is that they significantly increase
                 energy consumption, and thus not suitable for embedded
                 mobile systems. On the other hand, as modern mobile
                 applications pose an increasing demand for high
                 performance, it becomes essential to implement
                 high-performance techniques, such as prefetching, in
                 these systems. In this paper, we study the impact of
                 prefetching on the performance and energy consumption
                 of embedded mobile systems. Contrary to the
                 conventional wisdom, our findings demonstrate that as
                 technology advances, prefetching can be
                 energy-efficient while improving performance.
                 Furthermore, we have developed a simple but effective
                 analytical model to help system designers to identify
                 the conditions for energy efficiency.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tang, J (Reprint Author), Beijing Inst Technol,
                 Beijing 100081, Peoples R China. Tang, Jie; Gu, Zhimin,
                 Beijing Inst Technol, Beijing 100081, Peoples R China.
                 Liu, Shaoshan, Microsoft Corp, Redmond, WA 98052 USA.
                 Liu, Chen, Florida Int Univ, Miami, FL 33199 USA.
                 Gaudiot, Jean-Luc, Univ Calif Irvine, Irvine, CA USA.",
  author-email = "tangjie.bit@gmail.com shaoliu@microsoft.com
                 zmgu@x263.net chen.liu@fiu.edu gaudiot@uci.edu",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "data prefetching; embedded mobile systems; embedded
                 systems; energy consumption; energy efficiency
                 condition; energy-efficient prefetching;
                 high-performance computing platform; Low power
                 electronics; Low-power design; Memory management;
                 Memory Structures; mobile computing; Mobile computing;
                 Mobile Computing; storage management",
  number-of-cited-references = "11",
  ORCID-numbers = "Liu, Chen/0000-0003-1558-6836",
  research-areas = "Computer Science",
  times-cited =  "19",
  unique-id =    "Tang:2011:PEM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter: DCC, a dependable cache-coherence multicore
%%% architecture combining a directory protocol with execution
%%% migration (v10n1, pp. 12--15).
@Article{Khan:2011:DDC,
  author =       "Omer Khan and Mieszko Lis and Yildiz Sinangil and
                 Srinivas Devadas",
  title =        "{DCC}: A Dependable Cache Coherence Multicore
                 Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "12--15",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Cache coherence lies at the core of
                 functionally-correct operation of shared memory
                 multicores. Traditional directory-based hardware
                 coherence protocols scale to large core counts, but
                 they incorporate complex logic and directories to track
                 coherence states. Technology scaling has reached
                 miniaturization levels where manufacturing
                 imperfections, device unreliability and occurrence of
                 hard errors pose a serious dependability challenge.
                 Broken or degraded functionality of the coherence
                 protocol can lead to a non-operational processor or
                 user visible performance loss. In this paper, we
                 propose a dependable cache coherence architecture (DCC)
                 that combines the traditional directory protocol with a
                 novel execution-migration-based architecture to ensure
                 dependability that is transparent to the programmer.
                 Our architecturally redundant execution migration
                 architecture only permits one copy of data to be cached
                 anywhere in the processor: when a thread accesses an
                 address not locally cached on the core it is executing
                 on, it migrates to the appropriate core and continues
                 execution there. Both coherence mechanisms can co-exist
                 in the DCC architecture and we present architectural
                 extensions to seamlessly transition between the
                 directory and execution migration protocols.",
  acknowledgement = ack-nhfb,
  affiliation =  "Khan, O (Reprint Author), MIT, 77 Massachusetts Ave,
                 Cambridge, MA 02139 USA. Khan, Omer; Lis, Mieszko;
                 Sinangil, Yildiz; Devadas, Srinivas, MIT, Cambridge, MA
                 02139 USA. Khan, Omer, Univ Massachusetts, Lowell, MA
                 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architecturally redundant execution migration
                 architecture; B.3.4 Reliability, Testing, and
                 Fault-Tolerance; B.8 Performance and Reliability;
                 broken functionality; C.4.b Fault tolerance; cache
                 coherence; cache storage; Coherence; coherence
                 mechanisms; coherence states; DCC architecture;
                 degraded functionality; dependability challenge;
                 Dependable architecture; dependable cache coherence
                 architecture; dependable cache coherence multicore
                 architecture; device unreliability; directory protocol;
                 directory-based hardware coherence protocols;
                 execution-migration-based architecture;
                 functionally-correct operation; Hardware; incorporate
                 complex logic; Instruction sets; large core counts;
                 manufacturing imperfections; memory architecture;
                 memory protocols; microprocessor chips; miniaturization
                 levels; Multicore processing; multicores;
                 nonoperational processor; Protocols; shared memory
                 multicores; shared memory systems; System-on-a-chip;
                 technology scaling; user visible performance loss",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Khan:2011:DDC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter: DRAMSim2, a cycle-accurate DDR2/3 memory-system
%%% simulator (v10n1, pp. 16--19).  Carries extra Web of Science
%%% citation fields (esi-highly-cited-paper, times-cited = 270).
@Article{Rosenfeld:2011:DCA,
  author =       "Paul Rosenfeld and Elliott Cooper-Balis and Bruce
                 Jacob",
  title =        "{DRAMSim2}: A Cycle Accurate Memory System Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "16--19",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper we present DRAMSim2, a cycle accurate
                 memory system simulator. The goal of DRAMSim2 is to be
                 an accurate and publicly available DDR2/3 memory system
                 model which can be used in both full system and
                 trace-based simulations. We describe the process of
                 validating DRAMSim2 timing against manufacturer Verilog
                 models in an effort to prove the accuracy of simulation
                 results. We outline the combination of DRAMSim2 with a
                 cycle-accurate x86 simulator that can be used to
                 perform full system simulations. Finally, we discuss
                 DRAMVis, a visualization tool that can be used to graph
                 and compare the results of DRAMSim2 simulations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rosenfeld, P (Reprint Author), Univ Maryland, Dept
                 Elect \& Comp Engn, College Pk, MD 20742 USA.
                 Rosenfeld, Paul; Cooper-Balis, Elliott; Jacob, Bruce,
                 Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD
                 20742 USA.",
  author-email = "prosenf1@umd.edu ecc17@umd.edu blj@umd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  eissn =        "1556-6064",
  esi-highly-cited-paper = "Y",
  esi-hot-paper = "N",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; cycle accurate memory system
                 simulator; DDR2/3 memory system model; DRAM; DRAM
                 chips; DRAMSim2 simulation; DRAMSim2 timing; Driver
                 circuits; Hardware design languages; Load modeling;
                 memory architecture; memory cards; Object oriented
                 modeling; Primary memory; Random access memory;
                 Simulation; Timing; trace-based simulation; Verilog
                 model; visualization tool",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "270",
  unique-id =    "Rosenfeld:2011:DCA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter: exploiting SPMD ``horizontal locality'' to improve
%%% DRAM efficiency on manycore accelerators/GPUs (v10n1, pp. 20--23).
@Article{Gou:2011:ESH,
  author =       "Chunyang Gou and Georgi N. Gaydadjiev",
  title =        "Exploiting {SPMD} Horizontal Locality",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "20--23",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this paper, we analyze a particular spatial
                 locality case (called horizontal locality) inherent to
                 manycore accelerator architectures employing barrel
                 execution of SPMD kernels, such as GPUs. We then
                 propose an adaptive memory access granularity framework
                 to exploit and enforce the horizontal locality in order
                 to reduce the interferences among accelerator cores
                 memory accesses and hence improve DRAM efficiency. With
                 the proposed technique, DRAM efficiency grows by 1.42X
                 on average, resulting in 12.3\% overall performance
                 gain, for a set of representative memory intensive
                 GPGPU applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gou, C (Reprint Author), Delft Univ Technol, NL-2600
                 AA Delft, Netherlands. Gou, Chunyang; Gaydadjiev,
                 Georgi N., Delft Univ Technol, NL-2600 AA Delft,
                 Netherlands.",
  author-email = "c.gou@tudelft.nl g.n.gaydadjiev@tudelft.nl",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerator core memory access; adaptive memory access
                 granularity; Bandwidth; barrel execution; DRAM chips;
                 DRAM efficiency; GPU; Graphics processing unit;
                 Instruction sets; interference; Kernel; manycore
                 accelerator architecture; Memory hierarchy;
                 microprocessor chips; Multi-core/single-chip
                 multiprocessors; parallel architectures; Pipelines;
                 Proposals; Random access memory; SIMD processors;
                 single program multiple data; spatial locality; SPMD
                 horizontal locality; SPMD kernel",
  number-of-cited-references = "13",
  ORCID-numbers = "Gaydadjiev, Georgi/0000-0002-3678-7007",
  research-areas = "Computer Science",
  researcherid-numbers = "Gaydadjiev, Georgi/F-1488-2010",
  times-cited =  "1",
  unique-id =    "Gou:2011:ESH",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Research letter: GCMS, a global contention-management scheme for
%%% hardware transactional memory (v10n1, pp. 24--27).
@Article{Wang:2011:GGC,
  author =       "Xiaoqun Wang and Zhenzhou Ji and Chen Fu and Mingzeng
                 Hu",
  title =        "{GCMS}: A Global Contention Management Scheme in
                 Hardware Transactional Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "24--27",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware Transactional Memory (HTM) is a promising
                 Transactional Memory (TM) implementation because of its
                 strong atomicity and high performance. Unfortunately,
                 most contention management approaches in HTMs are
                 dedicated to specific transaction conflict scenarios
                 and it is hard to choose a universal strategy for
                 different workloads. In addition, HTM performance
                 degrades sharply when there are severe transaction
                 conflicts. In this paper, we present a Global
                 Contention Management Scheme (GCMS) to resolve severe
                 transaction conflicts in HTMs. Our scheme depends on a
                 Deadlock and Livelock Detection Mechanism (DLDM) and a
                 Global Contention Manager (GCM) to resolve severe
                 transaction conflicts. This scheme is orthogonal to the
                 rest of the contention management policies. We have
                 incorporated GCMS into different HTMs and compared the
                 performance of the enhanced systems with that of the
                 original HTMs with the STAMP benchmark suite. The
                 results demonstrate that the performance of the
                 enhanced HTMs is improved.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, XQ (Reprint Author), Harbin Inst Technol, Sch
                 Comp Sci, Harbin 150006, Peoples R China. Wang,
                 Xiaoqun; Ji, Zhenzhou; Fu, Chen; Hu, Mingzeng, Harbin
                 Inst Technol, Sch Comp Sci, Harbin 150006, Peoples R
                 China.",
  author-email = "wxiaoqun@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "773ZN",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bioinformatics; Concurrent Programming; Contention
                 Management; deadlock-and-livelock detection mechanism;
                 GCMS scheme; Genomics; global contention management
                 scheme; global contention manager; Hardware; Hardware
                 Transactional Memory; hardware transactional memory;
                 Multi-core/single-chip multiprocessors; Multicore
                 Processors; Parallel Programming; Program processors;
                 Radiation detectors; storage management; System
                 recovery; transaction conflict; transaction
                 processing",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Wang:2011:GGC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Front matter: list of 2010 reviewers, published in v10n1 (2011).
@Article{Anonymous:2011:RL,
  author =       "Anonymous",
  title =        "2010 Reviewers List",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "28--28",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "IEEE publishing",
}

%%% Back matter: 2010 annual index, published in v10n1 (2011).
%%% NOTE(review): pages = "??--??" is this file's placeholder for an
%%% unknown page range -- TODO confirm the pages from the issue.
@Article{Anonymous:2011:AI,
  author =       "Anonymous",
  title =        "2010 Annual Index",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "??--??",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter: inside front cover (c2) of v10n1 (2011).
@Article{Anonymous:2011:Ca,
  author =       "Anonymous",
  title =        "Cover 2",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter: inside back cover (c3) of v10n1 (2011).
@Article{Anonymous:2011:Cb,
  author =       "Anonymous",
  title =        "Cover 3",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter: outside back cover (c4) of v10n1 (2011).
@Article{Anonymous:2011:Cc,
  author =       "Anonymous",
  title =        "Cover 4",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front matter: front cover (c1) of v10n1 (2011).
@Article{Anonymous:2011:FCa,
  author =       "Anonymous",
  title =        "[Front cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Research letter: exploiting microarchitectural heterogeneity in
%%% warehouse-scale computers; first entry of v10n2 (July/December
%%% 2011, pp. 29--32).
@Article{Mars:2011:HHW,
  author =       "Jason Mars and Lingjia Tang and Robert Hundt",
  title =        "Heterogeneity in ``Homogeneous'' Warehouse-Scale
                 Computers: A Performance Opportunity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "29--32",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The class of modern datacenters recently coined as
                 ``warehouse scale computers'' (WSCs) has traditionally
                 been embraced as homogeneous computing platforms.
                 However, due to frequent machine replacements and
                 upgrades, modern WSCs are in fact composed of diverse
                 commodity microarchitectures and machine
                 configurations. Yet, current WSCs are designed with an
                 assumption of homogeneity, leaving a potentially
                 significant performance opportunity unexplored. In this
                 paper, we investigate the key factors impacting the
                 available heterogeneity in modern WSCs, and the benefit
                 of exploiting this heterogeneity to maximize overall
                 performance. We also introduce a new metric,
                 opportunity factor, which can be used to quantify an
                 application's sensitivity to the heterogeneity in a
                 given WSC. For applications that are sensitive to
                 heterogeneity, we observe a performance improvement of
                 up to 70\% when employing our approach. In a WSC
                 composed of state-of-the-art machines, we can improve
                 the overall performance of the entire datacenter by
                 16\% over the status quo.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mars, J (Reprint Author), Univ Virginia,
                 Charlottesville, VA 22903 USA. Mars, Jason; Tang,
                 Lingjia, Univ Virginia, Charlottesville, VA 22903
                 USA.",
  author-email = "jom5x@cs.virginia.edu lt8f@cs.virginia.edu
                 rhundt@google.com",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; computer
                 centres; datacenters; Design studies; Distributed
                 architectures; diverse commodity microarchitectures;
                 Heterogeneous (hybrid) systems; homogeneous
                 warehouse-scale computers; integration and modeling;
                 machine configurations; mainframes; Microarchitecture;
                 Optimization; Scheduling and task partitioning; Super
                 (very large) computers; System architectures",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "22",
  unique-id =    "Mars:2011:HHW",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Complete Web-of-Science-derived record: IEEE Computer Architecture
%%% Letters 10(2):33--36, July\slash December 2011; DOI
%%% 10.1109/L-CA.2011.15.  Entry text is unchanged.
@Article{Michelogiannakis:2011:PCE,
  author =       "George Michelogiannakis and Nan Jiang and Daniel U.
                  Becker and William J. Dally",
  title =        "Packet Chaining: Efficient Single-Cycle Allocation for
                  On-Chip Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "33--36",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/hash.bib;
                  http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper introduces packet chaining, a simple and
                  effective method to increase allocator matching
                  efficiency and hence network performance, particularly
                  suited to networks with short packets and short cycle
                  times. Packet chaining operates by chaining packets
                  destined to the same output together, to reuse the
                  switch connection of a departing packet. This allows an
                  allocator to build up an efficient matching over a
                  number of cycles, like incremental allocation, but not
                  limited by packet length. For a 64-node 2D mesh at
                  maximum injection rate and with single-flit packets,
                  packet chaining increases network throughput by 15\%
                  compared to a conventional single-iteration separable
                  iSLIP allocator, outperforms a wavefront allocator, and
                  gives comparable throughput with an augmenting paths
                  allocator. Packet chaining achieves this performance
                  with a cycle time comparable to a single-iteration
                  separable allocator. Packet chaining also reduces
                  average network latency by 22.5\% compared to iSLIP.
                  Finally, packet chaining increases IPC up to 46\% (16\%
                  average) for application benchmarks because short
                  packets are critical in a typical cache-coherent CMP.
                  These are considerable improvements given the maturity
                  of network-on-chip routers and allocators.",
  acknowledgement = ack-nhfb,
  affiliation =  "Michelogiannakis, G (Reprint Author), Stanford Univ,
                  Stanford, CA 94305 USA. Michelogiannakis, George;
                  Jiang, Nan; Becker, Daniel U.; Dally, William J.,
                  Stanford Univ, Stanford, CA 94305 USA.",
  author-email = "mihelog@stanford.edu njiang37@stanford.edu
                  dub@stanford.edu dally@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-0702341];
                  National Security Agency [H98230-08-C-0272-P007];
                  Robert Bosch Fellowship; Prof. Michael Farmwald
                  Fellowship; Prof. Michael J. Flynn Stanford Graduate
                  Fellowship",
  funding-text = "This work was supported in part by the National
                  Science Foundation under Grant CCF-0702341, in part by
                  the National Security Agency under Contract
                  H98230-08-C-0272-P007 and in part by the Robert Bosch,
                  Prof. Michael Farmwald and Prof. Michael J. Flynn
                  Stanford Graduate Fellowships.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "allocator matching efficiency; Benchmark testing;
                  Interconnection architectures; network performance;
                  network-on-chip; network-on-chip routers; On-chip
                  interconnection networks; on-chip networks; packet
                  chaining; Resource management; single-iteration
                  separable iSLIP allocator; System-on-a-chip;
                  Throughput",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Michelogiannakis:2011:PCE",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% NOTE(review): corrected two apparent typographical errors in the
%%% abstract field (``in the filed'' -> ``in the field''; ``types of
%%% wear-out creates'' -> ``create'').  All other fields unchanged.
@Article{Ho:2011:EIB,
  author =       "Chen-Han Ho and Garret Staus and Aaron Ulmer and
                  Karthikeyan Sankaralingam",
  title =        "Exploring the Interaction Between Device Lifetime
                  Reliability and Security Vulnerabilities",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "37--40",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                  http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As technology scales, device reliability is becoming a
                  fundamental problem. Even though manufacture test can
                  guarantee product quality, due to various types of
                  wearout and failure modes, permanent faults appearing
                  in the field is becoming an increasingly important and
                  real problem. Such types of wear-out create permanent
                  faults in devices after release to the user during
                  their lifetime. In this paper, we perform a formal
                  investigation of the impact of permanent faults on
                  security, examine empirical evidence, and demonstrate a
                  real attack. Our results show that permanent stuck-at
                  faults may leave security holes in microprocessors. We
                  show that an adversary with knowledge of a fault can
                  launch attacks which can obtain critical secrets such
                  as a private key in 30 seconds.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ho, CH (Reprint Author), Univ Wisconsin, Madison, WI
                  53706 USA. Ho, Chen-Han; Staus, Garret; Ulmer, Aaron;
                  Sankaralingam, Karthikeyan, Univ Wisconsin, Madison, WI
                  53706 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arithmetic and Logic Structures; Circuit faults;
                  Computer bugs; Control Structures and Microprogramming;
                  Cryptography; device lifetime reliability; failure
                  mode; fault tolerant computing; Hardware reliability;
                  Logic programming; microprocessor chips;
                  microprocessors; Permanent Fault; permanent fault;
                  private key; product quality; Program processors;
                  public key cryptography; Reliability; Reliability
                  engineering; Security; security vulnerability; wear-out
                  type; wearout mode",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Ho:2011:EIB",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% NOTE(review): removed an exact duplicate of the phrase ``Fault
%%% Tolerance'' from the keywords field (a Web-of-Science export
%%% artifact); the case-variant pair is retained per file convention.
%%% All other fields unchanged.
@Article{Hernandez:2011:FTV,
  author =       "Carles Hernandez and Antoni Roca and Jose Flich and
                  Federico Silla and Jose Duato",
  title =        "Fault-Tolerant Vertical Link Design for Effective 3D
                  Stacking",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "41--44",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Recently, 3D stacking has been proposed to alleviate
                  the memory bandwidth limitation arising in chip
                  multiprocessors (CMPs). As the number of integrated
                  cores in the chip increases the access to external
                  memory becomes the bottleneck, thus demanding larger
                  memory amounts inside the chip. The most accepted
                  solution to implement vertical links between stacked
                  dies is by using Through Silicon Vias (TSVs). However,
                  TSVs are exposed to misalignment and random defects
                  compromising the yield of the manufactured 3D chip. A
                  common solution to this problem is by
                  over-provisioning, thus impacting on area and cost. In
                  this paper, we propose a fault-tolerant vertical link
                  design. With its adoption, fault-tolerant vertical
                  links can be implemented in a 3D chip design at low
                  cost without the need of adding redundant TSVs (no
                  over-provision). Preliminary results are very promising
                  as the fault-tolerant vertical link design increases
                  switch area only by 6.69\% while the achieved
                  interconnect yield tends to 100\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Hernandez, C (Reprint Author), Univ Politecn Valencia,
                  C Cami de Vera S-N, Valencia 46022, Spain. Hernandez,
                  Carles; Roca, Antoni; Flich, Jose; Silla, Federico;
                  Duato, Jose, Univ Politecn Valencia, Valencia 46022,
                  Spain.",
  author-email = "carherlu@gap.upv.es",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish MEC; MICINN; European Commission
                  [CSD2006-00046, TIN2009-14475-C04]; NaNoC [248972]",
  funding-text = "This work was supported by the Spanish MEC and MICINN,
                  as well as European Commission FEDER funds, under
                  Grants CSD2006-00046 and TIN2009-14475-C04. It was also
                  partly supported by the project NaNoC (project label
                  248972) which is funded by the European Commission
                  within the Research Programme FP7.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D chip; 3D Stacking; 3D stacking; chip
                  multiprocessors; CMP; effective 3D stacking; external
                  memory; Fault Tolerance; fault tolerance; Fault
                  tolerant systems; fault-tolerant vertical link design;
                  memory bandwidth limitation; Memory management;
                  microprocessor chips; network-on-chip; NoC; Stacking;
                  storage management chips; Three dimensional displays;
                  three-dimensional integrated circuits; through silicon
                  vias; TSV",
  number-of-cited-references = "20",
  oa =           "Green Published",
  ORCID-numbers = "Silla, Federico/0000-0002-6435-1200 Hernandez,
                  Carles/0000-0001-5393-3195",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Hernandez:2011:FTV",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Complete Web-of-Science-derived record: IEEE Computer Architecture
%%% Letters 10(2):45--48, July\slash December 2011; DOI
%%% 10.1109/L-CA.2011.18.  Entry text is unchanged.
@Article{Choi:2011:EID,
  author =       "Inseok Choi and Minshu Zhao and Xu Yang and Donald
                  Yeung",
  title =        "Experience with Improving Distributed Shared Cache
                  Performance on {Tilera}'s {Tile} Processor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "45--48",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes our experience with profiling and
                  optimizing physical locality for the distributed shared
                  cache (DSC) in Tilera's Tile multicore processor. Our
                  approach uses the Tile Processor's hardware performance
                  measurement counters (PMCs) to acquire page-level
                  access pattern profiles. A key problem we address is
                  imprecise PMC interrupts. Our profiling tools use
                  binary analysis to correct for interrupt ``skid'', thus
                  pinpointing individual memory operations that incur
                  remote DSC slice references and permitting us to sample
                  their access patterns. We use our access pattern
                  profiles to drive page homing optimizations for both
                  heap and static data objects. Our experiments show we
                  can improve physical locality for 5 out of 11 SPLASH2
                  benchmarks running on 32 cores, enabling 32.9\%-77.9\%
                  of DSC references to target the local DSC slice. To our
                  knowledge, this is the first work to demonstrate page
                  homing optimizations on a real system.",
  acknowledgement = ack-nhfb,
  affiliation =  "Choi, I (Reprint Author), Univ Maryland, Dept Elect \&
                  Comp Engn, College Pk, MD 20742 USA. Choi, Inseok;
                  Zhao, Minshu; Yang, Xu; Yeung, Donald, Univ Maryland,
                  Dept Elect \& Comp Engn, College Pk, MD 20742 USA.",
  author-email = "inseok@umd.edu mszhao@umd.edu yangxu@umd.edu
                  yeung@umd.edu",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; binary analysis; cache storage;
                  Computer architecture; Data streams; Design
                  methodology; Design studies; distributed shared cache
                  performance; hardware performance measurement counters;
                  microprocessor chips; Multi-core/single-chip
                  multiprocessors; Multicore processing; Multiple Data
                  Stream Architectures (Multiprocessors); multiprocessing
                  systems; Multiprocessing systems; page homing
                  optimization; page-level access pattern profile; PMC
                  interrupt; profiling tool; Tilera tile multicore
                  processor",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Choi:2011:EID",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Complete Web-of-Science-derived record: IEEE Computer Architecture
%%% Letters 10(2):49--52, July\slash December 2011; DOI
%%% 10.1109/L-CA.2011.20.  Entry text is unchanged.
@Article{Prieto:2011:MCM,
  author =       "Pablo Prieto and Valentin Puente and Jose-Angel
                  Gregorio",
  title =        "Multilevel Cache Modeling for Chip-Multiprocessor
                  Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                  http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "This paper presents a simple analytical model for
                  predicting on-chip cache hierarchy effectiveness in
                  chip multiprocessors (CMP) for a state-of-the-art
                  architecture. Given the complexity of this type of
                  systems, we use rough approximations, such as the
                  empirical observation that the re-reference timing
                  pattern follows a power law and the assumption of a
                  simplistic delay model for the cache, in order to
                  provide a useful model for the memory hierarchy
                  responsiveness. This model enables the analytical
                  determination of average access time, which makes
                  design space pruning useful before sweeping the vast
                  design space of this class of systems. The model is
                  also useful for predicting cache hierarchy behavior in
                  future systems. The fidelity of the model has been
                  validated using a state-of-the-art, full-system
                  simulation environment, on a system with up to sixteen
                  out-of-order processors with cache-coherent caches and
                  using a broad spectrum of applications, including
                  complex multithread workloads. This simple model can
                  predict a near-to-optimal, on-chip cache distribution
                  while also estimating how future systems running future
                  applications might behave.",
  acknowledgement = ack-nhfb,
  affiliation =  "Prieto, P (Reprint Author), Univ Cantabria, Cantabria,
                  Spain. Prieto, Pablo; Puente, Valentin; Gregorio,
                  Jose-Angel, Univ Cantabria, Cantabria, Spain.",
  author-email = "prietop@unican.es vpuente@unican.es
                  monaster@unican.es",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Science and Innovation
                  [TIN2010-18159]; HiPEAC2 European Network of
                  Excellence",
  funding-text = "This work has been supported by the Spanish Ministry
                  of Science and Innovation, under contracts
                  TIN2010-18159, and by the HiPEAC2 European Network of
                  Excellence. The authors would like to thank the
                  reviewers for their valuable comments.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "approximation theory; cache hierarchy behavior
                  prediction; cache storage; Cache storage;
                  cache-coherent caches; chip-multiprocessor systems;
                  complex multithread workloads; Complexity theory;
                  Computational modeling; design space; integrated
                  circuit design; Memory hierarchy; memory hierarchy
                  responsiveness; microprocessor chips;
                  Multi-core/single-chip multiprocessors; multilevel
                  cache modeling; multiprocessing systems;
                  Multiprocessing systems; near-to-optimal on-chip cache
                  distribution; on-chip cache hierarchy effectiveness
                  prediction; power law; re-reference timing pattern;
                  rough approximations; simplistic delay model
                  assumption; Software tools; Thermal analysis; Thermal
                  sensors",
  number-of-cited-references = "13",
  ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente,
                  Valentin/0000-0002-6904-3282 Gregorio, Jose
                  Angel/0000-0003-2214-303X",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Prieto:2011:MCM",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Complete Web-of-Science-derived record: IEEE Computer Architecture
%%% Letters 10(2):53--56, July\slash December 2011; DOI
%%% 10.1109/L-CA.2011.19.  Entry text is unchanged.
@Article{Siozios:2011:SRT,
  author =       "Kostas Siozios and Dimitrios Rodopoulos and Dimitrios
                  Soudris",
  title =        "On Supporting Rapid Thermal Analysis",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "10",
  number =       "2",
  pages =        "53--56",
  month =        jul # "\slash " # dec,
  year =         "2011",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Detailed thermal analysis is usually performed
                  exclusively at design time since it is a
                  computationally intensive task. In this paper, we
                  introduce a novel methodology for fast, yet accurate,
                  thermal analysis. The introduced methodology is
                  software supported by a new open source tool that
                  enables hierarchical thermal analysis with adaptive
                  levels of granularity. Experimental results prove the
                  efficiency of our approach since it leads to average
                  reduction of the execution overhead up to 70\% with a
                  penalty in accuracy ranging between 2\% and 8\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Siozios, K (Reprint Author), Natl Tech Univ Athens,
                  Sch ECE, GR-10682 Athens, Greece. Siozios, Kostas;
                  Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
                  Univ Athens, Sch ECE, GR-10682 Athens, Greece.",
  da =           "2019-06-20",
  doc-delivery-number = "855NW",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Complexity theory; Computational modeling; Computer
                  Systems Organization; Design Methodologies; General;
                  Hardware; hierarchical thermal analysis; Modeling
                  techniques; Monitoring; open source tool; Performance
                  of Systems; Power Management; public domain software;
                  rapid thermal analysis; Reconfigurable Hardware;
                  Reconfigurable hardware; Reliability; software
                  engineering; software supported; Software tools;
                  thermal analysis; Thermal analysis; Thermal Monitoring;
                  Thermal sensors",
  number-of-cited-references = "8",
  ORCID-numbers = "Siozios, Kostas/0000-0002-0285-2202 Soudris,
                  Dimitrios/0000-0002-6930-6847",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/I-5252-2014 Siozios,
                  Kostas/F-9726-2011 Soudris, Dimitrios/O-8843-2019",
  times-cited =  "3",
  unique-id =    "Siozios:2011:SRT",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Cover-page record for IEEE Computer Architecture Letters 10(2),
%%% July\slash December 2011.  Field values use brace delimiters, which
%%% BibTeX treats identically to double quotes.
@Article{Anonymous:2011:Cd,
  author =       {Anonymous},
  title =        {Cover 3},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {10},
  number =       {2},
  pages =        {c3--c3},
  month =        jul # "\slash " # dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2011.30},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

%%% Front-cover record for IEEE Computer Architecture Letters 10(2),
%%% July\slash December 2011.  Field values use brace delimiters, which
%%% BibTeX treats identically to double quotes.
@Article{Anonymous:2011:FCb,
  author =       {Anonymous},
  title =        {[Front cover]},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {10},
  number =       {2},
  pages =        {c1--c1},
  month =        jul # "\slash " # dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2011.28},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

%%% Society-information record for IEEE Computer Architecture Letters
%%% 10(2), July\slash December 2011.  Field values use brace delimiters,
%%% which BibTeX treats identically to double quotes.
@Article{Anonymous:2011:ICS,
  author =       {Anonymous},
  title =        {{IEEE Computer Society} [society information]},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {10},
  number =       {2},
  pages =        {c4--c4},
  month =        jul # "\slash " # dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2011.31},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

%%% Publication-information record for IEEE Computer Architecture
%%% Letters 10(2), July\slash December 2011.  Field values use brace
%%% delimiters, which BibTeX treats identically to double quotes.
@Article{Anonymous:2011:PI,
  author =       {Anonymous},
  title =        {Publication information},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {10},
  number =       {2},
  pages =        {c2--c2},
  month =        jul # "\slash " # dec,
  year =         {2011},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2011.29},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

%%% NOTE(review): removed exact duplicates of the keyword phrases
%%% ``Computer architecture'' and ``computer architecture'' (each
%%% appeared twice verbatim; a Web-of-Science export artifact).  The
%%% case-variant pair is retained per file convention.  All other
%%% fields unchanged.
@Article{Sethumadhavan:2012:CHD,
  author =       "Simha Sethumadhavan and Ryan Roberts and Yannis
                  Tsividis",
  title =        "A Case for Hybrid Discrete-Continuous Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                  http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Current technology trends indicate that power- and
                  energy-efficiency will limit chip throughput in the
                  future. Current solutions to these problems, either in
                  the way of programmable or fixed-function digital
                  accelerators will soon reach their limits as
                  microarchitectural overheads are successively trimmed.
                  A significant departure from current computing methods
                  is required to carry forward computing advances beyond
                  digital accelerators. In this paper we describe how the
                  energy-efficiency of a large class of problems can be
                  improved by employing a hybrid of the discrete and
                  continuous models of computation instead of the
                  ubiquitous, traditional discrete model of computation.
                  We present preliminary analysis of domains and
                  benchmarks that can be accelerated with the new model.
                  Analysis shows that machine learning, physics and up to
                  one-third of SPEC, RMS and Berkeley suite of
                  applications can be accelerated with the new hybrid
                  model.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sethumadhavan, S (Reprint Author), Columbia Univ, New
                  York, NY 10027 USA. Sethumadhavan, Simha; Roberts,
                  Ryan; Tsividis, Yannis, Columbia Univ, New York, NY
                  10027 USA.",
  author-email = "simha@cs.columbia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "DARPA; AFRL [FA8750-10-2-0253,
                  FA9950-09-1-0389]; NSF",
  funding-text = "Sethumadhavan's research is funded by grants from
                  DARPA, AFRL (FA8750-10-2-0253, FA9950-09-1-0389), the
                  NSF CAREER program, gifts from Microsoft Research and
                  Columbia University, and software donations from
                  Synopsys and Wind River. Roberts conducted this
                  research as a GRA in Sethumadhavan's Lab.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Approximation algorithms; Benchmark testing; Berkeley
                  suite; Computational modeling; Computer architecture;
                  computer architecture; computing methods; continuous
                  models; cryptography; Design studies; Differential
                  equations; discrete model; discrete models; domains
                  analysis; energy conservation; energy-efficiency;
                  fixed-function digital accelerators; forward computing
                  advances; hybrid discrete-continuous architectures;
                  Hybrid systems; machine learning; Mathematical model;
                  microarchitectural overheads; microprocessor chips;
                  power-efficiency; Processor architectures; RMS; SPEC;
                  Very large scale integration",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Sethumadhavan:2012:CHD",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Stream-processing-based on-chip data supply framework for
%%% task-parallel MPSoCs; IEEE Computer Architecture Letters 11(1),
%%% pp. 5--8, January/June 2012.
%%% NOTE(review): the funding-acknowledgement value ends abruptly
%%% (``\ldots{} under the IBM''); the complete statement appears in the
%%% funding-text field below --- likely truncated Web of Science data.
@Article{Kong:2012:ASF,
  author =       "Ji Kong and Peilin Liu and Yu Zhang",
  title =        "Atomic Streaming: A Framework of On-Chip Data Supply
                 System for Task-Parallel {MPSoCs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "State of the art fabrication technology for
                 integrating numerous hardware resources such as
                 Processors/DSPs and memory arrays into a single chip
                 enables the emergence of Multiprocessor System-on-Chip
                 (MPSoC). Stream programming paradigm based on MPSoC is
                 highly efficient for single functionality scenario due
                 to its dedicated and predictable data supply system.
                 However, when memory traffic is heavily shared among
                 parallel tasks in applications with multiple
                 interrelated functionalities, performance suffers
                 through task interferences and shared memory
                 congestions which lead to poor parallel speedups and
                 memory bandwidth utilizations. This paper proposes a
                 framework of stream processing based on-chip data
                 supply system for task-parallel MPSoCs. In this
                 framework, stream address generations and data
                 computations are decoupled and parallelized to allow
                 full utilization of on-chip resources. Task
                 granularities are dynamically tuned to jointly optimize
                 the overall application performance. Experiments show
                 that proposed framework as well as the tuning scheme
                 are effective for joint optimization in task-parallel
                 MPSoCs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kong, J (Reprint Author), Shanghai Jiao Tong Univ, Sch
                 Elect Informat \& Elect Engn, Shanghai 200030, Peoples
                 R China. Kong, Ji; Liu, Peilin, Shanghai Jiao Tong
                 Univ, Sch Elect Informat \& Elect Engn, Shanghai
                 200030, Peoples R China.",
  author-email = "johnhophen@sjtu.edu.cn liupeilin@sjtu.edu.cn
                 zhyu@cn.ibm.com",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "IBM Research-China under the IBM",
  funding-text = "This work has been partially supported by IBM
                 Research-China under the IBM Ph.D. Fellowship program
                 for the 2010-2011 academic year.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application studies resulting in better
                 multiple-processor systems; atomic streaming;
                 Bandwidth; data computations; Memory hierarchy;
                 Multi-core/single-chip multiprocessors; Multicore
                 processing; Multiple Data Stream Architectures
                 (Multiprocessors); Multiprocessing systems;
                 multiprocessor system-on-chip; on-chip data supply
                 system; Prefetching; shared memory congestions; shared
                 memory systems; stream address generations; stream
                 programming paradigm; Streaming media;
                 System-on-a-chip; system-on-chip; task interferences;
                 task-parallel MPSoC; Throughput",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kong:2012:ASF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% HW/SW co-designed programmable functional unit (PFU) driven by a
%%% co-designed virtual machine; IEEE Computer Architecture Letters
%%% 11(1), pp. 9--12, January/June 2012.  Also indexed in the
%%% virtual-machines.bib companion bibliography (see bibsource).
@Article{Deb:2012:HSC,
  author =       "Abhishek Deb and Josep Maria Codina and Antonio
                 Gonzalez",
  title =        "A {HW\slash SW} Co-designed Programmable Functional
                 Unit",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "In this paper, we propose a novel programmable
                 functional unit (PFU) to accelerate general purpose
                 application execution on a modern out-of-order x86
                 processor. Code is transformed and instructions are
                 generated that run on the PFU using a co-designed
                 virtual machine (Cd-VM). Results presented in this
                 paper show that this HW/SW co-designed approach
                 produces average speedups in performance of 29\% in
                 SPECFP and 19\% in SPECINT, and up-to 55\%, over modern
                 out-of-order processor.",
  acknowledgement = ack-nhfb,
  affiliation =  "Deb, A (Reprint Author), Univ Politecn Cataluna, C
                 Jordi Girona 1-3, Barcelona, Spain. Deb, Abhishek;
                 Gonzalez, Antonio, Univ Politecn Cataluna, Barcelona,
                 Spain. Maria Codina, Josep; Gonzalez, Antonio, Intel
                 Res Labs Barcelona, Barcelona, Spain.",
  author-email = "abhishek@ac.upc.edu josep.m.codina@intel.com
                 antonio@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; hardware-software codesign;
                 Hardware/software interfaces; hw/sw co-designed;
                 Interface states; Load modeling; Micro-architecture
                 implementation considerations; Microarchitecture;
                 Processor Architectures; programmable functional unit;
                 Programmable functional units; Registers; virtual
                 machine",
  number-of-cited-references = "13",
  ORCID-numbers = "Gonzalez, Antonio/0000-0002-0009-0996",
  research-areas = "Computer Science",
  researcherid-numbers = "Gonzalez, Antonio/I-2961-2014",
  times-cited =  "0",
  unique-id =    "Deb:2012:HSC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% High-level, event-signature-based power estimation for MPSoC
%%% architectures on FPGA, validated against Virtex-6 measurements;
%%% IEEE Computer Architecture Letters 11(1), pp. 13--16,
%%% January/June 2012.
@Article{Piscitelli:2012:HLP,
  author =       "Roberta Piscitelli and Andy D. Pimentel",
  title =        "A High-Level Power Model for {MPSoC} on {FPGA}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper presents a framework for high-level power
                 estimation of multiprocessor systems-on-chip (MPSoC)
                 architectures on FPGA. The technique is based on
                 abstract execution profiles, called event signatures.
                 As a result, it is capable of achieving good evaluation
                 performance, thereby making the technique highly useful
                 in the context of early system-level design space
                 exploration. We have integrated the power estimation
                 technique in a system-level MPSoC synthesis framework.
                 Using this framework, we have designed a range of
                 different candidate MPSoC architectures and compared
                 our power estimation results to those from real
                 measurements on a Virtex-6 FPGA board.",
  acknowledgement = ack-nhfb,
  affiliation =  "Piscitelli, R (Reprint Author), Univ Amsterdam, Inst
                 Informat, NL-1012 WX Amsterdam, Netherlands.
                 Piscitelli, Roberta; Pimentel, Andy D., Univ Amsterdam,
                 Inst Informat, NL-1012 WX Amsterdam, Netherlands.",
  author-email = "r.piscitelli@uva.nl a.d.pimentel@uva.nl",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "MADNESS STREP",
  funding-text = "This work has been partially supported by the MADNESS
                 STREP-FP7 European Project.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstract execution profiles; Computational modeling;
                 Computer architecture; Estimation; event signatures;
                 Field programmable gate arrays; field programmable gate
                 arrays; Field programmable gate arrays; Formal models;
                 High-level power estimation; high-level power
                 estimation framework; high-level power model;
                 integrated circuit design; Mathematical model;
                 Microprocessors; MPSoC on FPGA; multiprocessing
                 systems; multiprocessor systems-on-chip architectures;
                 Performance Analysis and Design Aids; performance
                 evaluation; power aware computing; Power demand; power
                 estimation technique; Simulation; system-level design
                 space exploration; system-level MPSoC design space
                 exploration; system-level MPSoC synthesis framework;
                 system-on-chip; Virtex-6 FPGA board",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Piscitelli:2012:HLP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Overview of compiler-controlled static pipelining for low-power
%%% processors; IEEE Computer Architecture Letters 11(1), pp. 17--20,
%%% January/June 2012.  Carries eissn and oa (open-access) fields not
%%% present in most sibling entries.
@Article{Finlayson:2012:OSP,
  author =       "Ian Finlayson and Gang-Ryung Uh and David Whalley and
                 Gary Tyson",
  title =        "An Overview of Static Pipelining",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A new generation of mobile applications requires
                 reduced energy consumption without sacrificing
                 execution performance. In this paper, we propose to
                 respond to these conflicting demands with an innovative
                 statically pipelined processor supported by an
                 optimizing compiler. The central idea of the approach
                 is that the control during each cycle for each portion
                 of the processor is explicitly represented in each
                 instruction. Thus the pipelining is in effect
                 statically determined by the compiler. The benefits of
                 this approach include simpler hardware and that it
                 allows the compiler to perform optimizations that are
                 not possible on traditional architectures. The initial
                 results indicate that static pipelining can
                 significantly reduce power consumption without
                 adversely affecting performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Finlayson, I (Reprint Author), Florida State Univ,
                 Dept Comp Sci, Tallahassee, FL 32306 USA. Finlayson,
                 Ian; Whalley, David; Tyson, Gary, Florida State Univ,
                 Dept Comp Sci, Tallahassee, FL 32306 USA. Uh,
                 Gang-Ryung, Boise State Univ, Dept Comp Sci, Boise, ID
                 83725 USA.",
  author-email = "finlayso@cs.fsu.edu uh@cs.boisestate.edu
                 whalley@cs.fsu.edu tyson@cs.fsu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CNS-0964413, CNS-0915926]",
  funding-text = "We thank the anonymous reviewers for their
                 constructive comments and suggestions. This research
                 was supported in part by NSF grants CNS-0964413 and
                 CNS-0915926.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; Energy
                 consumption; energy consumption reduction; execution
                 performance; General; mobile applications; optimising
                 compilers; Optimization; optimizing compiler; Pipeline
                 processing; pipeline processing; Pipeline processors;
                 power aware computing; Radio frequency; Registers;
                 statically pipelined processor",
  number-of-cited-references = "14",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Finlayson:2012:OSP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Cache-hierarchy performance and energy impact of Abstract Datatype
%%% Instructions (ADIs); IEEE Computer Architecture Letters 11(1),
%%% pp. 21--24, January/June 2012.
@Article{Wu:2012:CID,
  author =       "Lisa Wu and Martha A. Kim and Stephen A. Edwards",
  title =        "Cache Impacts of Datatype Acceleration",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware acceleration is a widely accepted solution
                 for performance and energy efficient computation
                 because it removes unnecessary hardware for general
                 computation while delivering exceptional performance
                 via specialized control paths and execution units. The
                 spectrum of accelerators available today ranges from
                 coarse-grain off-load engines such as GPUs to
                 fine-grain instruction set extensions such as SSE. This
                 research explores the benefits and challenges of
                 managing memory at the data-structure level and
                 exposing those operations directly to the ISA. We call
                 these instructions Abstract Datatype Instructions
                 (ADIs). This paper quantifies the performance and
                 energy impact of ADIs on the instruction and data cache
                 hierarchies. For instruction fetch, our measurements
                 indicate that ADIs can result in 21-48\% and 16-27\%
                 reductions in instruction fetch time and energy
                 respectively. For data delivery, we observe a 22-40\%
                 reduction in total data read/write time and 9-30\% in
                 total data read/write energy.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wu, L (Reprint Author), Columbia Univ, Dept Comp Sci,
                 New York, NY 10027 USA. Wu, Lisa; Kim, Martha A.;
                 Edwards, Stephen A., Columbia Univ, Dept Comp Sci, New
                 York, NY 10027 USA.",
  author-email = "lisa@cs.columbia.edu martha@cs.columbia.edu
                 sedwards@cs.columbia.edu",
  da =           "2019-06-20",
  doc-delivery-number = "953VM",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstract data types; abstract datatype instruction;
                 Accelerators; ADI; cache hierarchy; Cache Hierarchy;
                 cache hierarchy; Cache memories; cache storage; coarse
                 grain off-load engine; data read-write energy; data
                 structure level; Data Structures; energy conservation;
                 energy efficient computation; energy impact; execution
                 unit; fine grain instruction set extension; hardware
                 acceleration; Hardware acceleration; hardware
                 acceleration; Hardware/software interfaces; Instruction
                 fetch; instruction fetch energy; instruction fetch
                 time; Instruction Set Extensions; instruction sets;
                 ISA; Memory hierarchy; memory management; Memory
                 Structures; Multicore processing; power aware
                 computing; Program processors; Support vector machines;
                 Vectors",
  number-of-cited-references = "15",
  ORCID-numbers = "Edwards, Stephen/0000-0003-2609-4861",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Wu:2012:CID",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Front-matter item: 2011 reviewers list; IEEE Computer Architecture
%%% Letters 11(1), pp. 25--26, January/June 2012.
@Article{Anonymous:2012:RL,
  author =       "Anonymous",
  title =        "2011 Reviewers List",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "25--26",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Lists the reviewers who contributed to IEEE Computer
                 Architecture Letters in 2011.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "IEEE publishing",
}

%%% Advertisement item (IEEE Transactions collection); IEEE Computer
%%% Architecture Letters 11(1), p. 26, January/June 2012.
@Article{Anonymous:2012:TNQ,
  author =       "Anonymous",
  title =        "There now is a quick and easy way to find out about
                 our collection of {{\booktitle{Transactions}}}
                 [Advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "26--26",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement: Visit http://www.computer.org/whats-new
                 today!",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Advertisement item (IEEE Conference Publishing Services); IEEE
%%% Computer Architecture Letters 11(1), p. 28, January/June 2012.
@Article{Anonymous:2012:ACP,
  author =       "Anonymous",
  title =        "Advertisement --- Conference Publishing Services
                 (CPS)",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "28--28",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "IEEE Conference Publishing Services (CPS)
                 advertisement.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front-matter item: 2011 annual author/subject index; IEEE Computer
%%% Architecture Letters 11(1), January/June 2012.
%%% NOTE(review): pages are the file's standard ``??--??'' placeholder
%%% --- page range not recorded in the source metadata; confirm against
%%% IEEE Xplore if needed.
%%% Fix: corrected transcription typo in the abstract's final sentence
%%% (``under he primary entry'' --> ``under the primary entry'').
@Article{Anonymous:2012:AI,
  author =       "Anonymous",
  title =        "2011 Annual Index",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "??--??",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This index covers all technical items --- papers,
                 correspondence, reviews, etc. --- that appeared in this
                 periodical during the year, and items from previous
                 years that were commented upon or corrected in this
                 year. Departments and other items may also be covered
                 if they have been judged to have archival value. The
                 Author Index contains the primary entry for each item,
                 listed under the first author's name. The primary entry
                 includes the co-authors' names, the title of the paper
                 or other item, and its location, specified by the
                 publication abbreviation, year, month, and inclusive
                 pagination. The Subject Index contains entries
                 describing the item under all appropriate subject
                 headings, plus the first author's name, the publication
                 abbreviation, month, and year, and inclusive pages.
                 Note that the item title is found only under the
                 primary entry in the Author Index.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

%%% Front-matter item: inside front cover (society officers); IEEE
%%% Computer Architecture Letters 11(1), p. c2, January/June 2012.
@Article{Anonymous:2012:Ca,
  author =       "Anonymous",
  title =        "[Cover2]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c2--c2",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.15",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front-matter item: inside back cover (society officers); IEEE
%%% Computer Architecture Letters 11(1), p. c3, January/June 2012.
@Article{Anonymous:2012:Cb,
  author =       "Anonymous",
  title =        "[Cover3]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c3--c3",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front-matter item: front cover and table of contents; IEEE Computer
%%% Architecture Letters 11(1), p. c1, January/June 2012.
@Article{Anonymous:2012:FCT,
  author =       "Anonymous",
  title =        "[Front cover and table of contents]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c1--c1",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the table of contents for this issue of the
                 periodical.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Back-matter item: back cover (committee members and society
%%% officers); IEEE Computer Architecture Letters 11(1), p. c4,
%%% January/June 2012.
@Article{Anonymous:2012:ICS,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [Back cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "1",
  pages =        "c4--c4",
  month =        jan # "\slash " # jun,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Provides a listing of current committee members and
                 society officers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Inter-node variability in large-scale cluster power models; IEEE
%%% Computer Architecture Letters 11(2), pp. 29--32, July/December
%%% 2012.  First entry of issue 2 in this chunk.
%%% NOTE(review): the affiliation field lists only Rivoire and
%%% Ardestani and carries no ``(Reprint Author)'' marker, unlike
%%% sibling entries --- possibly incomplete Web of Science data.
@Article{Davis:2012:IVL,
  author =       "John D. Davis and Suzanne Rivoire and Moises
                 Goldszmidt and Ehsan K. Ardestani",
  title =        "Including Variability in Large-Scale Cluster Power
                 Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "29--32",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Studying the energy efficiency of large-scale computer
                 systems requires models of the relationship between
                 resource utilization and power consumption. Prior work
                 on power modeling assumes that models built for a
                 single node will scale to larger groups of machines.
                 However, we find that inter-node variability in
                 homogeneous clusters leads to substantially different
                 models for different nodes. Moreover, ignoring this
                 variability will result in significant prediction
                 errors when scaled to the cluster level. We report on
                 inter-node variation for four homogeneous five-node
                 clusters using embedded, laptop, desktop, and server
                 processors. The variation is manifested quantitatively
                 in the prediction error and qualitatively on the
                 resource utilization variables (features) that are
                 deemed relevant for the models. These results
                 demonstrate the need to sample multiple machines in
                 order to produce accurate cluster models.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rivoire, Suzanne, Sonoma State Univ, Rohnert Pk, CA
                 94928 USA. Ardestani, Ehsan K., Univ CA, Santa Cruz, CA
                 USA.",
  author-email = "john.d@microsoft.com suzanne.rivoire@sonoma.edu
                 moises@microsoft.com eka@soe.ucsc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computational modeling; Data models; evaluation;
                 Measurement; modeling; Power demand; Power Management;
                 Power measurement; Predictive models; Radiation
                 detectors; Servers; simulation of multiple-processor
                 systems",
  number-of-cited-references = "26",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Davis:2012:IVL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Potential-function-based DRAM scheduling (alpha-SJF) for GPGPU
%%% architectures; IEEE Computer Architecture Letters 11(2),
%%% pp. 33--36, July/December 2012.  Also indexed in the
%%% multithreading.bib companion bibliography (see bibsource).
@Article{Lakshminarayana:2012:DSP,
  author =       "Nagesh B. Lakshminarayana and Jaekyu Lee and Hyesoon
                 Kim and Jinwoo Shin",
  title =        "{DRAM} Scheduling Policy for {GPGPU} Architectures
                 Based on a Potential Function",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "33--36",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.32",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "GPGPU architectures (applications) have several
                 different characteristics compared to traditional CPU
                 architectures (applications): highly multithreaded
                 architectures and SIMD-execution behavior are the two
                 important characteristics of GPGPU computing. In this
                 paper, we propose a potential function that models the
                 DRAM behavior in GPGPU architectures and a DRAM
                 scheduling policy, alpha-SJF policy to minimize the
                 potential function. The scheduling policy essentially
                 chooses between SJF and FR-FCFS at run-time based on
                 the number of requests from each thread and whether the
                 thread has a row buffer hit.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lakshminarayana, NB (Reprint Author), Georgia Inst
                 Technol, Sch Comp Sci, Atlanta, GA 30332 USA.
                 Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon;
                 Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci,
                 Atlanta, GA 30332 USA.",
  author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu
                 hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Computer architecture; DRAM chips;
                 DRAM scheduling; DRAM scheduling policy; dynamic random
                 access memory; Equations; general-purpose graphics
                 processing unit; GPGPU; GPGPU architecture; graphics
                 processing units; Instruction sets; Mathematical model;
                 multi-threading; multithreaded architecture; Potential
                 function; potential function; Potential function;
                 Processor scheduling; Random access memory; row buffer
                 hit; scheduling; SIMD-execution behavior",
  number-of-cited-references = "5",
  research-areas = "Computer Science",
  researcherid-numbers = "Shin, Jinwoo/M-5389-2013",
  times-cited =  "7",
  unique-id =    "Lakshminarayana:2012:DSP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Instruction-shuffle mechanism for handling divergent control flow
%%% on SIMD architectures (IEEE CAL 11(2):37--40, Jul/Dec 2012).
@Article{Wang:2012:ISA,
  author =       "Yaohua Wang and Shuming Chen and Kai Zhang and
                 Jianghua Wan and Xiaowen Chen and Hu Chen and Haibo
                 Wang",
  title =        "Instruction Shuffle: Achieving {MIMD}-like Performance
                 on {SIMD} Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "37--40",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.34",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "SIMD architectures are less efficient for applications
                 with the diverse control-flow behavior, which can be
                 mainly attributed to the requirement of the identical
                 control-flow. In this paper, we propose a novel
                 instruction shuffle scheme that features an efficient
                 control-flow handling mechanism. The cornerstones are
                 composed of a shuffle source instruction buffer array
                 and an instruction shuffle unit. The shuffle unit can
                 concurrently deliver instructions of multiple distinct
                 control-flows from the instruction buffer array to
                 eligible SIMD lanes. Our instruction shuffle scheme
                 combines the best attributes of both the SIMD and MIMD
                 execution paradigms. Experimental results show that, an
                 average performance improvement of 86\% can be
                 achieved, at a cost of only 5.8\% area overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, YH (Reprint Author), Natl Univ Def Technol, Sch
                 Comp Sci, Changsha, Hunan, Peoples R China. Wang,
                 Yaohua; Chen, Shuming; Zhang, Kai; Wan, Jianghua; Chen,
                 Xiaowen; Chen, Hu; Wang, Haibo, Natl Univ Def Technol,
                 Sch Comp Sci, Changsha, Hunan, Peoples R China.",
  author-email = "nudtyh@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Natural Science Foundation of
                 China [61070036, 61133007]; National 863 Program of
                 China [2009AA011704]",
  funding-text = "The work is partially supported by the National
                 Natural Science Foundation of China (No. 61070036), the
                 National Natural Science Foundation of China (No.
                 61133007), the National 863 Program of China (No.
                 2009AA011704).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Arrays; data dependent control-flow; diverse
                 control-flow behavior; identical control-flow behavior;
                 instruction buffer array; Instruction sets; instruction
                 shuffle; instruction shuffle unit; Kernel; MIMD
                 execution paradigm; MIMD-like performance; multiple
                 instruction multiple data; parallel processing; Process
                 control; Resource management; Scalability; shuffle
                 source instruction buffer array; SIMD; SIMD
                 architecture; SIMD execution paradigm; single
                 instruction multiple data; Vectors",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  researcherid-numbers = "Chen, Shuming/Q-1147-2018",
  times-cited =  "6",
  unique-id =    "Wang:2012:ISA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% B-Fetch: branch-prediction-directed data prefetching for in-order
%%% processors (IEEE CAL 11(2):41--44, Jul/Dec 2012).
%%% NOTE(review): restored ``{$\sim$}1/3'' in the abstract, which the
%%% Web of Science export had flattened to the words ``similar to 1/3''.
@Article{Panda:2012:BFB,
  author =       "Reena Panda and Paul V. Gratz and Daniel A.
                 Jim{\'e}nez",
  title =        "{B-Fetch}: Branch Prediction Directed Prefetching for
                 In-Order Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "41--44",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.33",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Computer architecture is beset by two opposing trends.
                 Technology scaling and deep pipelining have led to high
                 memory access latencies; meanwhile, power and energy
                 considerations have revived interest in traditional
                 in-order processors. In-order processors, unlike their
                 superscalar counterparts, do not allow execution to
                 continue around data cache misses. In-order processors,
                 therefore, suffer a greater performance penalty in the
                 light of the current high memory access latencies.
                 Memory prefetching is an established technique to
                 reduce the incidence of cache misses and improve
                 performance. In this paper, we introduce B-Fetch, a new
                 technique for data prefetching which combines branch
                 prediction based lookahead deep path speculation with
                 effective address speculation, to efficiently improve
                 performance in in-order processors. Our results show
                 that B-Fetch improves performance 38.8\% on SPEC
                 CPU2006 benchmarks, beating a current, state-of-the-art
                 prefetcher design at {$\sim$}1/3 the hardware
                 overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Panda, R (Reprint Author), Texas A\&M Univ, Dept Elect
                 \& Comp Engn, CESG, College Stn, TX 77843 USA. Panda,
                 Reena; Gratz, Paul V., Texas A\&M Univ, Dept Elect \&
                 Comp Engn, CESG, College Stn, TX 77843 USA. Jimenez,
                 Daniel A., Univ Texas San Antonio, Dept Comp Sci, San
                 Antonio, TX USA.",
  author-email = "reena.panda@tamu.edu pgratz@tamu.edu dj@cs.utsa.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address speculation; B-fetch; Benchmark testing;
                 Branch Prediction; branch prediction based lookahead
                 deep path speculation; branch prediction directed
                 prefetching; Cache memory; computer architecture;
                 Computer architecture; data cache; Data Cache
                 Prefetching; deep pipelining; energy consideration;
                 Hardware; in-order processor; In-order Processors;
                 memory access latency; memory prefetching; Memory
                 Systems; Pipelines; power aware computing; power
                 consideration; Prefetching; Process control; Registers;
                 storage management; superscalar processor; technology
                 scaling; Value Prediction",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Panda:2012:BFB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Dual Voltage Rail + Half-Speed Unit techniques for process
%%% variation in near-threshold CMPs (IEEE CAL 11(2):45--48, Jul/Dec 2012).
%%% NOTE(review): removed an exact duplicate ``Energy efficiency''
%%% keyword left by the Web of Science index-term merge.
@Article{Miller:2012:MEP,
  author =       "Timothy N. Miller and Renji Thomas and Radu
                 Teodorescu",
  title =        "Mitigating the Effects of Process Variation in
                 Ultra-low Voltage Chip Multiprocessors using Dual
                 Supply Voltages and Half-Speed Units",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "45--48",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.36",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Energy efficiency is a primary concern for
                 microprocessor designers. One very effective approach
                 to improving processor energy efficiency is to lower
                 its supply voltage to very near to the transistor
                 threshold voltage. This reduces power consumption
                 dramatically, improving energy efficiency by an order
                 of magnitude. Low voltage operation, however, increases
                 the effects of parameter variation resulting in
                 significant frequency heterogeneity between (and
                 within) otherwise identical cores. This heterogeneity
                 severely limits the maximum frequency of the entire
                 CMP. We present a combination of techniques aimed at
                 reducing the effects of variation on the performance
                 and energy efficiency of near-threshold, many-core
                 CMPs. Dual Voltage Rail (DVR), mitigates core-to-core
                 variation with a dual-rail power delivery system that
                 allows post-manufacturing assignment of different
                 supply voltages to individual cores. This speeds up
                 slow cores by assigning them to a higher voltage and
                 saves power on fast cores by assigning them to a lower
                 voltage. Half-Speed Unit (HSU) mitigates within-core
                 variation by halving the frequency of select functional
                 blocks with the goal of boosting the frequency of
                 individual cores, thus raising the frequency ceiling
                 for the entire CMP. Together, these variation-reduction
                 techniques result in almost 50\% improvement in CMP
                 performance for the same power consumption over a mix
                 of workloads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Miller, TN (Reprint Author), Ohio State Univ, Dept
                 Comp Sci \& Engn, Columbus, OH 43210 USA. Miller,
                 Timothy N.; Thomas, Renji; Teodorescu, Radu, Ohio State
                 Univ, Dept Comp Sci \& Engn, Columbus, OH 43210 USA.",
  author-email = "millerti@cse.ohio-state.edu thomasr@cse.ohio-state.edu
                 teodores@cse.ohio-state.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-1117799]",
  funding-text = "This work was supported in part by the National
                 Science Foundation under grant CCF-1117799 and an
                 allocation of computing time from the Ohio
                 Supercomputer Center. The authors would like to thank
                 the anonymous reviewers for their suggestions and
                 feedback.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; chip multiprocessors; Clocks; CMP
                 frequency ceiling; CMP performance; Computer
                 architecture; core-to-core variation; Delay; dual
                 supply voltage; dual voltage rail; dual-rail power
                 delivery system; energy conservation; Energy
                 efficiency; energy efficiency; frequency
                 heterogeneity; half-speed unit; low voltage operation;
                 microprocessor chips; microprocessor design;
                 Multiprocessing systems; near-threshold voltage;
                 parameter variation; power aware computing; power
                 consumption; Power demand; process variation; process
                 variation effect; Rails; supply voltage assignment;
                 Threshold voltage; transistor threshold voltage;
                 ultra-low voltage chip multiprocessors; within-core
                 variation",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Miller:2012:MEP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Partial-sharing second-level TLB organization for chip
%%% multiprocessors (IEEE CAL 11(2):49--52, Jul/Dec 2012).
%%% NOTE(review): added the missing terminal period to funding-text,
%%% for consistency with the other funding-text fields in this issue.
@Article{Li:2012:LSS,
  author =       "Yong Li and Rami Melhem and Alex K. Jones",
  title =        "Leveraging Sharing in Second Level
                 Translation-Lookaside Buffers for Chip
                 Multiprocessors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "49--52",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.35",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Traversing page table during virtual to physical
                 address translation causes significant pipeline stalls
                 when misses occur in the translation-lookaside buffer
                 (TLB). To mitigate this penalty, we propose a fast,
                 scalable, multi-level TLB organization that leverages
                 page sharing behaviors and performs efficient TLB entry
                 placement. Our proposed partial sharing TLB (PSTLB)
                 reduces TLB misses by around 60\%. PSTLB also improves
                 TLB performance by nearly 40\% compared to traditional
                 private TLBs and 17\% over the state of the art
                 scalable TLB proposal.",
  acknowledgement = ack-nhfb,
  affiliation =  "Li, Y (Reprint Author), Univ Pittsburgh, Dept Elect \&
                 Comp Engn, Pittsburgh, PA 15261 USA. Li, Yong, Univ
                 Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA
                 15261 USA.",
  author-email = "yol26@pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-0702452]",
  funding-text = "This work is supported by NSF award CCF-0702452.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; buffer storage; chip
                 multiprocessor; CMPs; Fluids; microprocessor chips;
                 multilevel TLB organization; multiprocessing systems;
                 Oceans; page sharing behavior; Partial Sharing; partial
                 sharing TLB; Prefetching; private TLB; program
                 interpreters; Runtime; second level
                 translation-lookaside buffers; Tiles; TLB entry
                 placement; TLBs; Virtual private networks;
                 virtual-to-physical address translation",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Li:2012:LSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Modeling/generation framework for datacenter storage workloads
%%% (IEEE CAL 11(2):53--56, Jul/Dec 2012).
%%% NOTE(review): added the eissn field (1556-6064, per the entry's own
%%% ISSN field) carried by the other WoS-derived entries in this issue.
@Article{Delimitrou:2012:DDS,
  author =       "Christina Delimitrou and Sriram Sankar and Kushagra
                 Vaid and Christos Kozyrakis",
  title =        "Decoupling Datacenter Storage Studies from Access to
                 Large-Scale Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "53--56",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2011.37",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Suboptimal storage design has significant cost and
                 power impact in large-scale datacenters (DCs).
                 Performance, power and cost-optimized systems require
                 deep understanding of target workloads, and mechanisms
                 to effectively model different storage design choices.
                 Traditional benchmarking is invalid in cloud
                 data-stores, representative storage profiles are hard
                 to obtain, while replaying applications in different
                 storage configurations is impractical both in cost and
                 time. Despite these issues, current workload generators
                 are not able to reproduce key aspects of real
                 application patterns (e.g., spatial/temporal locality,
                 I/O intensity). In this paper, we propose a modeling
                 and generation framework for large-scale storage
                 applications. As part of this framework we use a state
                 diagram-based storage model, extend it to a
                 hierarchical representation, and implement a tool that
                 consistently recreates DC application I/O loads. We
                 present the principal features of the framework that
                 allow accurate modeling and generation of storage
                 workloads, and the validation process performed against
                 ten original DC application traces. Finally, we explore
                 two practical applications of this methodology: SSD
                 caching and defragmentation benefits on enterprise
                 storage. Since knowledge of the workload's spatial and
                 temporal locality is necessary to model these use
                 cases, our framework was instrumental in quantifying
                 their performance benefits. The proposed methodology
                 provides detailed understanding of the storage activity
                 of large-scale applications, and enables a wide
                 spectrum of storage studies, without the requirement to
                 access application code and full application
                 deployment.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delimitrou, C (Reprint Author), Stanford Univ,
                 Stanford, CA 94305 USA. Delimitrou, Christina;
                 Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
                 USA. Sankar, Sriram; Vaid, Kushagra, Microsoft Corp,
                 Seattle, WA USA.",
  author-email = "cdel@stanford.edu srsankar@microsoft.com
                 kvaid@microsoft.com kozyraki@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cloud data-store; Computational modeling; computer
                 centres; cost impact; datacenter storage; Electronic
                 mail; enterprise storage defragmentation; Generators;
                 large-scale datacenter; Load modeling; Mass storage;
                 Modeling of computer architecture; Modeling techniques;
                 power impact; SSD caching; state diagram-based storage
                 model; Storage area networks; storage design choice;
                 storage management; storage profile; storage workload;
                 suboptimal storage design; Super (very large)
                 computers; Throughput; Very large scale integration;
                 workload spatial locality; workload temporal locality",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Delimitrou:2012:DDS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Case for power debugging (vs. performance debugging) of parallel
%%% applications on multi-cores (IEEE CAL 11(2):57--60, Jul/Dec 2012).
%%% NOTE(review): removed an exact duplicate ``Power Debugging''
%%% keyword left by the Web of Science index-term merge.
@Article{Chen:2012:NPD,
  author =       "Jie Chen and Guru Venkataramani and Gabriel Parmer",
  title =        "The Need for Power Debugging in the Multi-Core
                 Environment",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "57--60",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Debugging an application for power has a wide array of
                 benefits ranging from minimizing the thermal hotspots
                 to reducing the likelihood of CPU malfunction. In this
                 work, we justify the need for power debugging, and show
                 that performance debugging of a parallel application
                 does not automatically guarantee power balance across
                 multiple cores. We perform experiments and show our
                 results using two case study benchmarks, Volrend from
                 Splash-2 and Bodytrack from Parsec-1.0.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, J (Reprint Author), George Washington Univ,
                 Washington, DC 20052 USA. Chen, Jie; Venkataramani,
                 Guru; Parmer, Gabriel, George Washington Univ,
                 Washington, DC 20052 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CCF-1117243]",
  funding-text = "This material is based upon work supported in part by
                 the National Science Foundation under Grant No.
                 CCF-1117243.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Bodytrack; Debugging; Instruction
                 sets; Multi-cores; multicore environment; Multicore
                 processing; multiprocessing systems; parallel
                 application; parallel programming; Parsec-1.0;
                 performance debugging; power aware computing; power
                 balance; Power Debugging; power debugging; Power
                 demand; Power Imbalance; program debugging; Splash-2;
                 Volrend",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Chen:2012:NPD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Fine-granularity DRAM cache metadata management for hybrid
%%% DRAM/PCM main memories (IEEE CAL 11(2):61--64, Jul/Dec 2012).
@Article{Meza:2012:EES,
  author =       "Justin Meza and Jichuan Chang and HanBin Yoon and Onur
                 Mutlu and Parthasarathy Ranganathan",
  title =        "Enabling Efficient and Scalable Hybrid Memories Using
                 Fine-Granularity {DRAM} Cache Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "61--64",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hybrid main memories composed of DRAM as a cache to
                 scalable non-volatile memories such as phase-change
                 memory (PCM) can provide much larger storage capacity
                 than traditional main memories. A key challenge for
                 enabling high-performance and scalable hybrid memories,
                 though, is efficiently managing the metadata (e.g.,
                 tags) for data cached in DRAM at a fine granularity.
                 Based on the observation that storing metadata off-chip
                 in the same row as their data exploits DRAM row buffer
                 locality, this paper reduces the overhead of
                 fine-granularity DRAM caches by only caching the
                 metadata for recently accessed rows on-chip using a
                 small buffer. Leveraging the flexibility and efficiency
                 of such a fine-granularity DRAM cache, we also develop
                 an adaptive policy to choose the best granularity when
                 migrating data into DRAM. On a hybrid memory with a
                 512MB DRAM cache, our proposal using an 8KB on-chip
                 buffer can achieve within 6\% of the performance of,
                 and 18\% better energy efficiency than, a conventional
                 8MB SRAM metadata store, even when the energy overhead
                 due to large SRAM metadata storage is not considered.",
  acknowledgement = ack-nhfb,
  affiliation =  "Meza, J (Reprint Author), Carnegie Mellon Univ,
                 Pittsburgh, PA 15213 USA. Meza, Justin; Yoon, HanBin;
                 Mutlu, Onur, Carnegie Mellon Univ, Pittsburgh, PA 15213
                 USA. Chang, Jichuan; Ranganathan, Parthasarathy,
                 Hewlett Packard Labs, Palo Alto, CA USA.",
  author-email = "meza@cmu.edu jichuan.chang@hp.com hanbinyoon@cmu.edu
                 onur@cmu.edu partha.ranganathan@hp.com",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF CAREER [CCF-0953246]; NSF EAGER
                 [CCF-1147397]; Gigascale Systems Research Center",
  funding-text = "We thank the members of the SAFARI research group and
                 the anonymous reviewers for their comments and
                 suggestions. We gratefully acknowledge the support of
                 an NSF CAREER Award CCF-0953246, NSF EAGER Grant
                 CCF-1147397, and the Gigascale Systems Research Center.
                 Part of this work was done while Justin Meza and HanBin
                 Yoon were interns at Hewlett-Packard Labs.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Buffer storage; Cache memories; Cache
                 memory; cache storage; data migration; DRAM chips; DRAM
                 row buffer locality; dynamic random access memory;
                 fine-granularity DRAM cache management; hybrid main
                 memories; hybrid main memory; Indexes; Memory
                 management; meta data; metadata caching; metadata
                 management; metadata storage; non-volatile memories;
                 Phase change materials; phase-change memory; Random
                 access memory; scalable hybrid memory;
                 System-on-a-chip; tag storage",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "35",
  unique-id =    "Meza:2012:EES",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% MultiAmdahl: analytical resource-allocation model for heterogeneous
%%% chips (IEEE CAL 11(2):65--68, Jul/Dec 2012).
%%% NOTE(review): corrected the keyword typo ``heterogenous'' to
%%% ``heterogeneous'' (cf. the article title).
@Article{Zidenberg:2012:MHS,
  author =       "Tsahee Zidenberg and Isaac Keslassy and Uri Weiser",
  title =        "{MultiAmdahl}: How Should {I} Divide My Heterogeneous
                 Chip?",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "65--68",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Future multiprocessor chips will integrate many
                 different units, each tailored to a specific
                 computation. When designing such a system, a chip
                 architect must decide how to distribute the available
                 limited system resources, such as area and power, among
                 all the computational units. In this paper, we
                 introduce MultiAmdahl, an analytical optimization
                 technique for resource sharing among heterogeneous
                 units. MultiAmdahl takes into account the workload, the
                 performance of each computational unit, and the total
                 available resource. The results obtained by MultiAmdahl
                 allow us, for example, to provide a closed-form
                 solution for an optimal asymmetric-offload chip, and to
                 analyze the impact of different design constraints on
                 an optimal chip architecture.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zidenberg, T (Reprint Author), Technion Israel Inst
                 Technol, EE Dept, Haifa, Israel. Zidenberg, Tsahee;
                 Keslassy, Isaac; Weiser, Uri, Technion Israel Inst
                 Technol, EE Dept, Haifa, Israel.",
  author-email = "tsahee@tx.technion.ac.il isaac@ee.technion.ac.il
                 weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "057JO",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "European Research Council [210389]; Intel
                 Heterogeneous Computing research grant",
  funding-text = "This work was partly supported by the European
                 Research Council Starting Grant No. 210389 and by the
                 Intel Heterogeneous Computing research grant.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "area resource; asymmetric-offload chip; Central
                 Processing Unit; Chip Multiprocessors; Computational
                 modeling; computational unit; Computer architecture;
                 design constraint; heterogeneous unit; heterogeneous
                 chip; Mathematical model; microprocessor chips;
                 Modeling of computer architecture; MultiAmdahl
                 analytical optimization technique; multiprocessing
                 systems; multiprocessor chip; optimal chip
                 architecture; Optimization; power resource; Program
                 processors; resource allocation; Resource management;
                 resource sharing",
  keywords-plus = "AMDAHLS LAW",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  times-cited =  "12",
  unique-id =    "Zidenberg:2012:MHS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Back cover of IEEE CAL 11(2), Jul/Dec 2012 (unpaginated, c4).
@Article{Anonymous:2012:BC,
  author =       "Anonymous",
  title =        "[Back cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "11",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2012",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.38",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@article{Anonymous:2012:BIC,
  author =       {Anonymous},
  title =        {[Back inside cover]},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {11},
  number =       {2},
  pages =        {c3--c3},
  month =        jul # "\slash " # dec,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2012.37},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

@article{Anonymous:2012:FIC,
  author =       {Anonymous},
  title =        {[Front inside cover]},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {11},
  number =       {2},
  pages =        {c2--c2},
  month =        jul # "\slash " # dec,
  year =         {2012},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2012.36},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

@article{Skadron:2013:INE,
  author =       {Kevin Skadron},
  title =        {Introducing the New {Editor-in-Chief} of the
                 {{\booktitle{IEEE Computer Architecture Letters}}}},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {12},
  number =       {1},
  pages =        {1--1},
  month =        jan # "\slash " # jun,
  year =         {2013},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2013.15},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {The out-going Editor-in-Chief introduces Jose F.
                 Mart{\'\i}nez as the new Editor-in-Chief (EIC) of the
                 IEEE Computer Architecture Letters (CAL). A brief
                 professional biography is included. In addition, it is
                 noted that CAL aims to provide fast-turnaround for
                 early work with outstanding promise. The majority of
                 decisions are returned within one month, nearly all
                 within six weeks, and all decisions are rendered within
                 two months. The overall acceptance rate has
                 consistently run at about 25\%. Many papers first
                 published in CAL go on to become full papers in premier
                 conferences and journals, and CAL's impact factor
                 continues to increase. CAL has been a valuable addition
                 to the publishing landscape in computer architecture
                 and under Prof. Martinez's leadership, we can look
                 forward to even greater impact in the future. I would
                 like to take this opportunity to thank all of the CAL
                 Associate Editors, authors, readers, and reviewers for
                 their great help and support.},
  acknowledgement = ack-nhfb,
  da =           {2019-06-20},
  doc-delivery-number = {172HT},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  number-of-cited-references = {0},
  research-areas = {Computer Science},
  times-cited =  {0},
  unique-id =    {Skadron:2013:INE},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@article{Anonymous:2013:AI,
  author =       {Anonymous},
  title =        {2012 Annual Index},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {12},
  number =       {1},
  pages =        {1--4},
  month =        jan # "\slash " # jun,
  year =         {2013},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2013.10},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {This index covers all technical items - papers,
                 correspondence, reviews, etc. - that appeared in this
                 periodical during the year, and items from previous
                 years that were commented upon or corrected in this
                 year. Departments and other items may also be covered
                 if they have been judged to have archival value. The
                 Author Index contains the primary entry for each item,
                 listed under the first author's name. The primary entry
                 includes the co-authors' names, the title of the paper
                 or other item, and its location, specified by the
                 publication abbreviation, year, month, and inclusive
                 pagination. The Subject Index contains entries
                 describing the item under all appropriate subject
                 headings, plus the first author's name, the publication
                 abbreviation, month, and year, and inclusive pages.
                 Note that the item title is found only under the
                 primary entry in the Author Index.},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {Indexes},
}

@article{Eeckhout:2013:MNE,
  author =       {Lieven Eeckhout},
  title =        {A Message from the New {Editor-in-Chief} and
                 Introduction of New {Associate Editors}},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {12},
  number =       {1},
  pages =        {2--2},
  month =        jan # "\slash " # jun,
  year =         {2013},
  CODEN =        {????},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Thu Jun 20 17:18:18 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  affiliation =  {Eeckhout, L (Reprint Author), Univ Ghent, B-9000
                 Ghent, Belgium.},
  da =           {2019-06-20},
  doc-delivery-number = {172HT},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  number-of-cited-references = {0},
  research-areas = {Computer Science},
  times-cited =  {0},
  unique-id =    {Eeckhout:2013:MNE},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@Article{Martinez:2013:MNE,
  author =       "J. Martinez",
  title =        "A Message from the New {Editor-in-Chief} and
                 Introduction of New {Associate} Editors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "2--4",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.12",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The incoming Editor-in-Chief states that his goal
                 during his tenure with IEEE Computer Architecture
                 Letters (CAL) will be to further increase its
                 visibility in our research community, and to attract
                 more submissions from computer architecture leaders.
                 The ``Best of CAL'' session at HPCA, which has taken
                 place for the last couple of years, is a good step in
                 this direction. He is also committed to continue
                 improving the coordination with authors and conference
                 program chairs, and to consolidate CAL's unique place
                 in the publication pipeline as the prime venue for
                 quick dissemination of high-quality novel ideas and
                 early results.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Tavakkol:2013:NSS,
  author =       "Arash Tavakkol and Mohammad Arjomand and Hamid
                 Sarbazi-Azad",
  title =        "{Network-on-SSD}: A Scalable and High-Performance
                 Communication Design Paradigm for {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In recent years, flash memory solid state disks (SSDs)
                 have shown a great potential to change storage
                 infrastructure because of its advantages of high speed
                 and high throughput random access. This promising
                 storage, however, greatly suffers from performance loss
                 because of frequent ``erase-before-write'' and
                 ``garbage collection'' operations. Thus, novel
                 circuit-level, architectural, and algorithmic
                 techniques are currently explored to address these
                 limitations. In parallel with others, current study
                 investigates replacing shared buses in multi-channel
                 architecture of SSDs with an interconnection network to
                 achieve scalable, high throughput, and reliable SSD
                 storage systems. Roughly speaking, such a communication
                 scheme provides superior parallelism that allows us to
                 compensate the main part of the performance loss
                 related to the aforementioned limitations through
                 increasing data storage and retrieval processing
                 throughput.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tavakkol, A (Reprint Author), Sharif Univ Technol,
                 Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol,
                 Arash; Arjomand, Mohammad; Sarbazi-Azad, Hamid, Sharif
                 Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran.
                 Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
                 Comp Sci, Tehran, Iran.",
  author-email = "tavakkol@ce.sharif.edu arjomand@ce.sharif.edu
                 azad@sharif.edu",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "algorithmic technique; architectural technique;
                 Bandwidth; Buffer storage; circuit-level technique;
                 Complexity theory; Data storage systems; data storage
                 throughput; flash memories; Flash memory; flash memory
                 solid state disks; frequent erase-before-write
                 operations; garbage collection operations; high speed
                 random access; high throughput random access;
                 high-performance communication design paradigm;
                 integrated circuit design; integrated circuit
                 reliability; Inter-package parallelism; interconnection
                 network; Interconnection network; interconnection
                 network; Interconnections (Subsystems); Mass storage;
                 memory architecture; multichannel architecture;
                 multiprocessor interconnection networks;
                 network-on-chip; network-on-SSD; parallel memories;
                 Parallel processing; parallel storage; performance
                 evaluation; performance loss; retrieval processing
                 throughput; scalable communication design paradigm;
                 Solid state disk; SSD storage system reliability;
                 storage infrastructure; storage management; system
                 buses; Throughput",
  keywords-plus = "MEMORY",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Tavakkol:2013:NSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@article{Sun:2013:NWC,
  author =       {Guang Sun and Chia-Wei Chang and Bill Lin},
  title =        {A New Worst-Case Throughput Bound for Oblivious
                 Routing in Odd Radix Mesh Network},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {12},
  number =       {1},
  pages =        {9--12},
  month =        jan # "\slash " # jun,
  year =         {2013},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2012.5},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {1/2 network capacity is often believed to be the limit
                 of worst-case throughput for mesh networks. However,
                 this letter provides a new worst-case throughput bound,
                 which is higher than 1/2 network capacity, for odd
                 radix two-dimensional mesh networks. In addition, we
                 propose a routing algorithm called U2TURN that can
                 achieve this worst-case throughput bound. U2TURN
                 considers all routing paths with at most 2 turns and
                 distributes the traffic loads uniformly in both X and Y
                 dimensions. Theoretical analysis and simulation results
                 show that U2TURN outperforms existing routing
                 algorithms in worst-case throughput. Moreover, U2TURN
                 achieves good average-throughput at the expense of
                 approximately 1.5x minimal average hop count.},
  acknowledgement = ack-nhfb,
  affiliation =  {Sun, G (Reprint Author), Tsinghua Univ, Beijing,
                 Peoples R China. Sun, Guang, Tsinghua Univ, Beijing,
                 Peoples R China. Chang, Chia-Wei; Lin, Bill, Univ Calif
                 San Diego, San Diego, CA 92103 USA.},
  da =           {2019-06-20},
  doc-delivery-number = {172HT},
  eissn =        {1556-6064},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {Algorithm design and analysis; average-case
                 throughput; Computer architecture; Interconnection
                 architectures; mesh; Mesh networks; network capacity;
                 network-on-chip; Networks-on-Chip (NoC); oblivious
                 routing; odd radix mesh network; odd radix
                 two-dimensional mesh network; On-chip interconnection
                 networks; Parallel algorithms; Routing; routing;
                 Routing; Routing protocols; Throughput; traffic load;
                 U2TURN; Worst-case analysis; worst-case throughput;
                 worst-case throughput bound},
  number-of-cited-references = {10},
  ORCID-numbers = {Lin, Binshan/0000-0002-8481-302X},
  research-areas = {Computer Science},
  researcherid-numbers = {Lin, Binshan/A-9772-2009},
  times-cited =  {1},
  unique-id =    {Sun:2013:NWC},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@Article{Karsli:2013:EDT,
  author =       "I. Burak Karsli and Pedro Reviriego and M. Fatih Balli
                 and O{\u{g}}uz Ergin and J. A. Maestro",
  title =        "Enhanced Duplication: A Technique to Correct Soft
                 Errors in Narrow Values",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Soft errors are transient errors that can alter the
                 logic value of a register bit causing data corruption.
                 They can be caused by radiation particles such as
                 neutrons or alpha particles. Narrow values are commonly
                 found in the data consumed or produced by processors.
                 Several techniques have recently been proposed to
                 exploit the unused bits in narrow values to protect
                 them against soft errors. These techniques replicate
                 the narrow value over the unused register bits such
                 that errors can be detected when the value is
                 duplicated and corrected when the value is tripled. In
                 this letter, a technique that can correct errors when
                 the narrow value is only duplicated is presented. The
                 proposed approach stores a modified duplicate of the
                 narrow value such that errors on the original value and
                 the duplicate can be distinguished and therefore
                 corrected. The scheme has been implemented at the
                 circuit level to evaluate its speed and also at the
                 architectural level to assess the benefits in
                 correcting soft errors. The results show that the
                 scheme is significantly faster than a parity check and
                 can improve substantially the number of soft errors
                 that are corrected compared to existing techniques.",
  acknowledgement = ack-nhfb,
  affiliation =  "Karsli, IB (Reprint Author), TOBB Univ Econ \&
                 Technol, Ankara, Turkey. Karsli, I. Burak; Balli, M.
                 Fatih; Ergin, O{\u{g}}uz, TOBB Univ Econ \& Technol,
                 Ankara, Turkey. Reviriego, Pedro; Maestro, J. A., Univ
                 Antonio de Nebrija, Madrid, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Science and Education
                 [AYA2009-13300-C03]; Scientific and Technological
                 Research Council of Turkey (TUBITAK) [112E004]",
  funding-text = "This work was supported in part by the Spanish
                 Ministry of Science and Education under Grant
                 AYA2009-13300-C03 and by the Scientific and
                 Technological Research Council of Turkey (TUBITAK)
                 under Grant 112E004. The work is a collaboration in the
                 framework of COST ICT Action 1103 ``Manufacturable and
                 Dependable Multicore Architectures at Nanoscale''.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "alpha particles; architectural level; Benchmark
                 testing; computer architecture; Data Cache; data
                 corruption; Data processing; enhanced duplication;
                 Error correction; Error Correction; Error correction;
                 Error-checking; Logic gates; logic value;
                 microprocessor chips; narrow values; Narrow Values;
                 narrow values; neutrons; Parity check codes;
                 processors; Program processors; radiation hardening
                 (electronics); radiation particles; Redundant design;
                 register bit; Registers; soft errors; Soft Errors; soft
                 errors",
  number-of-cited-references = "11",
  ORCID-numbers = "Sousa, Leonel/0000-0002-8066-221X Ergin,
                 O{\u{g}}uz/0000-0003-2701-3787 Maestro, Juan
                 Antonio/0000-0001-7133-9026 Reviriego,
                 Pedro/0000-0001-6805-6519",
  research-areas = "Computer Science",
  researcherid-numbers = "Sousa, Leonel/B-2749-2009 Ergin,
                 O{\u{g}}uz/E-5717-2010 Maestro, Juan
                 Antonio/L-6091-2014 Reviriego, Pedro/B-8353-2009",
  times-cited =  "2",
  unique-id =    "Karsli:2013:EDT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Lyons:2013:SFF,
  author =       "Michael Lyons and Gu-Yeon Wei and David Brooks",
  title =        "{Shrink-Fit}: A Framework for Flexible Accelerator
                 Sizing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "RTL design complexity discouraged adoption of
                 reconfigurable logic in general purpose systems,
                 impeding opportunities for performance and energy
                 improvements. Recent improvements to HLS compilers
                 simplify RTL design and are easing this barrier. A new
                 challenge will emerge: managing reconfigurable
                 resources between multiple applications with custom
                 hardware designs. In this paper, we propose a method to
                 ``shrink-fit'' accelerators within widely varying fabric
                 budgets. Shrink-fit automatically shrinks existing
                 accelerator designs within small fabric budgets and
                 grows designs to increase performance when larger
                 budgets are available. Our method takes advantage of
                 current accelerator design techniques and introduces a
                 novel architectural approach based on fine-grained
                 virtualization. We evaluate shrink-fit using a
                 synthesized implementation of an IDCT for decoding
                 JPEGs and show the IDCT accelerator can shrink by a
                 factor of 16x with minimal performance and area
                 overheads. Using shrink-fit, application designers can
                 achieve the benefits of hardware acceleration with
                 single RTL designs on FPGAs large and small.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lyons, M (Reprint Author), Harvard Univ, Sch Engn \&
                 Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael; Wei,
                 Gu-Yeon; Brooks, David, Harvard Univ, Sch Engn \& Appl
                 Sci, Cambridge, MA 02138 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accelerators; computational complexity; Computer
                 applications; custom hardware design; Decoding;
                 discrete cosine transforms; fabric budget; field
                 programmable gate arrays; Field programmable gate
                 arrays; fine grained virtualization; flexible
                 accelerator sizing; FPGA; general purpose computers;
                 general purpose system; hardware acceleration;
                 Heterogeneous (hybrid) systems; HLS compiler; IDCT
                 accelerator; inverse transforms; JPEG decoding; program
                 compilers; Program processors; reconfigurable
                 architectural approach; reconfigurable architectures;
                 Reconfigurable hardware; reconfigurable logic;
                 reconfigurable resource management; RTL design
                 complexity; Runtime; shrink fit accelerator;
                 Special-Purpose and Application-Based Systems; temporal
                 logic; virtual machines; virtualisation",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Lyons:2013:SFF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@article{Duong:2013:CAS,
  author =       {Nam Duong and Alexander V. Veidenbaum},
  title =        {Compiler-Assisted, Selective Out-Of-Order Commit},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {12},
  number =       {1},
  pages =        {21--24},
  month =        jan # "\slash " # jun,
  year =         {2013},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/L-CA.2012.8},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {This paper proposes an out-of-order instruction commit
                 mechanism using a novel compiler/architecture
                 interface. The compiler creates instruction ``blocks''
                 guaranteeing some commit conditions and the processor
                 uses the block information to commit certain
                 instructions out of order. Micro-architectural support
                 for the new commit mode is made on top of the standard,
                 ROB-based processor and includes out-of-order
                 instruction commit with register and load queue entry
                 release. The commit mode may be switched multiple times
                 during execution. Initial results for a 4-wide
                 processor show that, on average, 52\% instructions are
                 committed out of order resulting in 10\% to 26\%
                 speedups over in-order commit, with minimal hardware
                 overhead. The performance improvement is a result of an
                 effectively larger instruction window that allows more
                 cache misses to be overlapped for both L1 and L2
                 caches.},
  acknowledgement = ack-nhfb,
  affiliation =  {Duong, N (Reprint Author), Univ Calif Irvine, Dept
                 Comp Sci, Irvine, CA 92717 USA. Duong, Nam; Veidenbaum,
                 Alexander V., Univ Calif Irvine, Dept Comp Sci, Irvine,
                 CA 92717 USA.},
  author-email = {nlduong@ics.uci.edu alexv@ics.uci.edu},
  da =           {2019-06-20},
  doc-delivery-number = {172HT},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {architecture/compiler co-design; Benchmark testing;
                 block information; cache misses; cache storage; Cache
                 storage; cache storage; Cache storage; commit
                 conditions; compiler-architecture interface;
                 compiler-assisted selective out-of-order commit;
                 computer architecture; Computer architecture; computer
                 architecture; dynamically-scheduled and
                 statically-scheduled implementation; Hardware/software
                 interfaces; instruction blocks; instruction sets; L1
                 cache; L2 cache; load queue entry release;
                 microarchitectural support; minimal hardware overhead;
                 Out of order instruction; Out-of-order commit;
                 out-of-order instruction commit mechanism; overlapping
                 cache misses; performance evaluation; performance
                 improvement; Pipeline implementation; Pipeline
                 processors; program compilers; Program processors;
                 register; resource release; RISC/CISC; ROB-based
                 processor; Superscalar; VLIW architectures; Von Neumann
                 architectures},
  number-of-cited-references = {9},
  research-areas = {Computer Science},
  times-cited =  {2},
  unique-id =    {Duong:2013:CAS},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@Article{Nilakantan:2013:MES,
  author =       "Siddharth Nilakantan and Steven Battle and Mark
                 Hempstead",
  title =        "Metrics for Early-Stage Modeling of Many-Accelerator
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The term ``Dark Silicon'' has been coined to describe
                 the threat to microprocessor performance caused by
                 increasing transistor power density. Improving energy
                 efficiency is now the primary design goal for all
                 market segments of microprocessors from mobile to
                 server. Specialized hardware accelerators, designed to
                 run only a subset of workloads with orders of magnitude
                 energy efficiency improvement, are seen as a potential
                 solution. Selecting an ensemble of accelerators to best
                 cover the workloads run on a platform remains a
                 challenge. We propose metrics for accelerator selection
                 derived from a detailed communication-aware performance
                 model and present an automated methodology to populate
                 this model. Employing a combination of characterized
                 RTL and our selection metrics, we evaluate a set of
                 accelerators for a sample application and compare
                 performance to selections based on execution time and
                 Pollack's rule. We find that the architecture selected
                 by our communication-aware metric shows improved
                 performance over architectures selected based on
                 execution time and Pollack's rule, as they do not
                 account for speedup being limited by communication.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nilakantan, S (Reprint Author), Drexel Univ, Dept
                 Elect \& Comp Engn, Philadelphia, PA 19104 USA.
                 Nilakantan, Siddharth; Battle, Steven; Hempstead, Mark,
                 Drexel Univ, Dept Elect \& Comp Engn, Philadelphia, PA
                 19104 USA.",
  author-email = "sn446@drexel.edu sjb328@drexel.edu mdh77@drexel.edu",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accelerators; Code Profiling; communication-aware
                 performance model; Computer architecture; computer
                 architecture; Computer Systems Organization; dark
                 silicon; General; hardware accelerators; Heterogeneous
                 (hybrid) systems; Heterogeneous Architectures;
                 magnitude energy efficiency improvement;
                 many-accelerator architectures; microprocessor;
                 microprocessor chips; Modeling; Modeling of computer
                 architecture; modelling; Multiprocessing systems; Other
                 Architecture Styles; performance evaluation; Pollack
                 rule; Processor Architectures; Program processors; RTL;
                 transistor power density; transistors",
  number-of-cited-references = "16",
  ORCID-numbers = "Nilakantan, Siddharth/0000-0003-1067-700X",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Nilakantan:2013:MES",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Delimitrou:2013:NCD,
  author =       "Christina Delimitrou and Christos Kozyrakis",
  title =        "The {Netflix} Challenge: Datacenter Edition",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.10",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The hundreds of thousands of servers in modern
                 warehouse scale systems make performance and efficiency
                 optimizations pressing design challenges. These systems
                 are traditionally considered homogeneous. However, that
                 is not typically the case. Multiple server generations
                 compose a heterogeneous environment, whose performance
                 opportunities have not been fully explored since
                 techniques that account for platform heterogeneity
                 typically do not scale to the tens of thousands of
                 applications hosted in large-scale cloud providers. We
                 present ADSM, a scalable and efficient recommendation
                 system for application-to-server mapping in large-scale
                 datacenters (DCs) that is QoS-aware. ADSM overcomes the
                 drawbacks of previous techniques, by leveraging robust
                 and computationally efficient analytical methods to
                 scale to tens of thousands of applications with minimal
                 overheads. It is also QoS-aware, mapping applications
                 to platforms while enforcing strict QoS guarantees.
                 ADSM is derived from validated analytical models, has
                 low and bounded prediction errors, is simple to
                 implement and scales to thousands of applications
                 without significant changes to the system. Over 390
                 real DC workloads, ADSM improves performance by 16\% on
                 average and up to 2.5x and efficiency by 22\% in a DC
                 with 10 different server configurations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Delimitrou, C (Reprint Author), Stanford Univ,
                 Stanford, CA 94305 USA. Delimitrou, Christina;
                 Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
                 USA.",
  author-email = "cdel@stanford.edu kozyraki@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "172HT",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ADSM; application mapping; Application studies
                 resulting in better multiple-processor systems;
                 application-to-server mapping; Computer architecture;
                 computer centres; Computer System Implementation;
                 Computer Systems Organization; Data centers;
                 datacenter; design challenge; Design studies;
                 evaluation; Heterogeneous (hybrid) systems; Large and
                 Medium ( Mainframe ) Computers; Large-scale systems;
                 Measurement; modeling; Multiprocessing systems; Netflix
                 challenge; Other Architecture Styles; Parallel
                 Architectures; Performance of Systems; Processor
                 Architectures; QoS-aware; quality of service;
                 Scheduling; Scheduling and task partitioning; server
                 generation; simulation of multiple-processor systems;
                 Special-Purpose and Application-Based Systems; Super
                 (very large) computers; warehouse-scale system",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Delimitrou:2013:NCD",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2013:RL,
  author =       "Anonymous",
  title =        "2012 reviewers list",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "33--34",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.11",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The publication offers a note of thanks and lists its
                 reviewers.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "IEEE publishing",
}

@Article{Anonymous:2013:IOAa,
  author =       "Anonymous",
  title =        "{IEEE} Open Access Publishing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "35--35",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.13",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement: This publication offers open access
                 options for authors. IEEE open access publishing.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2013:ITN,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Transactions}}} Newsletter",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "1",
  pages =        "36--36",
  month =        jan # "\slash " # jun,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.14",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Advertisement: Stay connected with the IEEE Computer
                 Society Transactions by signing up for our new
                 Transactions Connection newsletter. It is free and
                 contains valuable information.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Martinez:2013:E,
  author =       "J. F. Martinez",
  title =        "Editorial",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "37--38",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.32",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Jian:2013:HPE,
  author =       "Xun Jian and John Sartori and Henry Duwe and Rakesh
                 Kumar",
  title =        "High Performance, Energy Efficient Chipkill Correct
                 Memory with Multidimensional Parity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "39--42",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "It is well-known that a significant fraction of server
                 power is consumed in memory; this is especially the
                 case for servers with chipkill correct memories. We
                 propose a new chipkill correct memory organization that
                 decouples correction of errors due to local faults that
                 affect a single symbol in a word from correction of
                 errors due to device-level faults that affect an entire
                 column, sub-bank, or device. By using a combination of
                 two codes that separately target these two fault modes,
                 the proposed chipkill correct organization reduces code
                 overhead by half as compared to conventional chipkill
                 correct memories for the same rank size. Alternatively,
                 this allows the rank size to be reduced by half while
                 maintaining roughly the same total code overhead.
                 Simulations using PARSEC and SPEC benchmarks show that,
                 compared to a conventional double chipkill correct
                 baseline, the proposed memory organization, by
                 providing double chipkill correct at half the rank
                 size, reduces power by up to 41\%, 32\% on average over
                 a conventional baseline with the same chipkill correct
                 strength and access granularity that relies on linear
                 block codes alone, at only 1\% additional code
                 overhead.",
  acknowledgement = ack-nhfb,
  affiliation =  "Jian, X (Reprint Author), Univ Illinois, Urbana, IL
                 USA. Jian, Xun; Sartori, John; Duwe, Henry; Kumar,
                 Rakesh, Univ Illinois, Urbana, IL USA.",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "block codes; chipkill correct; chipkill correct memory
                 organization; code overhead reduction; Computer
                 architecture; device level fault; DRAM; DRAM chips;
                 error correction; error correction codes; fault mode;
                 fault tolerant computing; granular computing;
                 granularity access; linear block code; linear codes;
                 low power; Low power electronics; PARSEC; Random access
                 memory; rank size; reliable memory; server power
                 consumption; Servers; SPEC; storage management",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Jian:2013:HPE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Maddah:2013:DDS,
  author =       "Rakan Maddah and Sangyeun Cho and Rami Melhem",
  title =        "Data Dependent Sparing to Manage Better-Than-Bad
                 Blocks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "43--46",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We forecast that proper handling of unreliable storage
                 blocks (e.g., ``bad block management'' in solid-state
                 drives) will remain critical for future systems built
                 with advanced and emerging memory technologies. This
                 paper argues that the conventional block retirement and
                 sparing approach --- a block is retired as soon as it
                 shows faulty behavior --- is overly conservative and
                 inefficient. We observe that it is highly unlikely that
                 all faulty bits in a storage block manifest errors.
                 Consequently, we propose data dependent sparing, a
                 relaxed block retirement and sparing approach that
                 recycles faulty storage blocks. At small management
                 cost and with less than 1\% sparing, data dependent
                 sparing achieves the same lifetime as the conventional
                 approach with 20\% sparing.",
  acknowledgement = ack-nhfb,
  affiliation =  "Maddah, R (Reprint Author), Univ Pittsburgh, Dept Comp
                 Sci, Pittsburgh, PA 15260 USA. Maddah, Rakan; Cho,
                 Sangyeun; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci,
                 Pittsburgh, PA 15260 USA.",
  author-email = "rmaddah@cs.pitt.edu cho@cs.pitt.edu
                 melhem@cs.pitt.edu",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CCF-1064976, CCF-1059283,
                 CNS-1012070]",
  funding-text = "This work is supported in part by NSF grants
                 CCF-1064976, CCF-1059283, and CNS-1012070.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "better-than-bad block management; data dependent
                 sparing; data dependent sparing approach; Data storage
                 systems; fault tolerant computing; faulty bits; faulty
                 storage blocks; flash memory; Flash memory; flash
                 memory; management cost; memory technologies; phase
                 change memories; phase-change memory; phase-change
                 memory (PCM); relaxed block retirement approach;
                 solid-state drive; solid-state drive (SSD); Solid-state
                 drives; solid-state drives; Sparing; sparing; storage
                 block; storage management; stuck-at faults; unreliable
                 storage block handling",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Maddah:2013:DDS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2013:CFC,
  author =       "Hanjoon Kim and Yonggon Kim and John Kim",
  title =        "Clumsy Flow Control for High-Throughput Bufferless
                 On-Chip Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "47--50",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Bufferless on-chip networks are an alternative type of
                 on-chip network organization that can improve the
                 cost-efficiency of an on-chip network by removing
                 router input buffers. However, bufferless on-chip
                 network performance degrades at high load because of
                 the increased network contention and large number of
                 deflected packets. The energy benefit of bufferless
                 network is also reduced because of the increased
                 deflection. In this work, we propose a novel flow
                 control for bufferless on-chip networks in
                 high-throughput manycore accelerator architectures to
                 reduce the impact of deflection routing. By using a
                 clumsy flow control (CFC), instead of the per-hop flow
                 control that is commonly used in buffered on-chip
                 networks, we are able to reduce the amount of
                 deflection by up to 92\% on high-throughput workloads.
                 As a result, on average, CFC can approximately match
                 the performance of a baseline buffered router while
                 reducing the energy consumption by approximately
                 39\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, H (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon, South Korea. Kim,
                 Hanjoon; Kim, Yonggon; Kim, John, Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon, South Korea.",
  author-email = "hanj@kaist.ac.kr ilios@kaist.ac.kr jjk12@kaist.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "MKE, Korea, under the ITRC
                 [NIPA-2012-H0301-12-1011]; BST program through the NRF
                 of Korea; MEST [2012-0003579]",
  funding-text = "This research was supported in part by the MKE, Korea,
                 under the ITRC support program supervised by the NIPA
                 (NIPA-2012-H0301-12-1011) and in part by BST program
                 through the NRF of Korea funded by the
                 MEST(2012-0003579).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bufferless NoC; bufferless router; CFC; clumsy flow
                 control; computer architecture; Computer architecture;
                 Computer Systems Organization; cost-efficiency
                 improvement; Data processing; deflection routing;
                 deflection routing impact reduction; energy benefit;
                 energy consumption reduction; flow control;
                 high-throughput bufferless on-chip networks;
                 high-throughput manycore accelerator architectures;
                 high-throughput workloads; Interconnection
                 architectures; microprocessor chips; Multiple Data
                 Stream Architectures (Multiprocessors); Multiprocessing
                 systems; network contention; network routing;
                 network-on-chip; On-chip interconnection networks;
                 on-chip network organization; on-chip networks;
                 Parallel architectures; Parallel Architectures;
                 performance evaluation; Processor Architectures; router
                 input buffer removal; System-on-chip",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  researcherid-numbers = "Kim, John/C-1792-2011",
  times-cited =  "7",
  unique-id =    "Kim:2013:CFC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kai:2013:GRP,
  author =       "Yi Kai and Yi Wang and Bin Liu",
  title =        "{GreenRouter}: Reducing Power by Innovating Router's
                 Architecture",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "51--54",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "High speed routers in Internet are becoming more
                 powerful, as well as more energy hungry. In this paper,
                 we present a new architecture of router named
                 GreenRouter which separates a line-card into two parts:
                 network interface card (DB) and packet processing card
                 (MB), connected by a two-stage switch fabric in traffic
                 flows' ingress and egress direction respectively.
                 Traffic from all DBs shares all the MBs in GreenRouter,
                 thus can be aggregated to a few active MBs on demand
                 and other MBs can be shut down to save power. Several
                 key issues to this new architecture are addressed. We
                 evaluate the power saving efficiency and give
                 preliminary simulation results. GreenRouter can well
                 adapt the traffic fluctuation and real trace
                 evaluations over one week shows that up to 63.7\% power
                 saving can be achieved while QoS constraints are
                 guaranteed.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, B (Reprint Author), Tsinghua Univ, Dept Comp Sci
                 \& Technol, Beijing 100084, Peoples R China. Kai, Yi;
                 Wang, Yi; Liu, Bin, Tsinghua Univ, Dept Comp Sci \&
                 Technol, Beijing 100084, Peoples R China.",
  author-email = "kaiyi02@gmail.com pig020623@gmail.com
                 lmyujie@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSFC [61073171]; Tsinghua University
                 Initiative Scientific Research Program [20121080068];
                 Specialized Research Fund for the Doctoral Program of
                 Higher Education of China [20100002110051]",
  funding-text = "This work is supported by NSFC (61073171), Tsinghua
                 University Initiative Scientific Research Program
                 (20121080068), Specialized Research Fund for the
                 Doctoral Program of Higher Education of China
                 (20100002110051).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; DB; Energy efficiency;
                 energy-aware system; green computing; Green design;
                 GreenRouter; High-speed networks; Internet; line-card;
                 low power design; MB; network interface card; packet
                 processing card; power reduction; power saving
                 efficiency; QoS constraints; router; router
                 architecture innovation; Routers; telecommunication
                 network routing; Telecommunication traffic;
                 telecommunication traffic; traffic flow egress
                 direction; traffic flow ingress direction; traffic
                 fluctuation; two-stage switch fabric",
  number-of-cited-references = "6",
  ORCID-numbers = "Wang, Yi/0000-0002-9095-6879",
  research-areas = "Computer Science",
  researcherid-numbers = "Wang, Yi/A-8884-2015",
  times-cited =  "1",
  unique-id =    "Kai:2013:GRP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Joo:2013:HPS,
  author =       "Yongsoo Joo and Sangsoo Park",
  title =        "A Hybrid {PRAM} and {STT--RAM} Cache Architecture for
                 Extending the Lifetime of {PRAM} Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "55--58",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "To extend the lifetime of phase change RAM (PRAM)
                 caches, we propose a hybrid cache architecture that
                 integrates a relatively small capacity of spin transfer
                 torque RAM (STT--RAM) write buffer with a PRAM cache.
                 Our hybrid cache improves the endurance limitation of
                 the PRAM cache by judiciously redirecting the write
                 traffic from an upper memory layer to the STT--RAM
                 write buffer. We have demonstrated through simulation
                 that the proposed hybrid cache outperforms existing
                 write-traffic reduction schemes with the same area
                 overhead. Moreover, our approach is orthogonal to the
                 existing schemes, providing an effective way of
                 investing die area for cache lifetime extension by
                 being used in combination with them.",
  acknowledgement = ack-nhfb,
  affiliation =  "Joo, Y (Reprint Author), Ewha Womans Univ, Dept Comp
                 Sci \& Engn, Seoul 120750, South Korea. Joo, Yongsoo;
                 Park, Sangsoo, Ewha Womans Univ, Dept Comp Sci \& Engn,
                 Seoul 120750, South Korea.",
  author-email = "ysjoo@ewha.ac.kr sangsoo.park@ewha.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Ewha Womans University",
  funding-text = "We thank Guangyu Sun and Cong Xu for their helpful
                 comments on NVRAM characteristics. This research was
                 supported by RP-Grant 2010 of Ewha Womans University.
                 Sangsoo Park is the corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache; cache lifetime extension; Cache memories; Cache
                 storage; cache storage; Computer architecture;
                 concurrency theory; Design Styles; endurance; Fault
                 tolerance; Hardware; hybrid cache architecture; hybrid
                 PRAM caches; investing die area; lifetime; memory
                 layer; Memory Structures; phase change memories; phase
                 change RAM; PRAM; Random access memory; Redundancy;
                 Redundant design; Reliability; spin transfer torque
                 RAM; STT RAM cache architecture; STT RAM write buffer;
                 STT--RAM; Testing and Fault-Tolerance; write traffic
                 reduction schemes",
  number-of-cited-references = "14",
  research-areas = "Computer Science",
  times-cited =  "10",
  unique-id =    "Joo:2013:HPS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Blem:2013:MMA,
  author =       "Emily Blem and Hadi Esmaeilzadeh and Renee {St Amant}
                 and Karthikeyan Sankaralingam and Doug Burger",
  title =        "Multicore Model from Abstract Single Core Inputs",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "59--62",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper describes a first order multicore model to
                 project a tighter upper bound on performance than
                 previous Amdahl's Law based approaches. The speedup
                 over a known baseline is a function of the core
                 performance, microarchitectural features, application
                 parameters, chip organization, and multicore topology.
                 The model is flexible enough to consider both CPU and
                 GPU like organizations as well as modern topologies
                 from symmetric to aggressive heterogeneous (asymmetric,
                 dynamic, and fused) designs. This extended model
                 incorporates first order effects --- exposing more
                 bottlenecks than previous applications of Amdahl's
                 Law --- while remaining simple and flexible enough to be
                 adapted for many applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Blem, E (Reprint Author), Univ Wisconsin, Madison, WI
                 53706 USA. Blem, Emily; Sankaralingam, Karthikeyan,
                 Univ Wisconsin, Madison, WI 53706 USA. Esmaeilzadeh,
                 Hadi, Univ Washington, Seattle, WA 98195 USA. St Amant,
                 Renee, Univ Texas Austin, Austin, TX 78712 USA.",
  author-email = "blem@cs.wisc.edu hadianeh@cs.washington.edu
                 stamant@cs.utexas.edu karu@cs.wisc.edu
                 dburger@microsoft.com",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "abstract single core inputs; aggressive heterogeneous
                 designs; Amdahl law based approach; application
                 parameters; chip organization; Computer Systems
                 Organization; CPU like organizations; first order
                 multicore model; General; GPU like organizations;
                 graphics processing units; microarchitectural features;
                 Modeling of computer architecture; multicore topology;
                 multicores; Multiple Data Stream Architectures
                 (Multiprocessors); multiprocessing systems; network
                 topology; parallelism; performance evaluation;
                 Performance modeling; Processor Architectures",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Blem:2013:MMA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Single-author letter (Pierre Michaud, INRIA Rennes) analyzing the
%%% consistency and physical meaning of multicore throughput metrics
%%% (weighted speedup, harmonic mean of speedups/IPCs).
%%% IEEE Computer Architecture Letters 12(2), July--December 2013,
%%% pp. 63--66; DOI 10.1109/L-CA.2012.25.
@Article{Michaud:2013:DMT,
  author =       "Pierre Michaud",
  title =        "Demystifying Multicore Throughput Metrics",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "63--66",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Several different metrics have been proposed for
                 quantifying the throughput of multicore processors.
                 There is no clear consensus about which metric should
                 be used. Some studies even use several throughput
                 metrics. We show that there exists a relation between
                 single-thread average performance metrics and
                 throughput metrics, and that throughput metrics inherit
                 the meaning or lack of meaning of the corresponding
                 single-thread metric. We show that two popular
                 throughput metrics, the weighted speedup and the
                 harmonic mean of speedups, are inconsistent: they do
                 not give equal importance to all benchmarks. Moreover
                 we demonstrate that the weighted speedup favors
                 unfairness. We show that the harmonic mean of IPCs, a
                 seldom used throughput metric, is actually consistent
                 and has a physical meaning. We explain under which
                 conditions the arithmetic mean or the harmonic mean of
                 IPCs can be used as a strong indicator of throughput
                 increase.",
  acknowledgement = ack-nhfb,
  affiliation =  "Michaud, P (Reprint Author), INRIA Rennes, Rennes,
                 France. INRIA Rennes, Rennes, France.",
  author-email = "Pierre.Michaud@inria.fr",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; Computer Systems Organization;
                 evaluation; Measurement; Modeling; modeling;
                 Multi-core/single-chip multiprocessors; Multicore
                 processing; multicore processors; multicore throughput;
                 multicore throughput metrics; multiprocessing systems;
                 Parallel Architectures; Parallel architectures;
                 Performance evaluation; performance metric; Performance
                 of Systems; Processor Architectures; Program
                 processors; simulation of multiple-processor systems;
                 single thread metric; software metrics",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Michaud:2013:DMT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter on software mechanisms that vary per-core SMT levels
%%% (thread counts) to shift power under a power cap on a real SMT
%%% multicore platform.  IEEE Computer Architecture Letters 12(2),
%%% July--December 2013, pp. 67--70; DOI 10.1109/L-CA.2012.26.
@Article{Tembey:2013:SSS,
  author =       "Priyanka Tembey and Augusto Vega and Alper
                 Buyuktosunoglu and Dilma {Da Silva} and Pradip Bose",
  title =        "{SMT} Switch: Software Mechanisms for Power Shifting",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "67--70",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Simultaneous multithreading (SMT) as a processor
                 design to achieve higher levels of system and
                 application throughput is a well-accepted and deployed
                 technique in most desktop and server processors. We
                 study the power implications of varying SMT levels
                 i.e., thread counts per core for various multi-threaded
                 applications on a real SMT multicore platform, and
                 introduce a novel software mechanism of changing SMT
                 level of a core to tune platform power. Power-shifting
                 policies by varying per core SMT levels for performance
                 benefits within a power cap are introduced. Projected
                 power savings (of 15\%) for a streaming parallel
                 benchmark can be attained using SMT-level power
                 shifting mechanisms.",
  acknowledgement = ack-nhfb,
  affiliation =  "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA
                 30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA
                 30332 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "279CD",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application throughput; Computer architecture;
                 Computer Systems Organization; Hardware;
                 multi-threading; Multicore platforms; multiprocessing
                 systems; Multithreaded processors; Multithreading;
                 Operating Systems; Other Architecture Styles; Parallel
                 processing; power aware computing; Power Management;
                 Power shifting; Power system management; Process
                 Management; Processor Architectures; processor design;
                 Program processors; Scheduling; simultaneous
                 multithreading; SMT; SMT multicore platform; SMT
                 switch; SMT-level power shifting mechanism; Software
                 engineering; software mechanisms; Software/Software
                 Engineering; streaming parallel benchmark; tune
                 platform power",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Tembey:2013:SSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Front-matter item: IEEE Open Access Publishing notice.
%%% IEEE Computer Architecture Letters 12(2), July--December 2013,
%%% p. 71; DOI 10.1109/L-CA.2013.33.
@Article{Anonymous:2013:IOAb,
  author =       "Anonymous",
  title =        "{IEEE} Open Access Publishing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "71--71",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.33",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Front-matter item: IEEE Computer Society membership notice.
%%% IEEE Computer Architecture Letters 12(2), July--December 2013,
%%% p. 72; DOI 10.1109/L-CA.2013.34.
@Article{Anonymous:2013:SCI,
  author =       "Anonymous",
  title =        "Stay Connected to the {IEEE Computer Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "72--72",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.34",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover item: back cover (page c4) of IEEE Computer Architecture
%%% Letters 12(2), July--December 2013; DOI 10.1109/L-CA.2013.31.
@Article{Anonymous:2013:BC,
  author =       "Anonymous",
  title =        "[Back cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c4--c4",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.31",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover item: back inside cover (page c3) of IEEE Computer
%%% Architecture Letters 12(2), July--December 2013;
%%% DOI 10.1109/L-CA.2013.30.
@Article{Anonymous:2013:BIC,
  author =       "Anonymous",
  title =        "[Back inside cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c3--c3",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.30",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover item: front cover (page c1) of IEEE Computer Architecture
%%% Letters 12(2), July--December 2013; DOI 10.1109/L-CA.2013.28.
@Article{Anonymous:2013:FC,
  author =       "Anonymous",
  title =        "[Front cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c1--c1",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.28",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Cover item: front inside cover (page c2) of IEEE Computer
%%% Architecture Letters 12(2), July--December 2013;
%%% DOI 10.1109/L-CA.2013.29.
@Article{Anonymous:2013:FIC,
  author =       "Anonymous",
  title =        "[Front inside cover]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "12",
  number =       "2",
  pages =        "c2--c2",
  month =        jul # "\slash " # dec,
  year =         "2013",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.29",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Letter (Chalmers) proposing value-aware caches that eliminate
%%% value replication via Huffman-based compression with a pipelined
%%% canonical-codeword decoder.  IEEE Computer Architecture Letters
%%% 13(1), January--June 2014, pp. 1--4; DOI 10.1109/L-CA.2012.31.
@Article{Arelakis:2014:CVA,
  author =       "Angelos Arelakis and Per Stenstr{\"o}m",
  title =        "A Case for a Value-Aware Cache",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.31",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Replication of values causes poor utilization of
                 on-chip cache memory resources. This paper addresses
                 the question: How much cache resources can be
                 theoretically and practically saved if value
                 replication is eliminated? We introduce the concept of
                 value-aware caches and show that a sixteen times
                 smaller value-aware cache can yield the same miss rate
                 as a conventional cache. We then make a case for a
                 value-aware cache design using Huffman-based
                 compression. Since the value set is rather stable
                 across the execution of an application, one can afford
                 to reconstruct the coding tree in software. The
                 decompression latency is kept short by our proposed
                 novel pipelined Huffman decoder that uses canonical
                 codewords. While the (loose) upper-bound compression
                 factor is 5.2X, we show that, by eliminating
                 cache-block alignment restrictions, it is possible to
                 achieve a compression factor of 3.4X for practical
                 designs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Arelakis, A (Reprint Author), Chalmers, Gothenburg,
                 Sweden. Arelakis, Angelos; Stenstrom, Per, Chalmers,
                 Gothenburg, Sweden.",
  author-email = "angelos@chalmers.se per.stenstrom@chalmers.se",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Swedish Research Council",
  funding-text = "This research is supported by the Swedish Research
                 Council. The simulations ran on the resources provided
                 by the Swedish National Infrastructure for Computing
                 (SNIC) at C3SE.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.b Cache memories; cache storage;
                 cache-block alignment restriction elimination; Clocks;
                 coding tree reconstruction; data compression; data
                 handling; Decoding; decompression latency; E Data; E.4
                 Coding and Information Theory; E.4.a Data compaction
                 and compression; Engines; Huffman codes; Huffman
                 coding; Huffman-based compression; Indexes; on-chip
                 cache memory resources; System-on-a-chip; tree codes;
                 value replication; value-aware cache design",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Arelakis:2014:CVA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (Xidian Univ.) proposing DMesh, a diagonal mesh topology
%%% for optical networks-on-chip that reduces waveguide crossings and
%%% router count.  IEEE Computer Architecture Letters 13(1),
%%% January--June 2014, pp. 5--8; DOI 10.1109/L-CA.2013.5.
%%% Fix: removed exact same-case duplicates from the keywords field
%%% ("Topology" and "wavelength division multiplexing" each appeared
%%% twice verbatim); case-variant pairs are retained per file
%%% convention for merged keyword vocabularies.
@Article{Chen:2014:PEC,
  author =       "Zheng Chen and Huaxi Gu and Yintang Yang and Luying
                 Bai and Hui Li",
  title =        "A Power Efficient and Compact Optical Interconnect for
                 Network-on-Chip",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.5",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Optical interconnect is a promising alternative to
                 substitute the electrical interconnect for intra-chip
                 communications. The topology of optical Network-on-Chip
                 (ONoC) has a great impact on the network performance.
                 However, the size of ONoC is limited by the power
                 consumption and crosstalk noise, which are mainly
                 resulted from the waveguide crossings in the topology.
                 In this paper, a diagonal Mesh topology (DMesh) is
                 proposed to relieve the limitation of scalability by
                 reducing the number of waveguide crossing, which is
                 only 20\% that of Mesh. In addition, the number of
                 optical routers in DMesh is less than half of that in
                 Mesh-based ONoC. Due to its compact architecture and
                 favorable scalability, DMesh topology is suitable for
                 large-scale ONoC design.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, Z (Reprint Author), Xidian Univ Xian, State Key
                 Lab Integrated Serv Networks, Xian, Peoples R China.
                 Chen, Zheng; Gu, Huaxi; Bai, Luying; Li, Hui, Xidian
                 Univ Xian, State Key Lab Integrated Serv Networks,
                 Xian, Peoples R China. Yang, Yintang, Xidian Univ Xian,
                 Inst Microelect, Xian, Peoples R China.",
  author-email = "chenzheng8331@stu.xidian.edu.cn hxgu@xidian.edu.cn
                 ytyang@xidian.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation of China
                 [61070046, 60803038]; State Key Lab [ISN1104001];
                 Fundamental Research Funds for the Central Universities
                 [K5051301003]; 111 Project [B08038]",
  funding-text = "This work is supported by the National Science
                 Foundation of China Grant No. 61070046 and 60803038,
                 the special fund from State Key Lab Grant No.
                 ISN1104001, the Fundamental Research Funds for the
                 Central Universities Grant No. K5051301003, the 111
                 Project Grant No. B08038.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "compact optical interconnect; crosstalk noise;
                 diagonal mesh topology; DMesh topology; integrated
                 optoelectronics; intra-chip communications; large-scale
                 ONoC design; mesh-based ONoC; multiprocessors; network
                 performance; Network topology; network-on-chip; optical
                 interconnections; Optical interconnects; optical
                 network-on-chip; optical router; Optical routers;
                 optical routers; power consumption; power efficient
                 interconnect; Topology; topology; waveguide crossings;
                 wavelength division multiplexing; Wavelength division
                 multiplexing",
  number-of-cited-references = "9",
  ORCID-numbers = "Gu, Huaxi/0000-0002-6409-2229",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Chen:2014:PEC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (Columbia/Cadence/Politecnico di Torino) on reusing on-die
%%% accelerator memory blocks as NUCA cache slices to offset
%%% accelerator area cost in the dark-silicon regime.  IEEE Computer
%%% Architecture Letters 13(1), January--June 2014, pp. 9--12;
%%% DOI 10.1109/L-CA.2012.29.
@Article{Cota:2014:AMR,
  author =       "Emilio G. Cota and Paolo Mantovani and Michele
                 Petracca and Mario R. Casu and Luca P. Carloni",
  title =        "Accelerator Memory Reuse in the Dark Silicon Era",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.29",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Accelerators integrated on-die with General-Purpose
                 CPUs (GP-CPUs) can yield significant performance and
                 power improvements. Their extensive use, however, is
                 ultimately limited by their area overhead; due to their
                 high degree of specialization, the opportunity cost of
                 investing die real estate on accelerators can become
                 prohibitive, especially for general-purpose
                 architectures. In this paper we present a novel
                 technique aimed at mitigating this opportunity cost by
                 allowing GP-CPU cores to reuse accelerator memory as a
                 non-uniform cache architecture (NUCA) substrate. On a
                 system with a last level-2 cache of 128kB, our
                 technique achieves on average a 25\% performance
                 improvement when reusing four 512 kB accelerator memory
                 blocks to form a level-3 cache. Making these blocks
                 reusable as NUCA slices incurs on average in a 1.89\%
                 area overhead with respect to equally-sized ad hoc
                 cache slices.",
  acknowledgement = ack-nhfb,
  affiliation =  "Cota, EG (Reprint Author), Columbia Univ, New York, NY
                 10027 USA. Cota, Emilio G.; Mantovani, Paolo; Carloni,
                 Luca P., Columbia Univ, New York, NY 10027 USA.
                 Petracca, Michele, Cadence Design Syst Inc, San Jose,
                 CA USA. Casu, Mario R., Politecn Torino, Turin,
                 Italy.",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [1018236,
                 1219001]; ONR Young Investigator Award; Gigascale
                 Systems Research Center; Focus Center Research Program
                 (FCRP), a Semiconductor Research Corporation entity",
  funding-text = "This research is partially supported by the National
                 Science Foundation under Awards \#: 1018236 and
                 1219001, an ONR Young Investigator Award, and the
                 Gigascale Systems Research Center, one of six research
                 centers funded under the Focus Center Research Program
                 (FCRP), a Semiconductor Research Corporation entity.
                 The authors thank John Demme and the anonymous
                 reviewers for their insightful comments.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; accelerator architectures; Accelerator
                 architectures; accelerator architectures; accelerator
                 memory reuse; cache formation; Cache memory; cache
                 slice; cache storage; dark silicon era; general purpose
                 CPU; general-purpose architecture; GP-CPU; Memory
                 management; nonuniform cache architecture; NUCA
                 substrate; Power demand; Silicon; Transform coding",
  keywords-plus = "CACHES",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Cota:2014:AMR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (UC Irvine/Microsoft/Yonsei) on a DVFS scheduling scheme
%%% applied at the base-case-reached point of irregular parallel
%%% divide-and-conquer algorithms, evaluated on the Intel Single-chip
%%% Cloud Computer (SCC).  IEEE Computer Architecture Letters 13(1),
%%% January--June 2014, pp. 13--16; DOI 10.1109/L-CA.2013.1.
@Article{Chou:2014:EPE,
  author =       "Yu-Liang Chou and Shaoshan Liu and Eui-Young Chung and
                 Jean-Luc Gaudiot",
  title =        "An Energy and Performance Efficient {DVFS} Scheme for
                 Irregular Parallel Divide-and-Conquer Algorithms on the
                 {Intel SCC}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.1",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The divide-and-conquer paradigm can be used to express
                 many computationally significant problems, but an
                 important subset of these applications is inherently
                 load-imbalanced. Load balancing is a challenge for
                 irregular parallel divide-and-conquer algorithms and
                 efficiently solving these applications will be a key
                 requirement for future many-core systems. To address
                 the load imbalance issue, instead of attempting to
                 dynamically balancing the workloads, this paper
                 proposes an energy and performance efficient Dynamic
                 Voltage and Frequency Scaling (DVFS) scheduling scheme,
                 which takes into account the load imbalance behavior
                 exhibited by these applications. More specifically, we
                 examine the core of the divide-and-conquer paradigm and
                 determine that the base-case-reached point where
                 recursion stops is a suitable place in a
                 divide-and-conquer paradigm to apply the proposed DVFS
                 scheme. To evaluate the proposed scheme, we implement
                 four representative irregular parallel
                 divide-and-conquer algorithms, tree traversal,
                 quicksort, finding primes, and n-queens puzzle, on the
                 Intel Single-chip Cloud Computer (SCC) many-core
                 machine. We demonstrate that, on average, the proposed
                 scheme can improve performance by 41\% while reducing
                 energy consumption by 36\% compared to the baseline
                 running the whole computation with the default
                 frequency configuration (400MHz).",
  acknowledgement = ack-nhfb,
  affiliation =  "Chou, YL (Reprint Author), Univ Calif Irvine, Irvine,
                 CA 92697 USA. Chou, Yu-Liang; Gaudiot, Jean-Luc, Univ
                 Calif Irvine, Irvine, CA 92697 USA. Liu, Shaoshan,
                 Microsoft Corp, Redmond, WA 98052 USA. Chung,
                 Eui-Young, Yonsei Univ, Seoul 120749, South Korea.",
  author-email = "d943010010@gmail.com shaoliu@microsoft.com
                 eychung@yonsei.ac.kr gaudiot@uci.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation
                 [CCF-1065448]; National Research Foundation of Korea
                 (NRF) [2012S1A2A1A01031420]; Ministry of Education,
                 Science and Technology [2012-047670]; National Science
                 Council [NSC 101-2917-I-564-079]",
  funding-text = "This work is partly supported by the US National
                 Science Foundation under Grant No. CCF-1065448, by the
                 National Research Foundation of Korea (NRF) under Grant
                 No. 2012S1A2A1A01031420, by the Ministry of Education,
                 Science and Technology under Grant No. 2012-047670, and
                 by the National Science Council under Grant No. NSC
                 101-2917-I-564-079. Any opinions, findings, and
                 conclusions expressed in this material are those of the
                 authors and do not necessarily reflect the views of
                 these sponsors.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "base-case-reached point; D Software/Software
                 Engineering; D.4 Operating Systems; D.4 Operating
                 Systems < D.4.7 Organization and Design; D.4.7.b
                 Distributed systems; D.4.7.f Parallel systems; D.4.8
                 Performance < D.4.8.a Measurements < Distributed
                 processing; divide and conquer methods;
                 Divide-and-conquer; DVFS; dynamic voltage and frequency
                 scaling; energy conservation; energy consumption
                 reduction; energy efficient DVFS scheme; finding
                 primes; frequency 400 MHz; Intel SCC; Intel single-chip
                 cloud computer; irregular parallel divide-and-conquer
                 algorithms; Load Imbalance; load imbalance behavior;
                 many-core machine; microprocessor chips;
                 multiprocessing systems; n-queens puzzle; Operating
                 systems; parallel algorithms; Parallel processing;
                 performance efficient DVFS scheme; Performance
                 evaluation; power aware computing; processor
                 scheduling; quicksort; recursion stops; resource
                 allocation; Software engineering; tree traversal",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Chou:2014:EPE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Letter (Univ. Haifa) enhancing IF-conversion with a block
%%% unification method (vs. block flattening) for SIMD/GPU code
%%% generation.  IEEE Computer Architecture Letters 13(1),
%%% January--June 2014, pp. 17--20; DOI 10.1109/L-CA.2012.28.
@Article{Rotem:2014:BUI,
  author =       "Nadav Rotem and Yosi {Ben Asher}",
  title =        "Block Unification {IF}-conversion for High Performance
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.28",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Graphics Processing Units accelerate data-parallel
                 graphic calculations using wide SIMD vector units.
                 Compiling programs to use the GPU's SIMD architectures
                 require converting multiple control flow paths into a
                 single stream of instructions. IF-conversion is a
                 compiler transformation, which converts control
                 dependencies into data dependencies, and it is used by
                 vectorizing compilers to eliminate control flow and
                 enable efficient code generation. In this work we
                 enhance the IF-conversion transformation by using a
                 block unification method to improve the currently used
                 block flattening method. Our experimental results
                 demonstrate that our IF-conversion method is effective
                 in reducing the number of predicated instructions and
                 in boosting kernel execution speed.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rotem, N (Reprint Author), Univ Haifa, Dept Comp Sci,
                 IL-31999 Haifa, Israel. Rotem, Nadav; Ben Asher, Yosi,
                 Univ Haifa, Dept Comp Sci, IL-31999 Haifa, Israel.",
  author-email = "rotemn@cs.haifa.ac.il yosi@cs.haifa.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "block flattening method; block unification
                 IF-conversion; block unification method; code
                 generation; Code generation; compiler transformation;
                 Compilers; Computer architecture; data-parallel graphic
                 calculations; GPU SIMD architectures; Graphics
                 processing unit; graphics processing units; high
                 performance architectures; Kernel; Merging; multiple
                 control flow paths; parallel processing; Processors;
                 program compilers; Programming Languages; Registers;
                 Software/Software Engineering; vectorizing compilers;
                 Vectors; wide SIMD vector units",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Rotem:2014:BUI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Ilic, Pratas, and Sousa on the cache-aware Roofline performance
%%% model.  ``{Roofline}'' is braced in the title so that
%%% sentence-casing bibliography styles keep its capital R, matching
%%% the bracing convention used elsewhere in this file (e.g. {EARtH},
%%% {MultiAmdahl}, {SoC}).
@Article{Ilic:2014:CAR,
  author =       "Aleksandar Ilic and Frederico Pratas and Leonel
                 Sousa",
  title =        "Cache-aware {Roofline} model: Upgrading the loft",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "21--24",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.6",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The Roofline model graphically represents the
                 attainable upper bound performance of a computer
                 architecture. This paper analyzes the original Roofline
                 model and proposes a novel approach to provide a more
                 insightful performance modeling of modern architectures
                 by introducing cache-awareness, thus significantly
                 improving the guidelines for application optimization.
                 The proposed model was experimentally verified for
                 different architectures by taking advantage of built-in
                 hardware counters with a curve fitness above 90\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "Ilic, A (Reprint Author), Univ Tecn Lisboa, INESC ID
                 IST, Lisbon, Portugal. Ilic, Aleksandar; Pratas,
                 Frederico; Sousa, Leonel, Univ Tecn Lisboa, INESC ID
                 IST, Lisbon, Portugal.",
  author-email = "ilic@inesc-id.pt fcpp@inesc-id.pt las@inesc-id.pt",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "national funds through FCT (Fundacao para a
                 Ciencia e a Tecnologia) [PTDC/EEI-ELC/3152/2012,
                 PEst-OE/EEI/LA0021/2011, PTDC/EEA-ELC/117329/2010]; FCT
                 [SFRH/BPD/87734/2012]",
  funding-text = "This work was supported by national funds through FCT
                 (Fundacao para a Ciencia e a Tecnologia), under
                 projects PTDC/EEI-ELC/3152/2012,
                 PEst-OE/EEI/LA0021/2011, and PTDC/EEA-ELC/117329/2010.
                 F. Pratas also acknowledges the FCT scholarship
                 SFRH/BPD/87734/2012.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Application optimization; application optimization;
                 Application optimization; built-in hardware counters;
                 C.0.d Modeling of computer architecture < C.0 General <
                 C Computer Systems Organization; C.0.e System
                 architectures; C.4.d Modeling techniques < C.4
                 Performance of Systems < C Computer Systems
                 Organization; C.4.g Measurement; cache storage;
                 cache-aware Roofline model; cache-awareness; computer
                 architecture; computer architecture upper bound
                 performance; curve fitness; evaluation; integration and
                 modeling < C.0 General < C Computer Systems
                 Organization; Modeling; modeling; Multicore computer
                 architectures; Multiprocessing systems; multiprocessing
                 systems; Performance evaluation; Performance modeling;
                 Simulation; simulation of multiple-processor systems <
                 C.4 Performance of Systems < C Computer Syst",
  number-of-cited-references = "10",
  ORCID-numbers = "Ilic, Aleksandar/0000-0002-8594-3539 Sousa,
                 Leonel/0000-0002-8066-221X",
  research-areas = "Computer Science",
  researcherid-numbers = "Ilic, Aleksandar/L-1943-2014 Sousa,
                 Leonel/B-2749-2009",
  times-cited =  "24",
  unique-id =    "Ilic:2014:CAR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% NOTE(review): the author field lists ``C. Weiser'' with no first
%%% name, and the Web of Science affiliation data below likewise says
%%% ``Weiser, C.''.  The same Technion author appears as ``Uri C.
%%% Weiser'' in entry Kvatinsky:2014:MBM and as ``Uri Weiser'' in entry
%%% Morad:2014:GMO --- confirm the full name against the published
%%% paper before changing the field.
@Article{Efraim:2014:EAR,
  author =       "Rotem Efraim and Ran Ginosar and C. Weiser and Avi
                 Mendelson",
  title =        "Energy Aware Race to Halt: A Down to {EARtH} Approach
                 for Platform Energy Management",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "25--28",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.32",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The EARtH algorithm finds the optimal voltage and
                 frequency operational point of the processor in order
                 to achieve minimum energy of the computing platform.
                 The algorithm is based on a theoretical model employing
                 a small number of parameters, which are extracted from
                 real systems using off-line and run-time methods. The
                 model and algorithm have been validated on real systems
                 using 45nm, 32nm and 22nm Intel (R) Core processors.
                 The algorithm can save up to 44\% energy compared with
                 the commonly used fixed frequency policies.",
  acknowledgement = ack-nhfb,
  affiliation =  "Efraim, R (Reprint Author), Intel Corp, Santa Clara,
                 CA 95051 USA. Efraim, Rotem, Intel Corp, Santa Clara,
                 CA 95051 USA. Ginosar, Ran; Weiser, C.; Mendelson, Avi,
                 Technion Israeli Inst Technol, Haifa, Israel.",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; B Hardware; B.9 Power
                 Management; B.9.2 Energy-aware systems; C Computer
                 Systems Organization; C.4 Performance of Systems; C.5
                 Computer System Implementation; C.5.4 VLSI Systems;
                 C.5.5 Servers; Computational modeling; Earth; EARtH
                 algorithm; energy aware race to halt; Energy
                 management; Energy measurement; fixed frequency
                 policies; Frequency measurement; frequency operational
                 point; Heterogeneous cores; Intel core processors;
                 microprocessor chips; off-line methods; optimal
                 voltage; platform energy management; power aware
                 computing; Power Management; run-time methods; size 22
                 nm; size 32 nm; size 45 nm; Voltage measurement",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Efraim:2014:EAR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% The Turkish names below use brace-wrapped accent macros
%%% ({\c{C}}, {\c{c}}, {\u{g}}) at field top level --- BibTeX
%%% ``special characters'' --- so that classic 8-bit BibTeX sorts and
%%% labels them as single letters; keep this form rather than raw
%%% UTF-8 or unbraced \c{}/\u{} variants.
@Article{Cakmakci:2014:EVA,
  author =       "Yaman {\c{C}}akmak{\c{c}}i and O{\u{g}}uz Ergin",
  title =        "Exploiting Virtual Addressing for Increasing
                 Reliability",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "29--32",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.2",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A novel method to protect a system against errors
                 resulting from soft errors occurring in the virtual
                 address (VA) storing structures such as translation
                 lookaside buffers (TLB), physical register file (PRF)
                 and the program counter (PC) is proposed in this paper.
                 The work is motivated by showing how soft errors impact
                 the structures that store virtual page numbers (VPN). A
                 solution is proposed by employing linear block encoding
                 methods to be used as a virtual addressing scheme at
                 link time. Using the encoding scheme to assign VPNs for
                 VAs, it is shown that the system can tolerate soft
                 errors using software with the help of the discussed
                 decoding techniques applied to the page fault handler.
                 The proposed solution can be used on all of the
                 architectures using virtually indexed addressing. The
                 main contribution of this paper is the decreasing of
                 AVF for data TLB by 42.5\%, instruction TLB by 40.3\%,
                 PC by 69.2\% and PRF by 33.3\%.",
  acknowledgement = ack-nhfb,
  affiliation =  "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), TOBB Univ
                 Econ \& Technol, Dept Comp Engn, Ankara, Turkey.
                 {\c{C}}akmak{\c{c}}i, Yaman; Ergin, O{\u{g}}uz, TOBB
                 Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey.",
  author-email = "ycakmakci@etu.edu.tr oergin@etu.edu.tr",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Scientific and Technological Research
                 Council of Turkey (TUBITAK) [112E004]",
  funding-text = "This work was supported in part by the Scientific and
                 Technological Research Council of Turkey (TUBITAK)
                 under Grant 112E004. The work is in the framework of
                 COST ICT Action 1103 Manufacturable and Dependable
                 Multicore Architectures at Nanoscale.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AVF; B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.h Virtual memory; B.3.4 Reliability,
                 Testing and Fault-Tolerance; buffer storage; decoding
                 techniques; encoding; Fault tolerance; Hardware; linear
                 block encoding methods; Memory management; page fault
                 handler; PC; physical register file; PRF; program
                 counter; soft errors; TLB; translation lookaside
                 buffers; virtual address storing structures; virtual
                 addressing; virtual addressing scheme; Virtual memory;
                 virtual page numbers; virtually indexed addressing;
                 VPN",
  keywords-plus = "SOFT ERRORS",
  number-of-cited-references = "10",
  ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787",
  research-areas = "Computer Science",
  researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010",
  times-cited =  "1",
  unique-id =    "Cakmakci:2014:EVA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Zhu, Srikanth, Leng, and Reddi on frequency-scaling-based
%%% energy-efficient mobile web browsing.  ``{Web}'' is braced in the
%%% title to keep its capital under sentence-casing styles, per this
%%% file's convention.
@Article{Zhu:2014:EWC,
  author =       "Yuhao Zhu and Aditya Srikanth and Jingwen Leng and
                 Vijay Janapa Reddi",
  title =        "Exploiting Webpage Characteristics for
                 Energy-Efficient Mobile {Web} Browsing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "33--36",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.33",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Web browsing on mobile devices is undoubtedly the
                 future. However, with the increasing complexity of
                 webpages, the mobile device's computation capability
                 and energy consumption become major pitfalls for a
                 satisfactory user experience. In this paper, we propose
                 a mechanism to effectively leverage processor frequency
                 scaling in order to balance the performance and energy
                 consumption of mobile web browsing. This mechanism
                 explores the performance and energy tradeoff in webpage
                 loading, and schedules webpage loading according to the
                 webpages' characteristics, using the different
                 frequencies. The proposed solution achieves 20.3\%
                 energy saving compared to the performance mode, and
                 improves webpage loading performance by 37.1\% compared
                 to the battery saving mode.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zhu, YH (Reprint Author), Univ Texas Austin, Dept
                 Elect \& Comp Engn, Austin, TX 78712 USA. Zhu, Yuhao;
                 Srikanth, Aditya; Leng, Jingwen; Reddi, Vijay Janapa,
                 Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX
                 78712 USA.",
  author-email = "yzhu@utexas.edu aditya.srik@utexas.edu
                 jingwen@utexas.edu vj@ece.utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "C Computer Systems Organization; C.2
                 Communication/Networking and Information Technology;
                 C.2.8 Mobile Computing; Cascading style sheets; Cutoff;
                 EDP; Energy; energy conservation; energy consumption;
                 Energy consumption; energy-efficient mobile Web
                 browsing; HTML; Internet; Load modeling; Loading;
                 Market research; Mobile communication; mobile
                 computing; mobile device computation capability;
                 Performance; power aware computing; processor frequency
                 scaling; user experience; Web page characteristics; Web
                 page loading performance; Webpages",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Zhu:2014:EWC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Morad et al. on MultiAmdahl-based optimization of heterogeneous
%%% multi-accelerator SoCs.  The keyword ``multiAmdhal'' (a Web of
%%% Science typo) has been corrected to ``MultiAmdahl'', matching the
%%% title and the adjacent ``MultiAmdahl'' keyword in this entry.
@Article{Morad:2014:GMO,
  author =       "Amir Morad and Tomer Y. Morad and Leonid Yavits and
                 Ran Ginosar and Uri Weiser",
  title =        "Generalized {MultiAmdahl}: Optimization of
                 Heterogeneous Multi-Accelerator {SoC}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "37--40",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.34",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Consider a workload comprising a consecutive sequence
                 of program execution segments, where each segment can
                 either be executed on general purpose processor or
                 offloaded to a hardware accelerator. An analytical
                 optimization framework based on MultiAmdahl framework
                 and Lagrange multipliers, for selecting the optimal set
                 of accelerators and for allocating resources among them
                 under constrained area is proposed. Due to the
                 practical implementation of accelerators, the optimal
                 architecture under area constraints may exclude some of
                 the accelerators. As the fraction of the workload that
                 can be accelerated decreases, resources (e.g. area) may
                 shift from accelerators into the general purpose
                 processor. The framework can be extended in a number of
                 ways, spanning from SoC partitioning, bandwidth to
                 power distribution, energy and other constrained
                 resources.",
  acknowledgement = ack-nhfb,
  affiliation =  "Morad, A (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Morad, Amir; Morad, Tomer Y.; Yavits, Leonid; Ginosar,
                 Ran; Weiser, Uri, Technion Israel Inst Technol, Dept
                 Elect Engn, IL-32000 Haifa, Israel.",
  author-email = "amirm@tx.technion.ac.il tomerm@tx.technion.ac.il
                 yavits@tx.technion.ac.il ran@ee.technion.ac.il
                 uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; analytical optimization framework; Chip
                 Multiprocessors; general purpose processor; generalized
                 MultiAmdahl framework; Hardware; hardware accelerator;
                 heterogeneous multiaccelerator SoC partitioning;
                 Lagrange multiplier; Mathematical model; Modeling of
                 computer architecture; MultiAmdahl; Multicore
                 processing; optimisation; Optimization; power
                 distribution bandwidth; program execution segment;
                 resource allocation; Resource management;
                 System-on-a-chip; system-on-chip",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Morad:2014:GMO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Kvatinsky et al. on memristor-based continuous-flow multithreading.
%%% The keywords field has been cleaned: the garbled fused keyword
%%% ``RRAM, STT-MRAM'' and the broken-spaced duplicate ``STT- MRAM''
%%% were collapsed so RRAM and STT-MRAM each appear once (both terms
%%% were already present individually in the list).
@Article{Kvatinsky:2014:MBM,
  author =       "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion
                 and Eby G. Friedman and Avinoam Kolodny and Uri C.
                 Weiser",
  title =        "Memristor-Based Multithreading",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "41--44",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.3",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "Switch on Event Multithreading (SoE MT, also known as
                 coarse-grained MT and block MT) processors run multiple
                 threads on a pipeline machine, while the pipeline
                 switches threads on stall events (e.g., cache miss).
                 The thread switch penalty is determined by the number
                 of stages in the pipeline that are flushed of in-flight
                 instructions. In this paper, Continuous Flow
                 Multithreading (CFMT), a new architecture of SoE MT, is
                 introduced. In CFMT, a multistate pipeline register
                 (MPR) holds the microarchitectural state of multiple
                 different threads within the execution pipeline stages,
                 where only one thread is active at a time. The MPRs
                 eliminate the need to flush in-flight instructions and
                 therefore significantly improve performance. In recent
                 years, novel memory technologies such as Resistive RAM
                 (RRAM) and Spin Torque Transfer Magnetoresistive RAM
                 (STT-MRAM), have been developed. All of these
                 technologies are nonvolatile, store data as resistance,
                 and can be described as ``memristors''. Memristors are
                 power efficient, dense, and fast as compared to
                 standard memory technologies such as SRAM, DRAM, and
                 Flash. Memristors therefore provide the opportunity to
                 place the MPRs physically within the pipeline stages. A
                 performance analysis of CFMT is compared to
                 conventional SoE MT processors, demonstrating up to a
                 2X performance improvement, while the operational
                 mechanism, due to the use of memristors, is low power
                 and low complexity as compared to conventional SoE MT
                 processors.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kvatinsky, S (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam;
                 Weiser, Uri C., Technion Israel Inst Technol, Dept
                 Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav,
                 Technion Israel Inst Technol, Dept Comp Sci, IL-32000
                 Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept
                 Elect \& Comp Engn, Rochester, NY 14627 USA.",
  author-email = "skva@tx.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Hasso Plattner Institute",
  funding-text = "This work was supported by the Hasso Plattner
                 Institute. The authors thank Ravi Patel for his
                 comments and area overhead estimation and to Nimrod
                 Wald and Guy Satat for their help in evaluating the
                 architecture.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.7 Integrated
                 Circuits; B.7.1 Types and Design Styles; B.7.1.e Memory
                 technologies; C Computer Systems Organization; C.0
                 General; C.0.a Emerging technologies; C.0.d Modeling of
                 computer architecture; CFMT; Computer architecture;
                 continuous flow multithreading; in-flight instructions;
                 Integrated circuits; Memory management; memristor;
                 memristor-based multithreading; memristors; MPR;
                 multi-threading; multistate pipeline register;
                 multithreaded processors; Multithreading; novel memory
                 technologies; phase change memory; random-access
                 storage; resistive RAM; RRAM; SoE MT processors; spin
                 torque transfer magnetoresistive RAM; STT-MRAM; switch
                 on event multithreading processors; Systems design and
                 analysis",
  keywords-plus = "RESISTIVE SWITCHING MEMORIES",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "10",
  unique-id =    "Kvatinsky:2014:MBM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% NOTE(review): the funding fields cite NSF grant ``CNS-09095368'',
%%% which has nine digits where NSF award numbers normally have seven
%%% (compare the adjacent ``CNS-0931693'') --- possibly a digit
%%% duplication in the Web of Science data; verify against the paper's
%%% acknowledgements before correcting.
@Article{Wingbermuehle:2014:OAS,
  author =       "Joseph G. Wingbermuehle and Ron K. Cytron and Roger D.
                 Chamberlain",
  title =        "Optimization of Application-Specific Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "45--48",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.7",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory access times are the primary bottleneck for
                 many applications today. This ``memory wall'' is due to
                 the performance disparity between processor cores and
                 main memory. To address the performance gap, we propose
                 the use of custom memory subsystems tailored to the
                 application rather than attempting to optimize the
                 application for a fixed memory subsystem. Custom
                 subsystems can take advantage of application-specific
                 properties as well as memory-specific properties to
                 improve access times or write-backs given constraints
                 on size or power.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wingbermuehle, JG (Reprint Author), Washington Univ,
                 Dept Comp Sci \& Engn, St Louis, MO 63130 USA.
                 Wingbermuehle, Joseph G.; Cytron, Ron K.; Chamberlain,
                 Roger D., Washington Univ, Dept Comp Sci \& Engn, St
                 Louis, MO 63130 USA.",
  author-email = "wingbej@wustl.edu cytron@wustl.edu roger@wustl.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "National Science Foundation [CNS-09095368,
                 CNS-0931693]",
  funding-text = "This work is supported by the National Science
                 Foundation under grants CNS-09095368 and CNS-0931693.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access time improvement; application-specific memory
                 optimization; B Hardware; B.3 Memory Structures; B.3.2
                 Design Styles; B.3.3 Performance Analysis and Design
                 Aids; B.3.3.b Simulation; C Computer Systems
                 Organization; C.1 Processor Architectures; C.1.5
                 Micro-architecture implementation considerations;
                 C.1.5.e Memory hierarchy; cache; cache storage;
                 Computer architecture; custom memory subsystems; fixed
                 memory subsystem; Hardware; memory access times; Memory
                 management; memory wall; memory-specific properties;
                 Multiprocessing systems; performance disparity;
                 Performance evaluation; performance gap; processor
                 cores; write-backs given constraints",
  number-of-cited-references = "21",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Wingbermuehle:2014:OAS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% NOTE(review): the keywords field repeats terms differing only in
%%% case or hyphenation (e.g. ``Parallel Programming'' / ``parallel
%%% programming'', ``multi-threading'' / ``multithreading'').  Other
%%% entries in this file (e.g. Ilic:2014:CAR) show the same pattern,
%%% so it appears to reflect the merged Web of Science author/index
%%% keyword lists; retained as-is.
@Article{Xu:2014:STM,
  author =       "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao
                 Li and Depei Qian",
  title =        "Software Transactional Memory for {GPU}
                 Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "49--52",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.4",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To make applications with dynamic data sharing among
                 threads benefit from GPU acceleration, we propose a
                 novel software transactional memory system for GPU
                 architectures (GPU-STM). The major challenges include
                 ensuring good scalability with respect to the massively
                 multithreading of GPUs, and preventing livelocks caused
                 by the SIMT execution paradigm of GPUs. To this end, we
                 propose (1) a hierarchical validation technique and (2)
                 an encounter-time lock-sorting mechanism to deal with
                 the two challenges, respectively. Evaluation shows that
                 GPU-STM outperforms coarse-grain locks on GPUs by up to
                 20x.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp
                 Sci, Beijing, Peoples R China. Goswami, Nilanjan; Li,
                 Tao, Univ Florida, ECE Dept, Gainesville, FL USA.",
  author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn
                 nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF of China [61133004, 61128004,
                 61073011]; 863 Program of China [2012AA010902]",
  funding-text = "This work is supported by NSF of China under grant
                 61133004, 61128004 and 61073011, and 863 Program of
                 China under grant 2012AA010902.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "dynamic data sharing; encounter-time lock-sorting
                 mechanism; GPU acceleration; GPU architectures;
                 GPU-STM; graphics processing units; hierarchical
                 validation technique; multi-threading; Multicore
                 processing; multicore processor; Multicore Processors;
                 multiprocessing systems; Multiprocessing systems;
                 multithreading; parallel architectures; Parallel
                 processing; Parallel Programming; parallel programming;
                 Parallel Programming; Run-time Environments; Runtime
                 environment; SIMD processor; SIMD Processors; SIMT
                 execution paradigm; software transactional memory
                 system; sorting",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Xu:2014:STM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Shim:2014:TMP,
  author =       "Keun Sup Shim and Mieszko Lis and Omer Khan and
                 Srinivas Devadas",
  title =        "Thread Migration Prediction for Distributed Shared
                 Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "53--56",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2012.30",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Chip-multiprocessors (CMPs) have become the mainstream
                 parallel architecture in recent years; for scalability
                 reasons, designs with high core counts tend towards
                 tiled CMPs with physically distributed shared caches.
                 This naturally leads to a Non-Uniform Cache Access
                 (NUCA) design, where on-chip access latencies depend on
                 the physical distances between requesting cores and
                 home cores where the data is cached. Improving data
                 locality is thus key to performance, and several
                 studies have addressed this problem using data
                 replication and data migration. In this paper, we
                 consider another mechanism, hardware-level thread
                 migration. This approach, we argue, can better exploit
                 shared data locality for NUCA designs by effectively
                 replacing multiple round-trip remote cache accesses
                 with a smaller number of migrations. High migration
                 costs, however, make it crucial to use thread
                 migrations judiciously; we therefore propose a novel,
                 on-line prediction scheme which decides whether to
                 perform a remote access (as in traditional NUCA
                 designs) or to perform a thread migration at the
                 instruction level. For a set of parallel benchmarks,
                 our thread migration predictor improves the performance
                 by 24\% on average over the shared-NUCA design that
                 only uses remote accesses.",
  acknowledgement = ack-nhfb,
  affiliation =  "Shim, KS (Reprint Author), MIT, 77 Massachusetts Ave,
                 Cambridge, MA 02139 USA. Shim, Keun Sup; Lis, Mieszko;
                 Devadas, Srinivas, MIT, Cambridge, MA 02139 USA. Khan,
                 Omer, Univ Connecticut, Storrs, CT USA.",
  da =           "2019-06-20",
  doc-delivery-number = "AT5MU",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; B.3.2 Design
                 Styles; B.3.2.g Shared memory; Benchmark testing; C
                 Computer Systems Organization; C.1 Processor
                 Architectures; C.1.4 Parallel Architectures; Cache
                 Coherence; cache storage; chip-multiprocessors; CMPs;
                 Coherence; Computer architecture; Context; core counts;
                 Data Locality; data locality improvement; data
                 migration; data replication; Distributed Caches;
                 hardware-level thread migration prediction; home cores;
                 Instruction sets; integrated circuit design; mainstream
                 parallel architecture; microprocessor chips;
                 multiprocessing systems; nonuniform cache access
                 design; on-chip access latencies; online prediction
                 scheme; Parallel Architecture; parallel architectures;
                 physical distributed shared caches; Protocols;
                 Registers; requesting cores; shared-NUCA design",
  number-of-cited-references = "13",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Shim:2014:TMP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2014:TCa,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C1--C4",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360655",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ITPa,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Transactions on Pattern Analysis and
                 Machine Intelligence}} Editorial Board}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C2--C2",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360656",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ITPb,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Transactions on Pattern Analysis and
                  Machine Intelligence}} Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C3--C3",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360657",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "1",
  pages =        "C4--C4",
  month =        jan # "\slash " # jun,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360658",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Lavasani:2014:FBL,
  author =       "Maysam Lavasani and Hari Angepat and Derek Chiou",
  title =        "An {FPGA}-based In-Line Accelerator for {Memcached}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "57--60",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.17",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present a method for accelerating server
                 applications using a hybrid CPU+FPGA architecture and
                 demonstrate its advantages by accelerating Memcached, a
                 distributed key-value system. The accelerator,
                 implemented on the FPGA fabric, processes request
                 packets directly from the network, avoiding the CPU in
                 most cases. The accelerator is created by profiling the
                 application to determine the most commonly executed
                 trace of basic blocks which are then extracted. Traces
                 are executed speculatively within the FPGA. If the
                 control flow exits the trace prematurely, the side
                 effects of the computation are rolled back and the
                 request packet is passed to the CPU. When compared to
                 the best reported software numbers, the Memcached
                 accelerator is 9.15x more energy efficient for common
                 case requests.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lavasani, M (Reprint Author), Univ Texas Austin, Dept
                 Elect \& Comp Engn, Austin, TX 78712 USA. Lavasani,
                 Maysam; Angepat, Hari; Chiou, Derek, Univ Texas Austin,
                 Dept Elect \& Comp Engn, Austin, TX 78712 USA.",
  author-email = "maysamlavasani@utexas.edu hangepat@utexas.edu
                 derek@utexas.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "accelerating server; C.1.3.f Heterogeneous (hybrid)
                 systems; C.2.4.a Client/server; cache storage;
                 Client-server systems; Computer architecture; control
                 flow; distributed key-value system; distributed
                 processing; field programmable gate arrays; Field
                 programmable gate arrays; FPGA-based in-line
                 accelerator; hybrid CPU+FPGA architecture; Hybrid
                 systems; Memcached accelerator; Program processors;
                 reconfigurable architectures; request packet; rolled
                 back; software numbers",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "24",
  unique-id =    "Lavasani:2014:FBL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Song:2014:AFB,
  author =       "Xiang Song and Jian Yang and Haibo Chen",
  title =        "Architecting Flash-based Solid-State Drive for
                 High-performance {I/O} Virtualization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "61--64",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.22",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Flash-based solid-state drive (SSD) is now being
                 widely deployed in cloud computing platforms due to the
                 potential advantages of better performance and less
                 energy consumption. However, current virtualization
                 architecture lacks support for high-performance I/O
                 virtualization over persistent storage, which results
                 in sub-optimal I/O performance for guest virtual
                 machines (VMs) on SSD. Further, current software-based
                 I/O virtualization violates the ``don't hide power''
                 principle due to inefficient support for some advanced
                 SSD commands (e.g., TRIM) and constrained parallelism,
                 leading to sub-optimal performance and life cycle. This
                 paper observes that the massive internal parallelism
                 and the block emulation in the flash translation layer
                 (FTL) make flash-based SSD an ideal candidate to
                 support high-performance I/O virtualization for
                 persistent storage. Based on this observation, we
                 propose VFlash, the first storage I/O virtualization
                 architecture that extends existing SSDs with trivial
                 hardware changes to directly expose multiple virtual
                 SSDs to guest VMs. Performance evaluation using a
                 modified FlashSim with two FTL schemes (i.e., DFTL and
                 FAST) shows that VFlash incurs only small performance
                 overhead over native SSDs and can efficiently exploit
                 parallelism.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, HB (Reprint Author), Shanghai Jiao Tong Univ,
                 Sch Software, Inst Parallel \& Distributed Syst,
                 Shanghai 200030, Peoples R China. Song, Xiang; Yang,
                 Jian; Chen, Haibo, Shanghai Jiao Tong Univ, Sch
                 Software, Inst Parallel \& Distributed Syst, Shanghai
                 200030, Peoples R China.",
  author-email = "haibochen@sjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "China National Natural Science Foundation
                 [61003002]; Intel",
  funding-text = "This work was supported by China National Natural
                 Science Foundation under grant numbered 61003002 and a
                 grant from Intel.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B.4.4 Performance Analysis and Design Aids; C.4.g
                 Measurement; cloud computing; Cloud computing; cloud
                 computing platforms; Computer architecture; energy
                 consumption; evaluation; flash memories; flash-based
                 solid-state drive; high performance I/O virtualization
                 architecture; I/O virtualization; modeling;
                 Multiprocessing systems; Parallel processing;
                 Performance evaluation; performance evaluation; Random
                 access memory; simulation of multiple-processor
                 systems; software-based I/O virtualization; Solid state
                 circuits; Solid State Drive; SSD commands; virtual
                 machines; virtualisation; VM",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Song:2014:AFB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wu:2014:ATE,
  author =       "Carole-Jean Wu",
  title =        "Architectural Thermal Energy Harvesting Opportunities
                 for Sustainable Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "65--68",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.16",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Increased power dissipation in computing devices has
                 led to a sharp rise in thermal hotspots, creating
                 thermal runaway. To reduce the additional power
                 requirement caused by increased temperature, current
                 approaches apply cooling mechanisms to remove heat or
                 apply management techniques to avoid thermal
                 emergencies by slowing down heat generation. This paper
                 proposes to tackle the heat management problem of
                  computing platforms with a fundamentally new approach ---
                 instead of heat removal using cooling mechanisms and
                 heat avoidance using dynamic thermal/power management
                 techniques, this work investigates the mechanisms to
                 recover wasted heat into reusable energy for
                 sustainable computing. Through recent advancements in
                 thermoelectric materials, we allow wasted heat energy
                 generated by computing devices to be recovered,
                 transformed, and harvested as electricity that can be
                 directly used within the system. We demonstrate a
                 real-system setup where we recover 0.3 to 1 watt of
                 power with the CPU running at 70 to 105 degrees C,
                 using a COTS thermoelectric device on top of the CPU.
                 Through this research, we hope to motivate more
                 in-depth efforts to explore heat energy harvesting
                 opportunities on computing devices and inspire
                 plausible solutions to overcome the technical
                 challenges discussed in this paper.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wu, CJ (Reprint Author), Arizona State Univ, Sch Comp,
                 Dept Comp Sci Engn, Tempe, AZ 85281 USA. Arizona State
                 Univ, Sch Comp, Dept Comp Sci Engn, Tempe, AZ 85281
                 USA.",
  author-email = "carole-jean.wu@asu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural thermal energy harvesting; cooling;
                 Cooling; cooling mechanisms; dynamic thermal-power
                 management technique; Energy conservation; energy
                 harvesting; Energy-aware systems; heat generation; heat
                 management problem; power dissipation; Power
                 distribution; power engineering computing; Resistance
                 heating; sustainable computing; Temperature
                 measurement; Temperature-aware design; thermal energy
                 storage; thermal runaway; Waste heat",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Wu:2014:ATE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yavits:2014:CHO,
  author =       "Leonid Yavits and Amir Morad and Ran Ginosar",
  title =        "Cache Hierarchy Optimization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "69--72",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.18",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power consumption, off-chip memory bandwidth, chip
                 area and Network on Chip (NoC) capacity are among main
                 chip resources limiting the scalability of Chip
                 Multiprocessors (CMP). A closed form analytical
                 solution for optimizing the CMP cache hierarchy and
                 optimally allocating area among hierarchy levels under
                 such constrained resources is developed. The
                 optimization framework is extended by incorporating the
                 impact of data sharing on cache miss rate. An
                 analytical model for cache access time as a function of
                 cache size is proposed and verified using CACTI
                 simulation.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yavits, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Yavits, Leonid; Morad, Amir; Ginosar, Ran, Technion
                 Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
                 Israel.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "ICRI-CI; Hasso-Plattner-Institut",
  funding-text = "We thank Prof. Uri Weiser and Yaniv Ben Itzhak for
                 their review and remarks. This research was partially
                 funded by the ICRI-CI and Hasso-Plattner-Institut.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Analytical models; Analytical Performance Models;
                 Bandwidth; Cache Hierarchy; cache hierarchy
                 optimization; cache storage; CACTI simulation; chip
                 area; Chip Multiprocessor; chip multiprocessors; CMP;
                 Computational modeling; data sharing; Integrated
                 circuit modeling; Multiprocessing systems; network on
                 chip; network-on-chip; NoC; off-chip memory bandwidth;
                 optimisation; Optimization; power consumption; Resource
                 Allocation Optimization; Resource Allocation
                 Optimizations; Resource management",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Yavits:2014:CHO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yazdanshenas:2014:CLL,
  author =       "Sadegh Yazdanshenas and Marzieh Ranjbar Pirbasti and
                 Mahdi Fazeli and Ahmad Patooghy",
  title =        "Coding Last Level {STT-RAM} Cache For High Endurance
                 And Low Power",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "73--76",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.8",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "STT-RAM technology has recently emerged as one of the
                 most promising memory technologies. However, its major
                 problems, limited write endurance and high write
                 energy, are still preventing it from being used as a
                 drop-in replacement of SRAM cache. In this paper, we
                 propose a novel coding scheme for STT-RAM last level
                 cache based on the concept of value locality. We reduce
                 switching probability in cache by swapping common
                 patterns with limited weight codes (LWC) to make writes
                 less often as well as more uniform. We also define some
                 policies for swapping these patterns. Our evaluation
                 shows that bit write variance in memory cells can be
                 reduced by about 20\% on average resulting in a more
                 uniform wear-out directly enhancing lifetime and
                 improving cell reliability. In addition, writes in
                 cache lines can be reduced by about 12\% compared to
                 one of the most effective circuit level techniques
                 known as early write termination (EWT) [12]. Our method
                 increases memory hierarchy access time by about 0.08\%
                 on average, which is negligible. We have shown that our
                 method doesn't adversely affect last level cache
                  energy-delay$^2$. The non-uniformity caused by the
                 coding scheme can be used for another coding scheme at
                 main memory or L1 cache depending on their
                 technologies.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yazdanshenas, S (Reprint Author), Iran Univ Sci \&
                 Technol, Sch Comp Engn, Tehran, Iran. Yazdanshenas,
                 Sadegh; Pirbasti, Marzieh Ranjbar; Fazeli, Mahdi;
                 Patooghy, Ahmad, Iran Univ Sci \& Technol, Sch Comp
                 Engn, Tehran, Iran.",
  author-email = "sadegh\_yazdanshenas@comp.iust.ac.ir
                 m\_ranjbar@comp.iust.ac.ir m\_fazeli@iust.ac.ir
                 patooghy@iust.ac.ir",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B Hardware; B.3 Memory Structures; bit write variance;
                 C Computer Systems Organization; C.1 Processor
                 Architectures; cache; cache storage; cell reliability;
                 circuit level technique; coding scheme; Computer
                 architecture; early write termination; Encoding;
                 limited weight code; limited weight codes; memory
                 endurance; memory technology; nonvolatile memory;
                 Nonvolatile memory; probability; Random access memory;
                 random-access storage; STT-RAM; STT-RAM cache;
                 switching probability; Three-dimensional displays;
                 write energy; write hotspot",
  keywords-plus = "MEMORY; CIRCUIT; ENERGY; MRAM",
  number-of-cited-references = "13",
  ORCID-numbers = "Fazeli, Mahdi/0000-0002-2874-6256 Patooghy,
                 Ahmad/0000-0003-2647-2797",
  research-areas = "Computer Science",
  researcherid-numbers = "Fazeli/S-9574-2018",
  times-cited =  "14",
  unique-id =    "Yazdanshenas:2014:CLL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Martinsen:2014:HTL,
  author =       "Jan Kasper Martinsen and Hakan Grahn and Anders
                 Isberg",
  title =        "Heuristics for Thread-Level Speculation in {Web}
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "77--80",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.26",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/java2010.bib",
  abstract =     "JavaScript is a sequential programming language, and
                 Thread-Level Speculation has been proposed to
                 dynamically extract parallelism in order to take
                 advantage of parallel hardware. In previous work, we
                 have showed significant speed-ups with a simple on/off
                 speculation heuristic. In this paper, we propose and
                 evaluate three heuristics for dynamically adapt the
                 speculation: a 2-bit heuristic, an exponential
                 heuristic, and a combination of these two. Our results
                 show that the combined heuristic is able to both
                 increase the number of successful speculations and
                 decrease the execution time for 15 popular web
                 applications.",
  acknowledgement = ack-nhfb,
  affiliation =  "Martinsen, JK (Reprint Author), Blekinge Inst Technol,
                 Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan
                 Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp,
                 SE-37179 Karlskrona, Sweden. Isberg, Anders, Sony
                 Mobile Commun AB, SE-22188 Lund, Sweden.",
  author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se
                 Anders.Isberg@sonymobile.com",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Industrial Excellence Center EASE -
                 Embedded Applications Software Engineering; BESQ+
                 research project --- Knowledge Foundation in Sweden
                 [20100311]",
  funding-text = "This work was partly funded by the Industrial
                 Excellence Center EASE --- Embedded Applications
                 Software Engineering, (http://ease.cs.lth.se), and the
                 BESQ+ research project funded by the Knowledge
                 Foundation (grant number 20100311) in Sweden.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "2-bit heuristic; Automatic Parallelization; Benchmark
                 testing; C.1.4 Parallel Architectures; C.1.4.f
                 Speculative multi-threading; exponential heuristic;
                 Instruction sets; Internet; Java; JavaScript; Multicore
                 processors; Multithreading; Parallel Computing;
                 parallel hardware; Parallel processing; parallel
                 programming; sequential programming language; Social
                 network services; thread-level speculation; Web
                 applications",
  number-of-cited-references = "12",
  oa =           "Green Published",
  ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn,
                 Hakan/0000-0001-9947-1088",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Martinsen:2014:HTL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Nandakumar:2014:OKS,
  author =       "Vivek S. Nandakumar and Ma{\l}gorzata Marek-Sadowska",
  title =        "On Optimal Kernel Size for Integrated {CPU--GPUs} ---
                 a Case Study",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "81--84",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.27",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Integrated CPU-GPU architectures with a fully
                 addressable shared memory completely eliminate any
                 CPU-GPU data transfer overhead. Since such
                 architectures are relatively new, it is unclear what
                 level of interaction between the CPU and GPU attains
                 the best energy efficiency. Too coarse grained or
                 larger kernels with fairly low CPU--GPU interaction
                 could cause poor utilization of the shared resources
                 while too fine grained kernels could cause frequent
                 interrupts of GPU computation and performance
                 degradation. Also larger kernels require larger shared
                 resources causing increase in area and parasitics which
                 affect the latency sensitive CPU cores. In this paper,
                 we show the effect of granularity on the overall
                 system's energy efficiency using a synthetic workload.
                 We describe how our framework models a truly unified
                 shared memory in integrated architectures with frequent
                 CPU--GPU communication.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nandakumar, VS (Reprint Author), Univ Calif Santa
                 Barbara, Dept Elect \& Comp Engn, Santa Barbara, CA
                 93106 USA. Nandakumar, Vivek S.; Marek-Sadowska,
                 Malgorzata, Univ Calif Santa Barbara, Dept Elect \&
                 Comp Engn, Santa Barbara, CA 93106 USA.",
  author-email = "vivek@ece.ucsb.edu mms@ece.ucsb.edu",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "SRC grant [2236]",
  funding-text = "This work was supported by SRC grant \#2236.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B.3.2.g Shared memory; B.4.4.b Simulation; B.9.2
                 Energy-aware systems; C.1.3.f Heterogeneous (hybrid)
                 systems; C.4.g Measurement; Central Processing Unit;
                 Computational modeling; CPU-GPU communication; CPU-GPU
                 data transfer overhead; CPU-GPU interaction; D.4.4
                 Communications Management; energy efficiency; Energy
                 efficiency; evaluation; fine grained kernels; fully
                 addressable shared memory; GPU computation; graphics
                 processing units; Graphics processing units; integrated
                 CPU-GPU architectures; latency sensitive CPU cores;
                 Memory management; modeling; optimal kernel size;
                 overall system energy efficiency; performance
                 degradation; performance evaluation; power aware
                 computing; shared memory systems; simulation of
                 multiple-processor systems",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Nandakumar:2014:OKS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Liu:2014:PTE,
  author =       "Qixiao Liu and Victor Jimenez and Miquel Moreto and
                 Jaume Abella and Francisco J. Cazorla and Mateo
                 Valero",
  title =        "Per-task Energy Accounting in Computing Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "85--88",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.24",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We present for the first time the concept of per-task
                 energy accounting (PTEA) and relate it to per-task
                 energy metering (PTEM). We show the benefits of
                 supporting both in future computing systems. Using the
                 shared last-level cache (LLC) as an example: (1) We
                 illustrate the complexities in providing PTEM and PTEA;
                 (2) we present an idealized PTEM model and an accurate
                 and low-cost implementation of it; and (3) we introduce
                 a hardware mechanism to provide accurate PTEA in the
                 cache.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, QX (Reprint Author), Univ Politecn Cataluna,
                 E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
                 Moreto, Miquel; Valero, Mateo, Univ Politecn Cataluna,
                 E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
                 Moreto, Miquel; Abella, Jaume; Cazorla, Francisco J.;
                 Valero, Mateo, Barcelona Supercomp Ctr, Barcelona,
                 Spain. Cazorla, Francisco J., Spanish Natl Res Council
                 IIIA CSIC, Barcelona, Spain.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Spanish Ministry of Science and Innovation
                 [TIN2012-34557]; HiPEAC Network of Excellence; Chinese
                 Scholarship Council [2010608015]",
  funding-text = "This work has been partially supported by the Spanish
                 Ministry of Science and Innovation under grant
                 TIN2012-34557 and the HiPEAC Network of Excellence.
                 Qixiao Liu has also been funded by the Chinese
                 Scholarship Council under grant 2010608015.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; cache storage; Computational
                 modeling; computing systems; Energy consumption; Energy
                 management; Monitoring; Multicore processing; per-task
                 energy accounting; per-task energy metering; power
                 aware computing; PTEA; PTEM model; Radiation detectors;
                 shared last-level cache",
  number-of-cited-references = "20",
  oa =           "Green Published",
  ORCID-numbers = "Cazorla, Francisco/0000-0002-3344-376X Moreto Planas,
                 Miquel/0000-0002-9848-8758 Valero,
                 Mateo/0000-0003-2917-2482 Abella,
                 Jaume/0000-0001-7951-4028 Liu,
                 Qixiao/0000-0002-8196-7584",
  research-areas = "Computer Science",
  researcherid-numbers = "Cazorla, Francisco/D-7261-2016 Moreto Planas,
                 Miquel/C-1823-2016 Valero, Mateo/L-5709-2014 Abella,
                 Jaume/B-7422-2016",
  times-cited =  "2",
  unique-id =    "Liu:2014:PTE",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Mahmoodi:2014:RCC,
  author =       "Hamid Mahmoodi and Sridevi Srinivasan Lakshmipuram and
                 Manish Arora and Yashar Asgarieh and Houman Homayoun
                 and Bill Lin and Dean M. Tullsen",
  title =        "Resistive Computation: A Critique",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "89--92",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.23",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Resistive Computation was suggested by [6] as an idea
                 for tackling the power wall by replacing conventional
                 CMOS logic with Magnetic Tunnel Junction (MTJ) based
                 Look-Up Tables (LUTs). Spin Transfer Torque RAM
                 (STTRAM) is an emerging CMOS-compatible non-volatile
                 memory technology based on Magnetic Tunnel Junctions as
                 a memory bit [3]. The principal advantage of STTRAM is
                 that it is leakage-resistant, which is an important
                 characteristic beyond the 45nm technology node, where
                 leakage concerns are becoming a limiting factor in
                 microprocessor performance. Although STTRAM is a good
                 candidate for replacing SRAM for on-chip memory, we
                 argue in this article MTJ-based LUTs are unnecessarily
                 expensive in terms of area, power, and performance when
                 implementing fixed combinational logic that does not
                 require the reprogramming ability provided by MTJs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mahmoodi, H (Reprint Author), San Francisco State
                 Univ, San Francisco, CA 94132 USA. Arora, Manish;
                 Asgarieh, Yashar; Lin, Bill; Tullsen, Dean M., Univ
                 Calif San Diego, La Jolla, CA 92093 USA. Mahmoodi,
                 Hamid; Lakshmipuram, Sridevi Srinivasan, San Francisco
                 State Univ, San Francisco, CA 94132 USA. Homayoun,
                 Houman, George Mason Univ, Fairfax, VA 22030 USA.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "B.2.1 Design Styles; B.6.1.e Memory used as logic;
                 B.7.1.a Advanced technologies; B.9.1 Low-power design;
                 C.0.a Emerging technologies; CMOS integrated circuits;
                 CMOS-compatible nonvolatile memory technology; Delays;
                 dynamic current-mode logic; fixed combinational logic;
                 leakage power; leakage-resistance; Logic gates; look-up
                 tables; Low power electronics; magnetic tunnel
                 junction; Magnetic tunneling; magnetic tunnelling;
                 magnetic-tunnel junctions; memory bit; MRAM; MTJ-based
                 LUT; Power distribution; random-access storage;
                 Resistive computation; resistive computation; Resistive
                 computation; spin transfer torque RAM; STTRAM; Table
                 lookup; table lookup; Transistors",
  keywords-plus = "TECHNOLOGY; CIRCUIT",
  number-of-cited-references = "10",
  ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
  research-areas = "Computer Science",
  researcherid-numbers = "Lin, Binshan/A-9772-2009",
  times-cited =  "4",
  unique-id =    "Mahmoodi:2014:RCC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Eyerman:2014:RCW,
  author =       "Stijn Eyerman and Lieven Eeckhout",
  title =        "Restating the Case for Weighted-{IPC} Metrics to
                 Evaluate Multiprogram Workload Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "93--96",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.9",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Weighted speedup is nowadays the most commonly used
                 multiprogram workload performance metric. Weighted
                 speedup is a weighted-IPC metric, i.e., the
                 multiprogram IPC of each program is first weighted with
                 its isolated IPC. Recently, Michaud questions the
                 validity of weighted-IPC metrics by arguing that they
                 are inconsistent and that weighted speedup favors
                 unfairness [4]. Instead, he advocates using the
                 arithmetic or harmonic mean of the raw IPC values of
                 the programs in the multiprogram workload. We show that
                 weighted-IPC metrics are not inconsistent, and that
                 weighted speedup is fair in giving equal importance to
                 each program. We argue that, in contrast to raw-IPC
                 metrics, weighted-IPC metrics have a system-level
                 meaning, and that raw-IPC metrics are affected by the
                 inherent behavior of the programs. We also show that
                 the choice of a metric may adversely affect the
                 conclusions from an experiment. We suggest to use two
                 weighted-IPC metrics --- system throughput (STP) and
                 average normalized turnaround time (ANTT) --- for
                 evaluating multiprogram workload performance, and to
                 avoid raw-IPC metrics.",
  acknowledgement = ack-nhfb,
  affiliation =  "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent,
                 Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent,
                 B-9000 Ghent, Belgium.",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Research Foundation --- Flanders (FWO);
                 European Research Council under the European Community
                 [259295]",
  funding-text = "Stijn Eyerman is supported through a postdoctoral
                 fellowship by the Research Foundation --- Flanders
                 (FWO). Additional support is provided by the European
                 Research Council under the European Community's Seventh
                 Framework Programme (FP7/2007-2013) / ERC Grant
                 agreement no. 259295.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ANTT; average normalized turnaround time; Benchmark
                 testing; C Computer Systems Organization; C.1 Processor
                 Architectures; C.1.3 Other Architecture Styles; C.1.3.h
                 Multithreaded processors; C.1.4 Parallel Architectures;
                 C.1.4.e Multi-core/single-chip multiprocessors; C.4
                 Performance of Systems; C.4.c Measurement techniques;
                 Degradation; Harmonic analysis; harmonic mean;
                 Multicore processing; multiprocessing systems;
                 multiprogram IPC; multiprogram workload performance
                 metric; multiprogramming; raw-IPC metrics; STP; system
                 throughput; system-level meaning; Throughput; Weight
                 measurement; weighted speedup; weighted-IPC metric",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "Eyerman:2014:RCW",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Wolff:2014:RUR,
  author =       "Sonya R. Wolff and Ronald D. Barnes",
  title =        "Revisiting Using the Results of Pre-Executed
                 Instructions in Runahead Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "97--100",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.21",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Long-latency cache accesses cause significant
                 performance-impacting delays for both in-order and
                 out-of-order processor systems. To address these
                 delays, runahead pre-execution has been shown to
                 produce speedups by warming-up cache structures during
                 stalls caused by long-latency memory accesses. While
                 improving cache related performance, basic runahead
                 approaches do not otherwise utilize results from
                 accurately pre-executed instructions during normal
                 operation. This simple model of execution is
                 potentially inefficient and performance constraining.
                 However, a previous study showed that exploiting the
                 results of accurately pre-executed runahead
                 instructions for out-of-order processors provide little
                 performance improvement over simple re-execution. This
                 work will show that, unlike out-of-order runahead
                 architectures, the performance improvement from
                 runahead result use for an in-order pipeline is more
                 significant, on average, and in some situations
                 provides dramatic performance improvements. For a set
                 of SPEC CPU2006 benchmarks which experience performance
                 improvement from basic runahead, the addition of result
                 use to the pipeline provided an additional speedup of
                 1.14X (high --- 1.48X) for an in-order processor model
                 compared to only 1.05X (high --- 1.16X) for an
                 out-of-order one. When considering benchmarks with poor
                 data cache locality, the average speedup increased to
                 1.21X for in-order compared to only 1.10X for
                 out-of-order.",
  acknowledgement = ack-nhfb,
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; C.1.5.c Superscalar
                 dynamically-scheduled and statically-scheduled
                 implementation; C.1.5.e Memory hierarchy; cache
                 storage; data cache locality; Hidden Markov models;
                 in-order processor systems; long-latency cache
                 accesses; long-latency memory accesses; Memory Wall;
                 multiprocessing systems; Out of order; out-of-order
                 processor systems; out-of-order runahead architectures;
                 Pipeline processing; Pre-Execution; preexecuted
                 runahead instructions; Registers; Runahead; runahead
                 processors; SPEC CPU2006 benchmarks",
  keywords-plus = "PIPELINES",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Wolff:2014:RUR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2014:SGA,
  author =       "Youngsok Kim and Jaewon Lee and Donggyu Kim and
                 Jangwoo Kim",
  title =        "{ScaleGPU}: {GPU} Architecture for Memory-Unaware
                 {GPU} Programming",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "101--104",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.19",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Programmer-managed GPU memory is a major challenge in
                 writing GPU applications. Programmers must rewrite and
                 optimize an existing code for a different GPU memory
                 size for both portability and performance.
                 Alternatively, they can achieve only portability by
                 disabling GPU memory at the cost of significant
                 performance degradation. In this paper, we propose
                 ScaleGPU, a novel GPU architecture to enable
                 high-performance memory-unaware GPU programming.
                 ScaleGPU uses GPU memory as a cache of CPU memory to
                 provide programmers a view of CPU memory-sized
                 programming space. ScaleGPU also achieves high
                 performance by minimizing the amount of CPU-GPU data
                 transfers and by utilizing the GPU memory's high
                 bandwidth. Our experiments show that ScaleGPU can run a
                 GPU application on any GPU memory size and also
                 improves performance significantly. For example,
                 ScaleGPU improves the performance of the hotspot
                 application by similar to 48\% using the same size of
                 GPU memory and reduces its memory size requirement by
                 similar to 75\% maintaining the target performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, Y (Reprint Author), POSTECH, Dept Comp Sci \&
                 Engn, Pohang, South Korea. Kim, Youngsok; Lee, Jaewon;
                 Kim, Donggyu; Kim, Jangwoo, POSTECH, Dept Comp Sci \&
                 Engn, Pohang, South Korea.",
  author-email = "elixir@postech.ac.kr spiegel0@postech.ac.kr
                 vteori@postech.ac.kr jangwoo@postech.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                 National Research Foundation of Korea (NRF) ---
                 Ministry of Education, Science and Technology
                 [2011-0014817]; NRF Grant --- Korean Government
                 (NRF-Global Ph.D. Fellowship Program)",
  funding-text = "This research was supported by Basic Science Research
                 Program through the National Research Foundation of
                 Korea (NRF) funded by the Ministry of Education,
                 Science and Technology (2011-0014817) and NRF Grant
                 funded by the Korean Government (NRF-2012-Global Ph.D.
                 Fellowship Program).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "C.1.2.j SIMD processors; C.1.4.e
                 Multi-core/single-chip multiprocessors; C.1.5.e Memory
                 hierarchy; cache; cache storage; code rewrite; CPU
                 memory-sized programming space; CPU-GPU data transfers;
                 Data transfer; GPU applications; GPU architecture; GPU
                 memory high bandwidth; GPU memory size; graphics
                 processing units; Graphics processing units; graphics
                 processing units; high-performance memory-unaware GPU
                 programming; I.3.1.a Graphics processors; Instruction
                 sets; memory architecture; Memory management; memory
                 size requirement; programmer-managed GPU memory;
                 Programming; Random access memory; ScaleGPU",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Kim:2014:SGA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Sankar:2014:SFL,
  author =       "Sriram Sankar and Sudhanva Gurumurthi",
  title =        "Soft Failures in Large Datacenters",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "105--108",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.25",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "A major problem in managing large-scale datacenters is
                 diagnosing and fixing machine failures. Most large
                 datacenter deployments have a management infrastructure
                 that can help diagnose failure causes, and manage
                 assets that were fixed as part of the repair process.
                 Previous studies identify only actual hardware
                 replacements to calculate Annualized Failure Rate (AFR)
                 and component reliability. In this paper, we show that
                 service availability is significantly affected by soft
                 failures and that this class of failures is becoming an
                 important issue at large datacenters with minimum human
                 intervention. Soft failures in the datacenter do not
                 require actual hardware replacements, but still result
                 in service downtime, and are equally important because
                 they disrupt normal service operation. We show failure
                 trends observed in a large datacenter deployment of
                 commodity servers and motivate the need to modify
                 conventional datacenter designs to help reduce soft
                 failures and increase service availability.",
  acknowledgement = ack-nhfb,
  affiliation =  "Sankar, S (Reprint Author), Microsoft Corp, Redmond,
                 WA 98052 USA. Sankar, Sriram, Microsoft Corp, Redmond,
                 WA 98052 USA. Sankar, Sriram; Gurumurthi, Sudhanva,
                 Univ Virginia, Charlottesville, VA 22903 USA.
                 Gurumurthi, Sudhanva, Adv Micro Devices Inc, AMD Res,
                 Sunnyvale, CA 94088 USA.",
  author-email = "sriram.sankar@microsoft.com
                 Sudhanva.Gurumurthi@amd.com",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AFR; annualized failure rate; asset management; C.4
                 Performance of Systems; C.5.5 Servers;
                 Characterization; Client-server systems; commodity
                 servers; component reliability; computer centres; Data
                 centers; Datacenter; datacenter deployments; datacenter
                 designs; datacenter management; failure cause
                 diagnosis; fault diagnosis; Hard disks; hardware
                 replacements; Large-scale systems; machine failure
                 diagnosis; machine failure fixing; Maintenance
                 engineering; Management; management infrastructure;
                 Market research; Reliability; repair process; service
                 availability; soft failures; Transient analysis",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Sankar:2014:SFL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2014:VPT,
  author =       "Daehoon Kim and Hwanju Kim and Jaehyuk Huh",
  title =        "{vCache}: Providing a Transparent View of the {LLC} in
                 Virtualized Environments",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "109--112",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/L-CA.2013.20",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
  abstract =     "Since most of the current multi-core processors use a
                 large last-level cache (LLC), efficient use of an LLC
                 is critical for the overall performance of multi-cores.
                 To improve the caching efficiency, page coloring is a
                 representative software-based approach to allow the OS
                 to control placement of pages on an LLC to improve
                 their cache utility and to avoid conflicts among cores.
                 However, system virtualization, with additional address
                 translation by the hypervisor, can make page coloring
                 techniques used by the guest OS ineffective, as guest
                 physical addresses used by the guest OS for coloring
                 differ from real addresses used for cache indexing in
                 the LLCs. In this paper, we propose a novel LLC
                 architecture to provide the guest OS with a flexible
                 control over LLC placement in virtualized systems. The
                 proposed vCache architecture can preserve coloring
                 information set by the guest OS. In addition to color
                 preservation, vCache can potentially eliminate the
                 traditional limitation of page coloring, the cost of
                 dynamic color changes for memory pages. By using the
                 pollute buffer mechanism, one of the color-based cache
                 optimization techniques, vCache shows performance
                 improvement of benchmark applications up to 33\%
                 without degrading the performance of another co-running
                 application in the VM.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, D (Reprint Author), Korea Adv Inst Sci \&
                 Technol, Dept Comp Sci, Taejon, South Korea. Kim,
                 Daehoon; Kim, Hwanju; Huh, Jaehyuk, Korea Adv Inst Sci
                 \& Technol, Dept Comp Sci, Taejon, South Korea.",
  author-email = "daehoon@calab.kaist.ac.kr hjukim@calab.kaist.ac.kr
                 jhuh@calab.kaist.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "AX5PM",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "SW Computing R\&D Program of
                 KEIT(UX-oriented Mobile SW Platform) --- Ministry of
                 Trade, Industry, and Energy [2011-10041313]",
  funding-text = "This research was supported by the SW Computing R\&D
                 Program of KEIT(2011-10041313, UX-oriented Mobile SW
                 Platform) funded by the Ministry of Trade, Industry,
                 and Energy.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "address translation; B.3.2.b Cache memories; benchmark
                 applications; buffer mechanism; C.1.4.e
                 Multi-core/single-chip multiprocessors; C.1.5.e Memory
                 hierarchy; cache indexing; Cache partitioning; cache
                 storage; Cache storage; cache utility improvement;
                 caching efficiency improvement; co-running application;
                 color-based cache optimization techniques; coloring
                 information preservation; core conflict avoidance;
                 dynamic color cost; guest OS; guest physical address;
                 hypervisor; last-level cache; LLC architecture; LLC
                 placement; Memory management; memory pages; Multicore
                 processing; multicore processor performance;
                 multiprocessing systems; operating systems (computers);
                 Page coloring; page coloring; page placement control;
                 paged storage; software-based approach; system
                 virtualization; transparent LLC view; vCache
                 architecture; Virtual machine monitors; virtual
                 machines; virtualisation; Virtualization; virtualized
                 environments; VM",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  researcherid-numbers = "Huh, Jaehyuk/C-1716-2011",
  times-cited =  "2",
  unique-id =    "Kim:2014:VPT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2014:TCb,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C1--C1",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368891",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ICAa,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Editorial Board}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C2--C2",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368892",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ICAb,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C3--C3",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368893",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Anonymous:2014:ICSb,
  author =       "Anonymous",
  title =        "{IEEE Computer Society} [advertisement]",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "13",
  number =       "2",
  pages =        "C4--C4",
  month =        jul # "\slash " # dec,
  year =         "2014",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368894",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@Article{Liao:2015:AWL,
  author =       "Jianwei Liao and Fengxiang Zhang and Li Li and
                 Guoqiang Xiao",
  title =        "Adaptive Wear-Leveling in Flash-Based Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "1--4",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329871",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The paper presents an adaptive wear-leveling scheme
                 based on several wear-thresholds in different periods.
                 The basic idea behind this scheme is that blocks can
                 have different wear-out speeds and the wear-leveling
                 mechanism does not conduct data migration until the
                 erasure counts of some hot blocks hit a threshold.
                 Through a series of emulation experiments based on
                 several realistic disk traces, we show that the
                 proposed wear-leveling mechanism can reduce total
                 erasure counts and yield uniform erasure counts among
                 all blocks at the late lifetime of the storage devices.
                 As a result, not only can the performance of storage
                 systems be advanced, the lifespan of the flash-based
                 memory can also be extended to a certain degree.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liao, JW (Reprint Author), Southwest Univ, Coll Comp
                 \& Informat Sci, Chongqing, Peoples R China. Liao,
                 Jianwei; Zhang, Fengxiang; Li, Li; Xiao, Guoqiang,
                 Southwest Univ, Coll Comp \& Informat Sci, Chongqing,
                 Peoples R China.",
  author-email = "liaojianwei@il.is.s.u-tokyo.ac.jp zhangfx@swu.edu.cn
                 lily@swu.edu.cn gqxiao@swu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adaptive systems; adaptive wear-leveling; Ash;
                 Benchmark testing; data migration; delayed migration;
                 disk traces; emulation experiments; Equations; erasure
                 evenness; extending lifetime; flash memories;
                 flash-based memory; Flash-based storage devices; Market
                 research; Servers; Standards; total erasure count
                 reduction; wear; wear-leveling; wear-leveling
                 mechanism; wear-out speeds; wear-thresholds",
  number-of-cited-references = "11",
  ORCID-numbers = "Liao, Jianwei/0000-0001-6149-6650",
  research-areas = "Computer Science",
  researcherid-numbers = "Liao, Jianwei/C-5339-2016",
  times-cited =  "4",
  unique-id =    "Liao:2015:AWL",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Anonymous:2015:IIC,
  author =       "Anonymous",
  title =        "2014 Index {{\booktitle{IEEE Computer Architecture
                 Letters}}} Vol. 13",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "1--5",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2387774",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

@Article{Chen:2015:HSC,
  author =       "Jie Chen and Guru Venkataramani",
  title =        "A Hardware-Software Cooperative Approach for
                 Application Energy Profiling",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "5--8",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2323711",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Energy consumption by software applications is a
                 critical issue that determines the future of multicore
                 software development. In this article, we propose a
                 hardware-software cooperative approach that uses
                 hardware support to efficiently gather the
                 energy-related hardware counters during program
                 execution, and utilizes parameter estimation models in
                 software to compute the energy consumption by
                 instructions at a finer grain level (say basic block).
                 We design mechanisms to minimize collinearity in
                 profiler data, and present results to validate our
                 energy estimation methodology.",
  acknowledgement = ack-nhfb,
  affiliation =  "Chen, J (Reprint Author), George Washington Univ, Dept
                 Elect \& Comp Engn, Washington, DC 20052 USA. Chen,
                 Jie; Venkataramani, Guru, George Washington Univ, Dept
                 Elect \& Comp Engn, Washington, DC 20052 USA.",
  author-email = "jiec@gwu.edu guruv@gwu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "application energy profiling; Benchmark testing;
                 Energy consumption; energy consumption; energy
                 debugging; energy estimation; energy estimation
                 methodology; Energy profiling; energy-related hardware
                 counters; Estimation; Hardware; hardware-software
                 codesign; hardware-software cooperative approach;
                 Mathematical model; multicore software development;
                 multiprocessing systems; Parameter estimation;
                 parameter estimation models; power aware computing;
                 profiler data collinearity; program execution;
                 Software; software applications",
  keywords-plus = "POWER",
  number-of-cited-references = "12",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Chen:2015:HSC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kim:2015:ASM,
  author =       "Dae-Hyun Kim and Prashant J. Nair and Moinuddin K.
                 Qureshi",
  title =        "Architectural Support for Mitigating Row Hammering in
                 {DRAM} Memories",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2332177",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "DRAM scaling has been the prime driver of increasing
                 capacity of main memory systems. Unfortunately, lower
                 technology nodes worsen the cell reliability as it
                 increases the coupling between adjacent DRAM cells,
                 thereby exacerbating different failure modes. This
                 paper investigates the reliability problem due to Row
                 Hammering, whereby frequent activations of a given row
                 can cause data loss for its neighboring rows. As DRAM
                 scales to lower technology nodes, the threshold for the
                 number of row activations that causes data loss for the
                 neighboring rows reduces, making Row Hammering a
                 challenging problem for future DRAM chips. To overcome
                 Row Hammering, we propose two architectural solutions:
                 First, Counter-Based Row Activation (CRA), which uses a
                 counter with each row to count the number of row
                 activations. If the count exceeds the row hammering
                 threshold, a dummy activation is sent to neighboring
                 rows proactively to refresh the data. Second,
                 Probabilistic Row Activation (PRA), which obviates
                 storage overhead of tracking and simply allows the
                 memory controller to proactively issue dummy
                 activations to neighboring rows with a small
                 probability for all memory access. Our evaluations show
                 that these solutions are effective at mitigating Row
                 hammering while causing negligible performance loss
                 ($<$ 1 percent).",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, DH (Reprint Author), Georgia Inst Technol, Dept
                 ECE, Atlanta, GA 30363 USA. Kim, Dae-Hyun; Nair,
                 Prashant J.; Qureshi, Moinuddin K., Georgia Inst
                 Technol, Dept ECE, Atlanta, GA 30363 USA.",
  author-email = "dhkim@ece.gatech.edu pnair6@ece.gatech.edu
                 moin@ece.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural support; cell reliability; Computer
                 architecture; counter-based row activation; data
                 errors; data retention; DRAM chips; DRAM memories; DRAM
                 scaling; Dynamic random access memory; Dynamic random
                 access memory, row hammering, data retention, data
                 errors; Leakage currents; Logic gates; Microprocessors;
                 probabilistic row activation; probability; Radiation
                 detectors; Random access memory; reliability;
                 reliability problem; row hammering; Transistors",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "23",
  unique-id =    "Kim:2015:ASM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Nathan:2015:AGC,
  author =       "Ralph Nathan and Daniel J. Sorin",
  title =        "{Argus-G}: Comprehensive, Low-Cost Error Detection for
                 {GPGPU} Cores",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2298391",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "We have developed and evaluated Argus-G, an error
                 detection scheme for general purpose GPU (GPGPU) cores.
                 Argus-G is a natural extension of the Argus error
                 detection scheme for CPU cores, and we demonstrate how
                 to modify Argus such that it is compatible with GPGPU
                 cores. Using an RTL prototype, we experimentally show
                 that Argus-G can detect the vast majority of injected
                 errors at relatively low performance, area, and power
                 costs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nathan, R (Reprint Author), Duke Univ, Durham, NC
                 27708 USA. Nathan, Ralph; Sorin, Daniel J., Duke Univ,
                 Durham, NC 27708 USA.",
  author-email = "ralph.nathan@duke.edu sorin@ee.duke.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Argus-G; Benchmark testing; Conferences; CPU cores;
                 error detection; fault tolerance; general purpose GPU
                 cores; GPGPU cores; Graphics processing units; graphics
                 processing units; Graphics processors; Hardware;
                 Hardware design languages; Instruction sets; low-cost
                 error detection; Registers",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Nathan:2015:AGC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{O:2015:CCI,
  author =       "Seongil O and Sanghyuk Kwon and Young Hoon Son and
                 Yujin Park and Jung Ho Ahn",
  title =        "{CIDR}: A Cache Inspired Area-Efficient {DRAM}
                 Resilience Architecture against Permanent Faults",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2324894",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "area overhead; area-efficient DRAM resilience
                 architecture; Arrays; augmented cache; bit errors;
                 Bloom filter; cache data array; cache storage; cache
                 tags; cache-inspired DRAM resilience architecture;
                 CIDR; Circuit faults; cost-sensitive main-memory DRAM
                 devices; data structures; Decoding; device failure
                 rates; DRAM arrays; DRAM chips; DRAM, error resilience,
                 permanent faults, row and column sparing, Bloom filter,
                 DRAM-side caching; energy overhead minimization; error
                 statistics; fault diagnosis; faulty cells; I/O pads;
                 memory architecture; permanent faults; processor-memory
                 interfaces; Random access memory; Resilience;
                 single-bit error rates; Testing; testing phase",
  xxnote =       "Check: apparent duplicate of entry Seongil:2015:CCI,
                 which has identical title, DOI, volume, number, and
                 pages, but the author name given as ``O. Seongil''
                 instead of ``Seongil O''.",
}

@Article{Seongil:2015:CCI,
  author =       "O. Seongil and Sanghyuk Kwon and Young Hoon Son and
                 Yujin Park and Jung Ho Ahn",
  title =        "{CIDR}: A Cache Inspired Area-Efficient {DRAM}
                 Resilience Architecture against Permanent Faults",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "17--20",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2324894",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Thu Jun 20 17:18:18 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Faulty cells have become major problems in
                 cost-sensitive main-memory DRAM devices. Conventional
                 solutions to reduce device failure rates due to cells
                 with permanent faults, such as populating spare rows
                 and relying on error-correcting codes, have had limited
                 success due to high area overheads. In this paper, we
                 propose CIDR, a novel cache-inspired DRAM resilience
                 architecture, which substantially reduces the area
                 overhead of handling bit errors from these faulty
                 cells. A DRAM device adopting CIDR has a small cache
                 next to its I/O pads to replace accesses to the
                 addresses that include the faulty cells with ones that
                 correspond to the cache data array. We minimize the
                 energy overhead of accessing the cache tags for every
                 read or write by adding a Bloom filter in front of the
                 cache. The augmented cache is programmed once during
                 the testing phase and is out of the critical path on
                 normal accesses because both cache and DRAM arrays are
                 accessed in parallel, making CIDR transparent to
                 existing processor-memory interfaces. Compared to the
                 conventional architecture relying on spare rows, CIDR
                 lowers the area overhead of achieving equal failure
                 rates over a wide range of single-bit error rates, such
                 as $23.6\times$ lower area overhead for a bit-error rate
                 of $10^{-5}$ and a device failure rate of $10^{-3}$.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seongil, O (Reprint Author), Seoul Natl Univ, Dept
                 Transdisciplinary Studies, Seoul, South Korea. Seongil,
                 O.; Kwon, Sanghyuk; Son, Young Hoon; Park, Yujin; Ahn,
                 Jung Ho, Seoul Natl Univ, Dept Transdisciplinary
                 Studies, Seoul, South Korea.",
  author-email = "swdfish@snu.ac.kr kkwon114@snu.ac.kr yhson96@snu.ac.kr
                 comesay@snu.ac.kr gajh@snu.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bloom filter; DRAM; DRAM-side caching; error
                 resilience; permanent faults; row and column sparing",
  number-of-cited-references = "13",
  oa =           "Bronze",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Seongil:2015:CCI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Gupta:2015:CEO,
  author =       "Ujjwal Gupta and Umit Y. Ogras",
  title =        "Constrained Energy Optimization in Heterogeneous
                 Platforms Using Generalized Scaling Models",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "21--25",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2326603",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Platform energy consumption and responsiveness are two
                 major considerations for mobile systems since they
                 determine the battery life and user satisfaction,
                 respectively. We first present models for power
                 consumption, response time and energy consumption of
                 heterogeneous mobile platforms. Then, we use these
                 models to optimize the energy consumption of baseline
                 platforms under response time and temperature
                 constraints with and without introducing new resources.
                 We show that the optimal design choices depend on
                 dynamic power management algorithm, and adding new
                 resources is more energy efficient than scaling
                 existing resources alone.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gupta, U (Reprint Author), Arizona State Univ, Sch
                 Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta,
                 Ujjwal; Ogras, Umit Y., Arizona State Univ, Sch Elect
                 Comp \& Energy Engn, Tempe, AZ 85281 USA.",
  author-email = "ujjwal@asu.edu umit@asu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "battery life determine; Computers; constrained energy
                 optimization; dynamic power management algorithm;
                 Energy consumption; Energy optimization; generalized
                 scaling models; heterogeneous architectures;
                 heterogeneous mobile platforms; Mobile communication;
                 mobile computing; mobile platforms; mobile systems;
                 MpSoC; Multicore processing; Optimization; performance;
                 platform energy consumption; power aware computing;
                 power consumption; Power demand; response time;
                 temperature constraints; Time factors; user
                 satisfaction",
  keywords-plus = "AMDAHLS LAW; MULTIAMDAHL; ACCELERATOR; MANAGEMENT;
                 CPU; ERA",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Gupta:2015:CEO",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Farmahini-Farahani:2015:DAA,
  author =       "Amin Farmahini-Farahani and Jung Ho Ahn and Katherine
                 Morrow and Nam Sung Kim",
  title =        "{DRAMA}: An Architecture for Accelerated Processing
                 Near Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "26--29",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2333735",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Improving energy efficiency is crucial for both mobile
                 and high-performance computing systems while a large
                 fraction of total energy is consumed to transfer data
                 between storage and processing units. Thus, reducing
                 data transfers across the memory hierarchy of a
                 processor (i.e., off-chip memory, on-chip caches, and
                 register file) can greatly improve the energy
                 efficiency. To this end, we propose an architecture,
                 DRAMA, that 3D-stacks coarse-grain reconfigurable
                 accelerators (CGRAs) atop off-chip DRAM devices. DRAMA
                 does not require changes to the DRAM device
                 architecture, apart from through-silicon vias (TSVs)
                 that connect the DRAM device's internal I/O bus to the
                 CGRA layer. We demonstrate that DRAMA can reduce the
                 energy consumption to transfer data across the memory
                 hierarchy by 66--95 percent while achieving speedups of
                 up to $18\times$ over a commodity processor.",
  acknowledgement = ack-nhfb,
  affiliation =  "Farmahini-Farahani, A (Reprint Author), Univ
                 Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr,
                 Madison, WI 53706 USA. Farmahini-Farahani, Amin;
                 Morrow, Katherine; Kim, Nam Sung, Univ Wisconsin, Dept
                 Elect \& Comp Engn, Madison, WI 53706 USA. Ahn, Jung
                 Ho, Seoul Natl Univ, Dept Transdisciplinary Studies,
                 Seoul 151742, South Korea.",
  author-email = "farmahinifar@wisc.edu gajh@snu.ac.kr
                 kati@engr.wisc.edu nskim3@wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "3D-stacking; 3D-stacks coarse-grain reconfigurable
                 accelerators; accelerated near memory processing;
                 Acceleration; accelerator; Arrays; data transfers;
                 DRAM; DRAM chips; DRAM devices; DRAMA architecture;
                 dynamic random access memory; energy conservation;
                 energy consumption reduction; energy efficiency;
                 energy-efficient computing; high-performance computing
                 systems; Kernel; memory hierarchy; Memory management;
                 mobile computing systems; Near memory processing; Near
                 memory processing, DRAM, 3D-stacking, energy-efficient
                 computing, accelerator; processing units; Random access
                 memory; Registers; storage management; storage units;
                 through-silicon vias; total energy fraction; TSV",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Farmahini-Farahani:2015:DAA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Carlson:2015:EPM,
  author =       "Trevor E. Carlson and Siddharth Nilakantan and Mark
                 Hempstead and Wim Heirman",
  title =        "Epoch Profiles: Microarchitecture-Based Application
                 Analysis and Optimization",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "30--33",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329873",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The performance of data-intensive applications, when
                 running on modern multi-and many-core processors, is
                 largely determined by their memory access behavior. Its
                 most important contributors are the frequency and
                 latency of off-chip accesses and the extent to which
                 long-latency memory accesses can be overlapped with
                 useful computation or with each other. In this paper we
                 present two methods to better understand application
                 and microarchitectural interactions. An epoch profile
                 is an intuitive way to understand the relationships
                 between three important characteristics: the on-chip
                 cache size, the size of the reorder window of an
                 out-of-order processor, and the frequency of processor
                 stalls caused by long-latency, off-chip requests
                 (epochs). By relating these three quantities one can
                 more easily understand an application's memory
                 reference behavior and thus significantly reduce the
                 design space. While epoch profiles help to provide
                 insight into the behavior of a single application,
                 developing an understanding of a number of applications
                 in the presence of area and core count constraints
                 presents additional challenges. Epoch-based
                 microarchitectural analysis is presented as a better
                 way to understand the trade-offs for memory-bound
                 applications in the presence of these physical
                 constraints. Through epoch profiling and optimization,
                 one can significantly reduce the multidimensional
                 design space for hardware/software optimization through
                 the use of high-level model-driven techniques.",
  acknowledgement = ack-nhfb,
  affiliation =  "Carlson, TE (Reprint Author), Univ Ghent, Sint
                 Pietersnieuwstr 41, B-9000 Ghent, East Flanders,
                 Belgium. Carlson, Trevor E., Univ Ghent, B-9000 Ghent,
                 East Flanders, Belgium. Nilakantan, Siddharth;
                 Hempstead, Mark, Drexel Univ, Dept Elect \& Comp Engn,
                 Bossone Res Ctr, Philadelphia, PA 19104 USA. Heirman,
                 Wim, Intel Corp, Leuven, Flemish Brabant, Belgium.",
  author-email = "trevor.carlson@elis.ugent.be sn446@drexel.edu
                 mhempstead@drexel.edu wim.heirman@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; Computational modeling; Frequency
                 measurement; memory-level parallelism;
                 Microarchitecture; Microarchitecture analysis; Out of
                 order; System-on-chip; visualization",
  number-of-cited-references = "6",
  oa =           "Green Published",
  ORCID-numbers = "Carlson, Trevor/0000-0001-8742-134X Nilakantan,
                 Siddharth/0000-0003-1067-700X Heirman,
                 Wim/0000-0003-2286-1525",
  research-areas = "Computer Science",
  researcherid-numbers = "Carlson, Trevor/M-4945-2016",
  times-cited =  "0",
  unique-id =    "Carlson:2015:EPM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):34--36 (2015): gem5-gpu, a heterogeneous CPU--GPU
%%% simulator built on gem5 and GPGPU-Sim.
@Article{Power:2015:GGH,
  author =       "Jason Power and Joel Hestness and Marc S. Orr and Mark
                 D. Hill and David A. Wood",
  title =        "{gem5-gpu}: A Heterogeneous {CPU--GPU} Simulator",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "34--36",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2299539",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/pvm.bib",
  abstract =     "gem5-gpu is a new simulator that models tightly
                 integrated CPU-GPU systems. It builds on gem5, a
                 modular full-system CPU simulator, and GPGPU-Sim, a
                 detailed GPGPU simulator. gem5-gpu routes most memory
                 accesses through Ruby, which is a highly configurable
                 memory system in gem5. By doing this, it is able to
                 simulate many system configurations, ranging from a
                 system with coherent caches and a single virtual
                 address space across the CPU and GPU to a system that
                 maintains separate GPU and CPU physical address spaces.
                 gem5-gpu can run most unmodified CUDA 3.2 source code.
                 Applications can launch non-blocking kernels, allowing
                 the CPU and GPU to execute simultaneously. We present
                 gem5-gpu's software architecture and a brief
                 performance validation. We also discuss possible
                 extensions to the simulator. gem5-gpu is open source
                 and available at gem5-gpu.cs.wisc.edu.",
  acknowledgement = ack-nhfb,
  affiliation =  "Power, J (Reprint Author), Univ Wisconsin, Dept Comp
                 Sci, 1210 W Dayton St, Madison, WI 53706 USA. Power,
                 Jason; Hestness, Joel; Orr, Marc S.; Hill, Mark D.;
                 Wood, David A., Univ Wisconsin, Dept Comp Sci, Madison,
                 WI 53706 USA.",
  author-email = "powerjg@cs.wisc.edu hestness@cs.wisc.edu
                 morr@cs.wisc.edu markhill@cs.wisc.edu
                 david@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Coherence; Computational modeling; Computer
                 architecture; computer architecture; gem5-gpu
                 simulator; general-purpose graphics processors;
                 GPGPUSim; Graphics processing units; graphics
                 processing units; heterogeneous (hybrid) systems;
                 heterogeneous CPU-GPU simulator; Kernel; Modeling
                 techniques; modular full-system CPU simulator;
                 nonblocking kernels; Object oriented modeling;
                 Protocols; simulators; software architecture",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "62",
  unique-id =    "Power:2015:GGH",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):37--40 (2015): hardware support (NaCl-RAS) for
%%% return-address prediction of Google Native Client (NaCl) code.
%%% NB: the word ``percent'' after the figures 76.9 and 39.5 in the
%%% abstract was restored (percent signs were dropped in database
%%% extraction, cf. ``85 percent'' earlier in the same abstract);
%%% verify wording against the publisher record
%%% (DOI 10.1109/LCA.2014.2309601).
@Article{Manatunga:2015:HSS,
  author =       "Dilan Manatunga and Joo Hwan Lee and Hyesoon Kim",
  title =        "Hardware Support for Safe Execution of Native Client
                 Applications",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "37--40",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2309601",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Over the past few years, there has been vast growth in
                 the area of the web browser as an applications
                 platform. One example of this trend is Google's Native
                 Client (NaCl) platform, which is a software-fault
                 isolation mechanism that allows the running of native
                 x86 or ARM code on the browser. One of the security
                 mechanisms employed by NaCl is that all branches must
                 jump to the start of a valid instruction. In order to
                 achieve this criteria though, all return instructions
                 are replaced by a specific branch instruction sequence,
                 which we call NaCl returns, that are guaranteed to
                 return to a valid instruction. However, these NaCl
                 returns lose the advantage of the highly accurate
                 return-address stack (RAS) in exchange for the less
                 accurate indirect branch predictor. In this paper, we
                 propose a NaCl-RAS mechanism that can identify and
                 accurately predict 76.9 percent on average compared to
                 the 39.5 percent of a traditional BTB predictor.",
  acknowledgement = ack-nhfb,
  affiliation =  "Manatunga, D (Reprint Author), Georgia Inst Technol,
                 Sch Comp Sci, Atlanta, GA 30332 USA. Manatunga, Dilan;
                 Lee, Joo Hwan; Kim, Hyesoon, Georgia Inst Technol, Sch
                 Comp Sci, Atlanta, GA 30332 USA.",
  author-email = "dmanatunga@gatech.edu joohwan.lee@gatech.edu
                 hyesoon@cc.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; ARM code; Benchmark testing; branch
                 instruction sequence; branch prediction accuracy;
                 branch prediction accuracy; BTB predictor; Detectors;
                 fault diagnosis; Google; Hardware; hardware support;
                 NaCl-RAS mechanism; Native client; native client
                 applications; native x86; online front-ends; return
                 address prediction; return-address stack; safe
                 execution; Security; security mechanism; security of
                 data; Software; software fault isolation;
                 software-fault isolation mechanism; Web browser",
  keywords-plus = "SANDBOX; CODE",
  number-of-cited-references = "5",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Manatunga:2015:HSS",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):41--45 (2015): hPower, a heterogeneous energy
%%% buffering strategy using supercapacitors to handle datacenter
%%% power mismatch.
@Article{Liu:2015:LHP,
  author =       "Longjun Liu and Chao Li and Hongbin Sun and Yang Hu
                 and Jingmin Xin and Nanning Zheng and Tao Li",
  title =        "Leveraging Heterogeneous Power for Improving
                 Datacenter Efficiency and Resiliency",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "41--45",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2363084",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Power mismatching between supply and demand has
                 emerged as a top issue in modern datacenters that are
                 under-provisioned or powered by intermittent power
                 supplies. Recent proposals are primarily limited to
                 leveraging uninterruptible power supplies (UPS) to
                 handle power mismatching, and therefore lack the
                 capability of efficiently handling the irregular peak
                 power mismatches. In this paper we propose hPower, the
                 first heterogeneous energy buffering strategy that
                 incorporates supercapacitors into existing datacenters
                 to handle power mismatch. Our technique exploits power
                 supply diversity and smart load assignment to provide
                 efficiency-aware and emergency-aware power mismatch
                 management. We show that hPower could improve energy
                 efficiency by 30 percent, extend UPS lifetime by 4.3 x,
                 and reduce system downtime by 36 percent. It allows
                 datacenters to adapt themselves to various power supply
                 anomalies, thereby improving operational efficiency and
                 resiliency.",
  acknowledgement = ack-nhfb,
  affiliation =  "Liu, LJ (Reprint Author), Xi An Jiao Tong Univ, Sch
                 Elect \& Informat Engn, Xian 710049, Peoples R China.
                 Liu, Longjun; Sun, Hongbin; Xin, Jingmin; Zheng,
                 Nanning, Xi An Jiao Tong Univ, Sch Elect \& Informat
                 Engn, Xian 710049, Peoples R China. Li, Chao, Shanghai
                 Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200030,
                 Peoples R China. Hu, Yang; Li, Tao, Univ Florida, Dept
                 Elect \& Comp Engn, Gainesville, FL USA.",
  author-email = "longjun.liu@stu.xjtu.edu.cn lichao@cs.sjtu.edu.cn
                 hsun@mail.xjtu.edu.cn huyang.ece@ufl.edu
                 jxin@mail.xjtu.edu.cn nnzheng@mail.xjtu.edu.cn
                 taoli@ece.ufl.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Batteries; computer centres; computer system
                 implementation; Computer System Implementation;
                 computer system implementation; data center efficiency;
                 data center resiliency; efficiency-aware power mismatch
                 management; emergency-aware power mismatch management;
                 energy conservation; Energy efficiency; Energy-aware
                 systems; Energy-Aware Systems; heterogeneous energy
                 buffering strategy; heterogeneous power; hPower;
                 performance of systems; Performance of Systems; power
                 aware computing; Power demand; power mismatching; power
                 supply anomalies; power supply diversity; Servers;
                 smart load assignment; Supercapacitors;
                 supercapacitors; system downtime reduction;
                 uninterruptible power supplies; Uninterruptible power
                 systems; UPS",
  number-of-cited-references = "16",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Liu:2015:LHP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):46--49 (2015): NV2-SRAM, a non-volatile yet
%%% versatile SRAM cell design for versatile cache optimizations.
%%% NB: the garbled abstract figure ``3: 1 x'' was repaired to
%%% ``3.1 x'' (apparent database-extraction artifact; spacing follows
%%% the ``4.3 x'' style used elsewhere in this file); verify against
%%% the publisher record (DOI 10.1109/LCA.2014.2298412).
@Article{Wang:2015:LNV,
  author =       "Rui Wang and Wangyuan Zhang and Tao Li and Depei
                 Qian",
  title =        "Leveraging Non-Volatile Storage to Achieve Versatile
                 Cache Optimizations",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "46--49",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2298412",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The efficiency of caches plays a vital role in
                 microprocessor. In this paper, we introduce a novel and
                 flexible cache substrate that employs non-volatile yet
                 versatile SRAM (NV2-SRAM) cell design, which
                 synergistically integrates new memory devices into the
                 standard SRAM cells. Our experiments show that it can
                 achieve a 67 percent energy saving and 3.1 x
                 reliability improvement over the SRAM based cache,
                 outperforming the drowsy cache design in terms of both
                 power efficiency and reliability. Moreover, the
                 proposed cache architecture can be used to improve the
                 performance of prefetching schemes by 10 percent.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, R (Reprint Author), Beihang Univ, Sch Comp Sci
                 \& Engn, State Key Lab Software Dev Environm, Beijing
                 100191, Peoples R China. Wang, Rui; Qian, Depei,
                 Beihang Univ, Sch Comp Sci \& Engn, State Key Lab
                 Software Dev Environm, Beijing 100191, Peoples R China.
                 Zhang, Wangyuan; Li, Tao, Univ Florida, ECE Dept,
                 Gainesville, FL 32611 USA.",
  author-email = "rui.wang@jsi.buaa.edu.cn zhangwangyuan@gmail.com
                 taoli@ece.ufl.edu depeiq@buaa.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache architecture; Cache memories; cache storage;
                 Computer architecture; energy saving; flexible cache
                 substrate; low-power design; Magnetic tunneling; memory
                 structures; microprocessor; Microprocessors;
                 Nonvolatile memory; nonvolatile storage; nonvolatile
                 yet versatile SRAM cell design; NV2-SRAM cell design;
                 Prefetching; prefetching schemes; reliability
                 improvement; SRAM; SRAM based cache; SRAM cells; SRAM
                 chips; storage management; versatile cache
                 optimizations",
  number-of-cited-references = "19",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Wang:2015:LNV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):50--53 (2015): on-demand branch prediction (ODBP),
%%% using compiler-generated hints to eliminate unnecessary BPU
%%% lookups in out-of-order processors.
%%% NB: the trailing Web-of-Science keyword ``Tin'' looks like an
%%% extraction artifact -- left as received; confirm against the
%%% publisher record if it matters.
@Article{Mohammadi:2015:DDB,
  author =       "Milad Mohammadi and Song Han and Tor M. Aamodt and
                 William J. Dally",
  title =        "On-Demand Dynamic Branch Prediction",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "50--53",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2330820",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In out-of-order (OoO) processors, speculative
                 execution with high branch prediction accuracy is
                 employed to achieve good single thread performance. In
                 these processors the branch prediction unit tables
                 (BPU) are accessed in parallel with the instruction
                 cache before it is known whether a fetch group contains
                 branch instructions. For integer applications, we find
                 85 percent of BPU lookups are done for non-branch
                 operations and of the remaining lookups, 42 percent are
                 done for highly biased branches that can be predicted
                 statically with high accuracy. We evaluate on-demand
                 branch prediction (ODBP), a novel technique that uses
                 compiler generated hints to identify those instructions
                 that can be more accurately predicted statically to
                 eliminate unnecessary BPU lookups. We evaluate an
                 implementation of ODBP that combines static and dynamic
                 branch prediction. For a four wide superscalar
                 processor, ODBP delivers as much as 9 percent
                 improvement in average energy-delay (ED) product, 7
                 percent core average energy saving, and 3 percent
                 speedup. ODBP also enables the use of large BPU's for a
                 given power budget.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mohammadi, M (Reprint Author), Stanford Univ, Dept
                 Elect Engn, Stanford, CA 94305 USA. Mohammadi, Milad;
                 Han, Song; Dally, William J., Stanford Univ, Dept Elect
                 Engn, Stanford, CA 94305 USA. Aamodt, Tor M., Univ
                 British Columbia, Dept Elect \& Comp Engn, Vancouver,
                 BC V6T 1Z4, Canada.",
  author-email = "milad@stanford.edu songhan@stanford.edu
                 aamodt@ece.ubc.ca dally@stanford.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; ahead prediction; BPU lookup; branch
                 instruction; branch prediction accuracy; branch
                 prediction unit table; cache storage; compiler
                 generated hints; Computer architecture; core average
                 energy saving; ED product; Energy efficiency;
                 energy-delay product; energy-delay product
                 optimization; Equations; instruction cache; instruction
                 sets; Mathematical model; nonbranch operation; ODBP;
                 on-demand branch prediction; on-demand dynamic branch
                 prediction; OoO processor; out-of-order processor;
                 parallel processing; Pipelines; power budget; program
                 compilers; Program processors; single thread
                 performance; speculative execution; static and dynamic
                 branch prediction hybrid; static branch prediction;
                 superscalar processor; table lookup; Tin",
  keywords-plus = "MICROPROCESSOR; DESIGN",
  number-of-cited-references = "27",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Mohammadi:2015:DDB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):54--57 (2015): Peripheral Memory, a
%%% software-controlled memory in the I/O domain that offloads I/O
%%% traffic from CPU memory.
@Article{Azriel:2015:PMT,
  author =       "Leonid Azriel and Avi Mendelson and Uri Weiser",
  title =        "Peripheral Memory: A Technique for Fighting Memory
                 Bandwidth Bottleneck",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "54--57",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2319077",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory bottleneck has always been a major cause for
                 limiting the performance of computer systems. While in
                 the past latency was the major concern, today, lack of
                 bandwidth becomes a limiting factor as well, as a
                 result of exploiting more parallelism with the growing
                 number of cores per die, which intensifies the pressure
                 on the memory bus. In such an environment, any
                 additional traffic to memory, such as the I/O traffic
                 may lead to degradation of the overall performance of
                 the system. This work introduces the concept of
                 Peripheral Memory, a software controlled memory that
                 resides in the I/O domain and can be used for
                 offloading I/O traffic from CPU memory. The Peripheral
                 Memory handles `I/O exclusive data', data originated
                 and terminated at I/O domain, and which does not need
                 any processing by the CPU. The paper analyses the
                 impact of I/O traffic on the overall performance of the
                 current systems and demonstrates that in numerous
                 applications, I/O exclusive data occupies major part of
                 memory bandwidth, as a result, degrading CPU processing
                 performance and increasing power. The paper considers
                 four different implementations of the Peripheral
                 Memory: pageable, pinned, non-coherent split-traffic
                 and copy-on-access. Our full-system simulator indicates
                 that non-coherent split traffic configuration is the
                 most efficient implementation, which can provide up to
                 four times speedup in the I/O processing rate for
                 typical I/O intensive applications. In addition, based
                 on Power model and measurements tools, the paper
                 demonstrates that the Peripheral Memory in a server
                 system can lead to reduction of tens of Watts in the
                 overall system power consumption or 10-20 percent of
                 the system power budget.",
  acknowledgement = ack-nhfb,
  affiliation =  "Azriel, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
                 Azriel, Leonid; Mendelson, Avi; Weiser, Uri, Technion
                 Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
                 Israel.",
  author-email = "leonida@tx.technion.ac.il
                 avi.mendelson@tce.technion.ac.il
                 uri.weiser@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Bandwidth; bandwidth allocation; Benchmark testing;
                 computer system performance; CPU memory; full-system
                 simulator; I/O domain; I/O traffic offloading;
                 input/output devices; Instruction sets; interconnection
                 architectures; main memory; memory bandwidth
                 bottleneck; memory bus; Memory management; parallelism;
                 performance evaluation; Performance evaluation;
                 peripheral memory; Power demand; Power measurement;
                 server system; software controlled memory; storage
                 management; system buses",
  keywords-plus = "NETWORK; I/O",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "1",
  unique-id =    "Azriel:2015:PMT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):58--61 (2015): persistent transactional memory
%%% (PTM), adding durability to transactional memory via emerging
%%% non-volatile memory.
@Article{Wang:2015:PTM,
  author =       "Zhaoguo Wang and Han Yi and Ran Liu and Mingkai Dong
                 and Haibo Chen",
  title =        "Persistent Transactional Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "58--61",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329832",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper proposes persistent transactional memory
                 (PTM), a new design that adds durability to
                 transactional memory (TM) by incorporating with the
                 emerging non-volatile memory (NVM). PTM dynamically
                 tracks transactional updates to cache lines to ensure
                 the ACI (atomicity, consistency and isolation)
                 properties during cache flushes and leverages an undo
                 log in NVM to ensure PTM can always consistently
                 recover transactional data structures from a machine
                 crash. This paper describes the PTM design based on
                 Intel's restricted transactional memory. A preliminary
                 evaluation using a concurrent key/value store and a
                 database with a cache-based simulator shows that the
                 additional cache line flushes are small.",
  acknowledgement = ack-nhfb,
  affiliation =  "Wang, ZG (Reprint Author), Shanghai Jiao Tong Univ,
                 Shanghai Key Lab Scalable Comp \& Syst, Shanghai
                 200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
                 Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
                 Univ, Shanghai Key Lab Scalable Comp \& Syst, Shanghai
                 200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
                 Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
                 Univ, Inst Parallel \& Distributed Syst, Shanghai
                 200030, Peoples R China.",
  author-email = "tigerwang1986@gmail.com ken.yihan1990@gmail.com
                 naruilone@gmail.com mingkaidong@gmail.com
                 haibochen@sjtu.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "ACI properties; Batteries; cache line flushes; cache
                 storage; cache-based simulator; Computer crashes; Data
                 structures; Databases; Hardware; Hardware transactional
                 memory; non-volatile random access memory; Nonvolatile
                 memory; nonvolatile memory; NVM; persistent
                 transactional memory; PTM design; Registers",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Wang:2015:PTM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% IEEE CAL 14(1):62--65 (2015): next-generation performance
%%% monitoring units (PMUs) for profiling runtime managed code via
%%% hardware/software collaboration.
@Article{Gibert:2015:PSR,
  author =       "Enric Gibert and Raul Mart{\'\i}nez and Carlos
                 Madriles and Josep M. Codina",
  title =        "Profiling Support for Runtime Managed Code: Next
                 Generation Performance Monitoring Units",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "62--65",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2321398",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Given the increase of runtime managed code
                 environments in desktop, server, and mobile segments,
                 agile, flexible, and accurate performance monitoring
                 capabilities are required in order to perform wise code
                 transformations and optimizations. Common profiling
                 strategies, mainly based on instrumentation and current
                 performance monitoring units (PMUs), are not adequate
                 and new innovative designs are necessary. In this
                 paper, we present the desired characteristics of what
                 we call next generation PMUs and advocate for
                 hardware/software collaborative approaches where
                 hardware implements the profiling hooks and mechanisms
                 and software implements the complex heuristics. We then
                 propose a first design in which the hardware uses a
                 small, yet flexible table to profile specific code
                 regions and the software decides what/when/how to
                 profile. This first design meets all required features
                 and we aim it as the seed for future PMUs extensions to
                 enable novel dynamic code transformations and
                 optimizations.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gibert, E (Reprint Author), Intel Corp, Intel Labs,
                 Intel Barcelona Res Ctr IBRC, Edifici Nexus 2, Planta
                 0-D, Jordi Girona 29, Barcelona, Spain. Gibert, Enric;
                 Martinez, Raul; Madriles, Carlos; Codina, Josep M.,
                 Intel Corp, Intel Labs, Intel Barcelona Res Ctr IBRC,
                 Barcelona, Spain.",
  author-email = "enric.gibert.codina@intel.com raul.martinez@intel.com
                 carlos.madriles.gimeno@intel.com
                 josep.m.codina@intel.com",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "dynamic code optimizations; dynamic code
                 transformations; groupware; Hardware; hardware-software
                 collaborative approaches; instrumentation; Instruments;
                 just in time (JIT) compiler; Monitoring; next
                 generation performance monitoring units; optimising
                 compilers; Optimization; Performance monitoring unit
                 (PMU); Phasor measurement units; PMUs; profiling;
                 profiling hooks; profiling support; Runtime; runtime
                 managed code; runtime managed code environments;
                 Software; software performance evaluation; system
                 monitoring",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Gibert:2015:PSR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% You and Chung (Hanyang Univ.): QoS-aware DVFS for embedded GPUs;
%%% IEEE CAL 14(1), Jan--Jun 2015, pp. 66--69.
@Article{You:2015:QSA,
  author =       "Daecheol You and Ki-Seok Chung",
  title =        "Quality of Service-Aware Dynamic Voltage and Frequency
                 Scaling for Embedded {GPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "66--69",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2319079",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Dynamic voltage and frequency scaling (DVFS) is a key
                 technique for reducing processor power consumption in
                 mobile devices. In recent years, mobile system-on-chips
                 (SoCs) has supported DVFS for embedded graphics
                 processing units (GPUs) as the processing power of
                 embedded GPUs has been increasing steadily. The major
                 challenge of applying DVFS to a processing unit is to
                 meet the quality of service (QoS) requirement while
                 achieving a reasonable power reduction. In the case of
                 GPUs, the QoS requirement can be specified as the
                 frame-per-second (FPS) which the target GPU should
                 achieve. The proposed DVFS technique ensures a
                 consistent GPU performance by scaling the operating
                 clock frequency in a way that it maintains a uniform
                 FPS.",
  acknowledgement = ack-nhfb,
  affiliation =  "You, D (Reprint Author), Hanyang Univ, Dept Elect Comp
                 \& Commun Engn, Embedded Syst Chip Lab, Seoul 133791,
                 South Korea. You, Daecheol; Chung, Ki-Seok, Hanyang
                 Univ, Dept Elect Comp \& Commun Engn, Embedded Syst
                 Chip Lab, Seoul 133791, South Korea.",
  author-email = "khsrdc@hanyang.ac.kr kchung@hanyang.ac.kr",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; Clocks; Correlation; DVFS; dynamic
                 voltage scaling; embedded GPU; Energy consumption;
                 energy-aware systems; frequency scaling; graphics
                 processing unit; Graphics processing units; graphics
                 processing units; Graphics processors;
                 hardware/software interfaces; low-power design; mobile
                 device; mobile system-on-chips; operating clock
                 frequency; power aware computing; processor power
                 consumption; Quality of service; quality of service;
                 SoC; System-on-chip; system-on-chip",
  number-of-cited-references = "9",
  research-areas = "Computer Science",
  times-cited =  "9",
  unique-id =    "You:2015:QSA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Lee, Kim, and Arvind (MIT / Seoul Natl. Univ.): refactored I/O
%%% architecture for flash storage; IEEE CAL 14(1), 2015, pp. 70--74.
%%% NOTE(review): ``Arvind'' is a mononym and is intentionally entered
%%% as a bare single name in the author field.
@Article{Lee:2015:RDA,
  author =       "Sungjin Lee and Jihong Kim and Arvind",
  title =        "Refactored Design of {I/O} Architecture for Flash
                 Storage",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "70--74",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2329423",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Flash storage devices behave quite differently from
                 hard disk drives (HDDs); a page on flash has to be
                 erased before it can be rewritten, and the erasure has
                 to be performed on a block which consists of a large
                 number of contiguous pages. It is also important to
                 distribute writes evenly among flash blocks to avoid
                 premature wearing. To achieve interoperability with
                 existing block I/O subsystems for HDDs, NAND flash
                 devices employ an intermediate software layer, called
                 the flash translation layer (FTL), which hides these
                 differences. Unfortunately, FTL implementations require
                 powerful processors with a large amount of DRAM in
                 flash controllers and also incur many unnecessary I/O
                 operations which degrade flash storage performance and
                 lifetime. In this paper, we present a refactored design
                 of I/O architecture for flash storage which
                 dramatically increases storage performance and lifetime
                 while decreasing the cost of the flash controller. In
                 comparison with page-level FTL, our preliminary
                 experiments show a reduction of 19 percent in I/O
                 operations, improvement of I/O performance by 9 percent
                 and storage lifetime by 36 percent. In addition, our
                 scheme uses only 1/128 DRAM memory in the flash
                 controller.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lee, S (Reprint Author), MIT, 77 Massachusetts Ave,
                 Cambridge, MA 02139 USA. Lee, Sungjin; Arvind, MIT,
                 Cambridge, MA 02139 USA. Kim, Jihong, Seoul Natl Univ,
                 Sch Comp Sci \& Engn, Seoul, South Korea.",
  author-email = "chamdoo@gmail.com jihong@davinci.snu.ac.kr
                 arvind@csail.mit.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Benchmark testing; block I/O subsystems; Computer
                 architecture; DRAM chips; DRAM memory; file systems;
                 flash blocks; flash memories; flash storage; flash
                 translation layer; hard disk drives; HDDs; I/O
                 architecture; I/O architectures; input-output programs;
                 intermediate software layer; interoperability; NAND
                 circuits; NAND flash devices; NAND flash memory;
                 page-level FTL; Performance evaluation; premature
                 wearing; Random access memory; Runtime; Storage
                 management; Storage systems",
  number-of-cited-references = "15",
  research-areas = "Computer Science",
  times-cited =  "7",
  unique-id =    "Lee:2015:RDA",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Yuan, Ji, and Zhu (Harbin Inst. Tech.): set-granular regional
%%% distributed cooperative caching (SRDCC); IEEE CAL 14(1), 2015,
%%% pp. 75--78.
%%% NOTE(review): author-email carries only two addresses for three
%%% authors (Zhu's is absent) --- confirm against the upstream record.
@Article{Yuan:2015:SGR,
  author =       "Fengkai Yuan and Zhenzhou Ji and Suxia Zhu",
  title =        "Set-Granular Regional Distributed Cooperative
                 Caching",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "75--78",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2319258",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The last level cache (LLC) in private configurations
                 offer lower latency and isolation but extinguishes the
                 possibility of sharing underutilized cache resources.
                 Cooperative Caching (CC) provides capacity sharing by
                 spilling a line evicted from one cache to another.
                 Current studies focus on efficient capacity sharing,
                 while the adaptability of CC to manycore environment
                 deserves more attentions. In this paper, we present
                 Set-granular Regional Distributed Cooperative Caching
                 to optimize CC in manycore CMPs with private LLCs. We
                 achieve efficient capacity sharing by a low-traffic
                 global receiver tracking mechanism and provide a method
                 to manage set-grain cache state transitions for
                 exclusive LLCs. Experiment results show that SRDCC
                 performs better than baseline system, running different
                 workloads varying in receiver-spiller number and
                 distribution, in execution time up to 15.55 percent and
                 memory access up to 40.25 percent, at a negligible cost
                 of network traffics (6.21 percent more than baseline
                 system at worst).",
  acknowledgement = ack-nhfb,
  affiliation =  "Yuan, FK (Reprint Author), Harbin Inst Technol, Sch
                 Comp Sci \& Technol, Harbin 150006, Heilongjiang,
                 Peoples R China. Yuan, Fengkai; Ji, Zhenzhou; Zhu,
                 Suxia, Harbin Inst Technol, Sch Comp Sci \& Technol,
                 Harbin 150006, Heilongjiang, Peoples R China.",
  author-email = "yuan.fengkai@gmail.com jizhenzhou@hit.edu.cn",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache coherence protocol; cache resource sharing;
                 Cache storage; cache storage; capacity sharing; CC;
                 chip multiprocessors; cooperative caching; Cooperative
                 caching; last level cache; LLC; manycore CMP;
                 multiprocessing systems; on-chip networks; private
                 cache configuration; Protocols; Radiation detectors;
                 receiver-spiller distribution; receiver-spiller number;
                 Receivers; set-grain cache state transition;
                 set-granular regional distributed cooperative caching;
                 Telecommunication traffic; Tiled CMP",
  keywords-plus = "CHIP MULTIPROCESSORS",
  number-of-cited-references = "9",
  ORCID-numbers = "Yuan, Fengkai/0000-0003-2615-8642",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Yuan:2015:SGR",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Lee, Kim, Kim, and Shipman: synchronous independent write (SIW)
%%% cache scheduling for SSD RAID arrays; IEEE CAL 14(1), 2015,
%%% pp. 79--82.  The repeated keyword variants (e.g. three casings of
%%% ``Redundant array of independent disks (RAID)'') are carried over
%%% verbatim from the indexing source.
@Article{Lee:2015:SSI,
  author =       "Junghee Lee and Youngjae Kim and Jongman Kim and Galen
                 M. Shipman",
  title =        "Synchronous {I/O} Scheduling of Independent Write
                 Caches for an Array of {SSDs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "79--82",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2298394",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Solid-state drives (SSD) offer a significant
                 performance improvement over the hard disk drives
                 (HDD), however, it can exhibit a significant variance
                 in latency and throughput due to internal garbage
                 collection (GC) process on the SSD. When the SSDs are
                 configured in a RAID, the performance variance of
                 individual SSDs could significantly degrade the overall
                 performance of the RAID of SSDs. The internal cache on
                 the RAID controller can help mitigate the performance
                 variability issues of SSDs in the array; however, the
                 state-of-the-art cache algorithm of the RAID controller
                 does not consider the characteristics of SSDs. In this
                 paper, we examine the most recent write cache algorithm
                 for the array of disks, and propose a synchronous
                 independent write cache (SIW) algorithm. We also
                 present a pre-parity-computation technique for the RAID
                 of SSDs with parity computations, which calculates
                 parities of blocks in advance before they are stored in
                 the write cache. With this new technique, we propose a
                 complete paradigm shift in the design of write cache.
                 In our evaluation study, large write requests dominant
                 workloads show up to about 50 and 20 percent
                 improvements in average response times on RAID-0 and
                 RAID-5 respectively as compared to the state-of-the-art
                 write cache algorithm.",
  acknowledgement = ack-nhfb,
  affiliation =  "Lee, J (Reprint Author), Univ Texas San Antonio, San
                 Antonio, TX 78229 USA. Lee, Junghee, Univ Texas San
                 Antonio, San Antonio, TX 78229 USA. Kim, Youngjae, Ajou
                 Univ, Suwon 441749, South Korea. Kim, Jongman, Georgia
                 Inst Technol, Atlanta, GA 30332 USA. Shipman, Galen M.,
                 Oak Ridge Natl Lab, Oak Ridge, TN USA.",
  author-email = "junghee.lee@utsa.edu youkim@gmail.com
                 jkim@ece.gatech.edu gshipman@ornl.gov",
  da =           "2019-06-20",
  doc-delivery-number = "CL1QK",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Algorithm design and analysis; Arrays; cache storage;
                 Delays; disks array; flash memory; GC process; hard
                 disk drives; HDD; I/O scheduling; independent write
                 caches; input-output programs; internal cache; internal
                 garbage collection process; memory architecture;
                 pre-parity-computation technique; RAID; RAID
                 controller; Redundant array of independent disks
                 (RAID); Redundant Array of Independent Disks (RAID);
                 Redundant array of independent disks (RAID);
                 scheduling; SIW algorithm; solid-state drive (SSD);
                 Solid-State Drive (SSD); solid-state drive (SSD);
                 solid-state drives; SSD; Strips; Synchronization;
                 synchronous I/O scheduling; synchronous independent
                 write cache algorithm; Time factors; write cache; Write
                 cache; write cache; write cache design; write
                 requests",
  number-of-cited-references = "8",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Lee:2015:SSI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Anonymous one-page item (p. 83), ``Rock Stars of Wearables'';
%%% presumably an announcement page --- no abstract or keyword
%%% metadata is available for it.
@Article{Anonymous:2015:RSW,
  author =       "Anonymous",
  title =        "Rock Stars of Wearables",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "83--83",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2447192",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Anonymous one-page item (p. 84), ``Rock Stars of Cybersecurity
%%% 2015 Conference''; presumably an announcement page --- no abstract
%%% or keyword metadata is available for it.
@Article{Anonymous:2015:RSC,
  author =       "Anonymous",
  title =        "Rock Stars of Cybersecurity 2015 Conference",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "84--84",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2447191",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Issue table of contents, cover page C1 of volume 14, number 1.
@Article{Anonymous:2015:TCa,
  author =       "Anonymous",
  title =        "Table of Contents",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C1--C1",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446391",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Editorial-board listing, cover page C2 of volume 14, number 1.
%%% The outer brace pair in the title protects the whole phrase from
%%% style-driven downcasing; \booktitle{} marks the journal name.
@Article{Anonymous:2015:ICAa,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Editorial Board}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C2--C2",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446392",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Information-for-authors page, cover page C3 of volume 14, number 1.
%%% Fix: the closing brace of the title's protective group previously
%%% sat directly after \booktitle{...} ({{\booktitle{...}}} Information
%%% for Authors), leaving ``Information for Authors'' outside the
%%% brace group and subject to style-driven downcasing.  The brace is
%%% moved to the end of the phrase, matching the companion entry
%%% Anonymous:2015:ICAa for the editorial board.
@Article{Anonymous:2015:ICAb,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C3--C3",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446393",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% IEEE Computer Society page, cover page C4 of volume 14, number 1.
@Article{Anonymous:2015:ICSa,
  author =       "Anonymous",
  title =        "{IEEE Computer Society}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "1",
  pages =        "C4--C4",
  month =        jan # "\slash " # jun,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2446394",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

%%% Shi, Hoffmann, and Khan: declarative resilience --- trading program
%%% accuracy against soft-error resilience overheads on multicores;
%%% IEEE CAL 14(2), Jul--Dec 2015, pp. 85--89.
%%% Fix: the indexing-database export renders the TeX tilde operator
%%% (\sim, ``about'') as the literal words ``similar to''; the two
%%% occurrences in the abstract (``(similar to 2x)'' and ``is similar
%%% to 1.38x'') are restored to $\sim$ math notation.
@Article{Shi:2015:CLM,
  author =       "Qingchuan Shi and Henry Hoffmann and Omer Khan",
  title =        "A Cross-Layer Multicore Architecture to Tradeoff
                 Program Accuracy and Resilience Overheads",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "85--89",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2365204",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib",
  abstract =     "To protect multicores from soft-error perturbations,
                 resiliency schemes have been developed with high
                 coverage but high power/performance overheads
                 ($\sim$2x). We observe that not all soft-errors affect
                 program correctness, some soft-errors only affect
                 program accuracy, i.e., the program completes with
                 certain acceptable deviations from soft-error free
                 outcome. Thus, it is practical to improve processor
                 efficiency by trading off resilience overheads with
                 program accuracy. We propose the idea of declarative
                 resilience that selectively applies resilience schemes
                 to both crucial and non-crucial code, while ensuring
                 program correctness. At the application level, crucial
                 and non-crucial code is identified based on its impact
                 on the program outcome. The hardware collaborates with
                 software support to enable efficient resilience with
                 100 percent soft-error coverage. Only program accuracy
                 is compromised in the worst-case scenario of a
                 soft-error strike during non-crucial code execution.
                 For a set of multithreaded benchmarks, declarative
                 resilience improves completion time by an average of 21
                 percent over state-of-the-art hardware resilience
                 scheme that protects all executed code. Its performance
                 overhead is $\sim$1.38x over a multicore that does
                 not support resilience.",
  acknowledgement = ack-nhfb,
  affiliation =  "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect
                 \& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan;
                 Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn,
                 Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago,
                 Dept Comp Sci, Chicago, IL 60637 USA.",
  author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu
                 khan@uconn.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; Benchmark testing; code execution;
                 Instruction sets; multi-threading; multicore
                 architecture; Multicore processing; multicores;
                 multithreaded benchmark; program accuracy; Resilience;
                 resilience overhead; Soft errors; soft-error
                 perturbation; soft-errors; software architecture;
                 software fault tolerance",
  number-of-cited-references = "23",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Shi:2015:CLM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Zheng, Wang, and Lipasti: adaptive cache and concurrency
%%% allocation (CCA) on GPGPUs; IEEE CAL 14(2), 2015, pp. 90--93.
%%% Carries funding-acknowledgement/funding-text fields from the
%%% indexing source; the escaped \_ in the author-email address is
%%% required in a classic BibTeX field.
@Article{Zheng:2015:ACC,
  author =       "Zhong Zheng and Zhiying Wang and Mikko Lipasti",
  title =        "Adaptive Cache and Concurrency Allocation on
                 {GPGPUs}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "90--93",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2359882",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Memory bandwidth is critical to GPGPU performance.
                 Exploiting locality in caches can better utilize memory
                 bandwidth. However, memory requests issued by excessive
                 threads cause cache thrashing and saturate memory
                 bandwidth, degrading performance. In this paper, we
                 propose adaptive cache and concurrency allocation (CCA)
                 to prevent cache thrashing and improve the utilization
                 of bandwidth and computational resources, hence
                 improving performance. According to locality and reuse
                 distance of access patterns in GPGPU program, warps on
                 a stream multiprocessor are dynamically divided into
                 three groups: cached, bypassed, and waiting. The data
                 cache accommodates the footprint of cached warps.
                 Bypassed warps cannot allocate cache lines in the data
                 cache to prevent cache thrashing, but are able to take
                 advantage of available memory bandwidth and
                 computational resource. Waiting warps are de-scheduled.
                 Experimental results show that adaptive CCA can
                 significant improve benchmark performance, with 80
                 percent harmonic mean IPC improvement over the
                 baseline.",
  acknowledgement = ack-nhfb,
  affiliation =  "Zheng, Z (Reprint Author), Natl Univ Def Technol,
                 State Key Lab High Performance Comp, Changsha, Hunan,
                 Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ
                 Def Technol, State Key Lab High Performance Comp,
                 Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang,
                 Zhiying, Natl Univ Def Technol, Sch Comp, Changsha,
                 Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin,
                 Dept Elect \& Comp Engn, Madison, WI 54706 USA.",
  author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn
                 mikko@engr.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC
                 [61070037, 61272143, 61272144, 61103016, 61202121];
                 NUDT [B120607]; RFDP [20114307120013]; NSF
                 [CCF-1318298]",
  funding-text = "This work was partially supported by CSC, 863 Program
                 (2012AA010905), NSFC (61070037, 61272143, 61272144,
                 61103016, 61202121), NUDT(B120607), RFDP
                 (20114307120013), and NSF (CCF-1318298).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "access patterns; adaptive cache-and-concurrency
                 allocation; Bandwidth; bandwidth utilization
                 improvement; benchmark performance improvement;
                 Benchmark testing; bypassed warps; cache; cache lines;
                 cache locality; Cache memory; cache storage; cache
                 thrashing prevention; cached warps; CCA; computational
                 resource utilization improvement; concurrency;
                 concurrency control; Concurrent computing; GPGPU; GPGPU
                 performance improvement; graphics processing units;
                 harmonic mean IPC improvement; Instruction sets; memory
                 bandwidth saturation; multi-threading; multiprocessing
                 systems; performance evaluation; Resource management;
                 reuse distance; stream multiprocessor; waiting warp
                 descheduling",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "4",
  unique-id =    "Zheng:2015:ACC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Nowatzki, Govindaraju, and Sankaralingam (Univ. Wisconsin):
%%% transformable dependence graph (TDG) program representation for
%%% analyzing hardware specialization; IEEE CAL 14(2), 2015,
%%% pp. 94--98.
@Article{Nowatzki:2015:GBP,
  author =       "Tony Nowatzki and Venkatraman Govindaraju and
                 Karthikeyan Sankaralingam",
  title =        "A Graph-Based Program Representation for Analyzing
                 Hardware Specialization Approaches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "94--98",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2476801",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware specialization has emerged as a promising
                 paradigm for future microprocessors. Unfortunately, it
                 is natural to develop and evaluate such architectures
                 within end-to-end vertical silos spanning application,
                 language/compiler, hardware design and evaluation
                 tools, leaving little opportunity for
                 cross-architecture analysis and innovation. This paper
                 develops a novel program representation suitable for
                 modeling heterogeneous architectures with specialized
                 hardware, called the transformable dependence graph
                 (TDG), which combines semantic information about
                 program properties and low-level hardware events in a
                 single representation. We demonstrate, using four
                 example architectures from the literature, that the TDG
                 is a feasible, simple, and accurate modeling technique
                 for transparent specialization architectures, enabling
                 cross-domain comparison and design-space exploration.",
  acknowledgement = ack-nhfb,
  affiliation =  "Nowatzki, T (Reprint Author), Univ Wisconsin, Dept
                 Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA.
                 Nowatzki, Tony; Govindaraju, Venkatraman;
                 Sankaralingam, Karthikeyan, Univ Wisconsin, Dept Comp
                 Sci, Madison, WI 53706 USA.",
  author-email = "tjn@cs.wisc.edu venkatra@cs.wisc.edu
                 karu@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Acceleration; accelerators; computer architecture;
                 Computer architecture; dependence graphs; graph theory;
                 graph-based program representation; Hardware
                 specialization; hardware specialization approach;
                 heterogeneous architecture modeling; Load modeling;
                 Microarchitecture; microprocessors; Microprocessors;
                 modelling; program representation; Specialization;
                 Specialization, accelerators, modelling, program
                 representation, dependence graphs; TDG; transformable
                 dependence graph; Transforms",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "2",
  unique-id =    "Nowatzki:2015:GBP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

%%% Performance--energy model for frequency-boosted single-thread
%%% execution on symmetric multicore processors; CAL 14(2), 2015.
@Article{Kim:2015:PEM,
  author =       "Seung Hun Kim and Dohoon Kim and Changmin Lee and Won
                  Seob Jeong and Won Woo Ro and Jean-Luc Gaudiot",
  title =        "A Performance-Energy Model to Evaluate Single Thread
                  Execution Acceleration",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "99--102",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2368144",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "It is well known that the cost of executing the
                  sequential portion of a program will limit and
                  sometimes even eclipse the gains brought by processing
                  in parallel the rest of the program. This means that
                  serious consideration should be brought to bear on
                  accelerating the execution of this unavoidable
                  sequential part. Such acceleration can be done by
                  boosting the operating frequency in a symmetric
                  multicore processor. In this paper, we derive a
                  performance and power model to describe the
                  implications of this approach. From our model, we show
                  that the ratio of performance over energy during the
                  sequential part improves with an increase in the number
                  of cores. In addition, we demonstrate how to determine
                  with the proposed model the optimal frequency boosting
                  ratio which maximizes energy efficiency.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kim, SH (Reprint Author), Yonsei Univ, Sch Elect \&
                  Elect Engn, Seoul 120749, South Korea. Kim, Seung Hun;
                  Kim, Dohoon; Lee, Changmin; Jeong, Won Seob; Ro, Won
                  Woo, Yonsei Univ, Sch Elect \& Elect Engn, Seoul
                  120749, South Korea. Gaudiot, Jean-Luc, Univ Calif
                  Irvine, Dept Elect Engn \& Comp Sci, Irvine, CA USA.",
  author-email = "kseunghun@gmail.com dohoon.kim@yonsei.ac.kr
                  exahz@yonsei.ac.kr ws.jeong@yonsei.ac.kr
                  wro@yonsei.ac.kr gaudiot@uci.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                  National Research Foundation of Korea (NRF) ---
                  Ministry of Education [2010-0013202]; National Science
                  Foundation [CCF-1439165]",
  funding-text = "This work was supported in part by the Basic Science
                  Research Program through the National Research
                  Foundation of Korea (NRF) funded by the Ministry of
                  Education (2010-0013202) and by the National Science
                  Foundation, under award CCF-1439165. Any opinions,
                  findings, and conclusions expressed in this material
                  are those of the authors and do not necessarily reflect
                  the views of the sponsors. W. W. Ro is the
                  corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "energy efficiency; Energy management; energy-aware
                  systems; Mathematical model; Microprocessors; Multicore
                  processing; multiprocessing systems; multiprocessor
                  systems; optimal frequency boosting ratio; parallel
                  processing; performance evaluation; Performance
                  evaluation; Performance modeling; performance-energy
                  model; power aware computing; Power demand; single
                  thread execution acceleration; symmetric multicore
                  processor",
  keywords-plus = "AMDAHLS LAW; ERA",
  number-of-cited-references = "11",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kim:2015:PEM",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Lifetime-reliability characterization and variance-reduction
%%% management (DVFS, thread swapping) for many-core processors;
%%% CAL 14(2), 2015.
@Article{Song:2015:ARL,
  author =       "William Song and Saibal Mukhopadhyay and Sudhakar
                  Yalamanchili",
  title =        "Architectural Reliability: Lifetime Reliability
                  Characterization and Management of Many-Core
                  Processors",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "103--106",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2340873",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "This paper presents a lifetime reliability
                  characterization of many-core processors based on a
                  full-system simulation of integrated microarchitecture,
                  power, thermal, and reliability models. Under normal
                  operating conditions, our model and analysis reveal
                  that the mean-time-to-failure of cores on the die show
                  normal distribution. From the processor-level
                  perspective, the key insight is that reducing the
                  variance of the distribution can improve lifetime
                  reliability by avoiding early failures. Based on this
                  understanding, we present two variance reduction
                  techniques for proactive reliability management; (i)
                  proportional dynamic voltage-frequency scaling (DVFS)
                  and (ii) coordinated thread swapping. A major advantage
                  of using variance reduction techniques is that the
                  improvement of system lifetime reliability can be
                  achieved without adding design margins or spare
                  components.",
  acknowledgement = ack-nhfb,
  affiliation =  "Song, W (Reprint Author), Georgia Inst Technol, Sch
                  Elect \& Comp Engn, Atlanta, GA 30332 USA. Song,
                  William; Mukhopadhyay, Saibal; Yalamanchili, Sudhakar,
                  Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta,
                  GA 30332 USA.",
  author-email = "wjhsong@gatech.edu saibal.mukhopadhyay@ece.gatech.edu
                  sudha.yalamanchili@ece.gatech.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Semiconductor Research Corporation
                  [2084.001]; IBM/SRC Graduate Fellowship; Sandia
                  National Laboratories",
  funding-text = "This research was supported by the Semiconductor
                  Research Corporation under task \#2084.001, IBM/SRC
                  Graduate Fellowship, and Sandia National
                  Laboratories.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "architectural reliability; Benchmark testing; Computer
                  architecture; Computer architecture, lifetime
                  estimation, modeling, semiconductor device reliability,
                  simulation; coordinated thread swapping; core
                  mean-time-to-failure; Degradation; design margins;
                  DVFS; full-system simulation; Gaussian distribution;
                  integrated circuit design; Integrated circuit
                  reliability; integrated microarchitecture; lifetime
                  estimation; lifetime reliability characterization;
                  many-core processors; Microarchitecture; microprocessor
                  chips; modeling; multiprocessing systems; normal
                  operating conditions; power aware computing; power
                  models; Program processors; proportional dynamic
                  voltage-frequency scaling; reliability models;
                  semiconductor device reliability; simulation; spare
                  components; thermal models; variance reduction
                  techniques",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "3",
  unique-id =    "Song:2015:ARL",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% STNR: soft-error-tolerant NoC router pipeline using idle-cycle
%%% time redundancy; CAL 14(2), 2015.  (Duplicate keywords with
%%% differing case come from the merged IEEE/WoS keyword lists.)
@Article{Poluri:2015:SET,
  author =       "Pavan Poluri and Ahmed Louri",
  title =        "A Soft Error Tolerant Network-on-Chip Router Pipeline
                  for Multi-Core Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "107--110",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360686",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Network-on-Chip (NoC) paradigm is rapidly evolving
                  into an efficient interconnection network to handle the
                  strict communication requirements between the
                  increasing number of cores on a single chip.
                  Diminishing transistor size is making the NoC
                  increasingly vulnerable to both hard faults and soft
                  errors. This paper concentrates on soft errors in NoCs.
                  A soft error in an NoC router results in significant
                  consequences such as data corruption, packet
                  retransmission and deadlock among others. To this end,
                  we propose Soft Error Tolerant NoC Router (STNR)
                  architecture, that is capable of detecting and
                  recovering from soft errors occurring in different
                  control stages of the routing pipeline. STNR exploits
                  the use of idle cycles inherent in NoC packet routing
                  pipeline to perform time redundant executions necessary
                  for soft error tolerance. In doing so, STNR is able to
                  detect and correct all single transient faults in the
                  control stages of the pipeline. Simulation results
                  using PARSEC and SPLASH-2 benchmarks show that STNR is
                  able to accomplish such high level of soft error
                  protection with a minimal impact on latency (an
                  increase of 1.7 and 1.6 percent respectively).
                  Additionally, STNR incurs an area overhead of 7 percent
                  and power overhead of 13 percent as compared to the
                  baseline unprotected router.",
  acknowledgement = ack-nhfb,
  affiliation =  "Poluri, P (Reprint Author), Univ Arizona, Dept Elect
                  \& Comp Engn, Tucson, AZ 85721 USA. Poluri, Pavan;
                  Louri, Ahmed, Univ Arizona, Dept Elect \& Comp Engn,
                  Tucson, AZ 85721 USA.",
  author-email = "pavanp@email.arizona.edu louri@email.arizona.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "US National Science Foundation (NSF)
                  [CNS-1318997, ECCS-0725765, ECCS-1342702,
                  CCF-1420681]",
  funding-text = "This research was supported by US National Science
                  Foundation (NSF) awards CNS-1318997, ECCS-0725765,
                  ECCS-1342702 and CCF-1420681.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Computer architecture; data corruption; deadlock;
                  fault tolerance; hard faults; idle cycles; integrated
                  circuit reliability; interconnection network; Multicore
                  processing; multicore systems; multiprocessing systems;
                  network routing; Network-on-chip; network-on-chip;
                  Network-on-chip; NoC packet routing pipeline; packet
                  retransmission; PARSEC; performance; Pipelines; Ports
                  (Computers); radiation hardening (electronics);
                  reliability; Resource management; single chip; single
                  transient faults; soft error; soft error protection;
                  soft error tolerance; soft error tolerant
                  network-on-chip router pipeline; soft error tolerant
                  NoC router architecture; SPLASH-2 benchmarks; STNR
                  architecture; Switches; time redundant executions;
                  Transient analysis; transistor size",
  number-of-cited-references = "13",
  research-areas = "Computer Science",
  times-cited =  "6",
  unique-id =    "Poluri:2015:SET",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Flow-control theory giving a sufficient condition for deadlock-free
%%% minimal-path adaptive routing on k-ary n-cube meshes; CAL 14(2), 2015.
@Article{Xiao:2015:SCD,
  author =       "Canwen Xiao and Yue Yang and Jianwen Zhu",
  title =        "A Sufficient Condition for Deadlock-Free Adaptive
                  Routing in Mesh Networks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "111--114",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2363829",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Deadlock remains a central problem in interconnection
                  network. In this paper, we establish a new theory of
                  deadlock-free flow control for k-ary, n-cube mesh
                  network, which enables the use of any minimal-path
                  adaptive routing algorithms while avoiding deadlock. We
                  prove that the proposed flow control algorithm is a
                  sufficient condition for deadlock freedom in any
                  minimal path, adaptive routing algorithms on k-ary,
                  n-cube mesh network.",
  acknowledgement = ack-nhfb,
  affiliation =  "Xiao, CW (Reprint Author), Natl Univ Def Technol,
                  Changsha, Hunan, Peoples R China. Xiao, Canwen, Natl
                  Univ Def Technol, Changsha, Hunan, Peoples R China.
                  Yang, Yue; Zhu, Jianwen, Univ Toronto, Dept Elect \&
                  Comp Engn, Toronto, ON, Canada.",
  author-email = "cwxiao@nudt.edu.cn yyang@eecg.toronto.edu
                  jzhu@eecg.toronto.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "``863'' program of China [2012AA01A301,
                  2013AA014301]",
  funding-text = "This work is supported by ``863'' program of China
                  (2012AA01A301, 2013AA014301).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adaptive systems; Aerospace electronics; concurrency
                  control; deadlock avoidance; Deadlock-Free;
                  deadlock-free adaptive routing; deadlock-free flow
                  control; flow control; interconnection network; k-ary;
                  k-ary mesh network; mesh networks; Mesh networks;
                  minimal path routing algorithm; minimal-path adaptive
                  routing algorithms; Multiprocessor interconnection;
                  multiprocessor interconnection networks; n-cube mesh
                  network; Routing; sufficient condition; System
                  recovery; Wireless mesh networks",
  number-of-cited-references = "7",
  research-areas = "Computer Science",
  researcherid-numbers = "Yang, Yue/N-8370-2019",
  times-cited =  "1",
  unique-id =    "Xiao:2015:SCD",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% AYUSH: data-migration technique extending write-endurance lifetime
%%% of hybrid SRAM--NVM last-level caches; CAL 14(2), 2015.
@Article{Mittal:2015:ATE,
  author =       "Sparsh Mittal and Jeffrey S. Vetter",
  title =        "{AYUSH}: A Technique for Extending Lifetime of
                  {SRAM--NVM} Hybrid Caches",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "115--118",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2355193",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Recently, researchers have explored way-based hybrid
                  SRAM-NVM (non-volatile memory) last level caches (LLCs)
                  to bring the best of SRAM and NVM together. However,
                  the limited write endurance of NVMs restricts the
                  lifetime of these hybrid caches. We present AYUSH, a
                  technique to enhance the lifetime of hybrid caches,
                  which works by using data-migration to preferentially
                  use SRAM for storing frequently-reused data.
                  Microarchitectural simulations confirm that AYUSH
                  achieves larger improvement in lifetime than a previous
                  technique and also maintains performance and energy
                  efficiency. For single, dual and quad-core workloads,
                  the average increase in cache lifetime with AYUSH is
                  6.90, 24.06 and 47.62x, respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Mittal, S (Reprint Author), Oak Ridge Natl Lab, Div
                  Math \& Comp Sci, Oak Ridge, TN 37831 USA. Mittal,
                  Sparsh; Vetter, Jeffrey S., Oak Ridge Natl Lab, Div
                  Math \& Comp Sci, Oak Ridge, TN 37831 USA.",
  author-email = "mittals@ornl.gov vetter@ornl.gov",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "AYUSH; Benchmark testing; Cache memory; cache storage;
                  data-migration; device lifetime; energy efficiency;
                  Energy loss; hybrid cache; last level caches;
                  microarchitectural simulation; Non-volatile memory
                  (NVM); nonvolatile memory; Nonvolatile memory;
                  Radiation detectors; Random access memory; SRAM; SRAM
                  chips; SRAM-NVM cache; SRAM-NVM hybrid caches; write
                  endurance",
  keywords-plus = "ENERGY; MODEL",
  number-of-cited-references = "17",
  ORCID-numbers = "Vetter, Jeffrey/0000-0002-2449-6720 Mittal,
                  Sparsh/0000-0002-2908-993X",
  research-areas = "Computer Science",
  times-cited =  "11",
  unique-id =    "Mittal:2015:ATE",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% Analytical comparison of stochastic vs. deterministic computing
%%% (value-representation cost, multiplication); CAL 14(2), 2015.
@Article{Manohar:2015:CSD,
  author =       "Rajit Manohar",
  title =        "Comparing Stochastic and Deterministic Computing",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "119--122",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2412553",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Technology scaling has raised the specter of myriads
                  of cheap, but unreliable and/or stochastic devices that
                  must be creatively combined to create a reliable
                  computing system. This has renewed the interest in
                  computing that exploits stochasticity-embracing, not
                  combating the device physics. If a stochastic
                  representation is used to implement a programmable
                  general-purpose architecture akin to CPUs, GPUs, or
                  FPGAs, the preponderance of evidence indicates that
                  most of the system energy will be expended in
                  communication and storage as opposed to computation.
                  This paper presents an analytical treatment of the
                  benefits and drawbacks of adopting a stochastic
                  approach by examining the cost of representing a value.
                  We show both scaling laws and costs for low precision
                  representations. We also analyze the cost of
                  multiplication implemented using stochastic versus
                  deterministic approaches, since multiplication is the
                  prototypical inexpensive stochastic operation. We show
                  that the deterministic approach compares favorably to
                  the stochastic approach when holding precision and
                  reliability constant.",
  acknowledgement = ack-nhfb,
  affiliation =  "Manohar, R (Reprint Author), Cornell Univ, Cornell
                  Tech, New York, NY 10011 USA. Manohar, Rajit, Cornell
                  Univ, Cornell Tech, New York, NY 10011 USA.",
  author-email = "rajit@csl.cornell.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Complexity theory; Computer architecture;
                  deterministic computing; Encoding; field programmable
                  gate arrays; FPGAs; general-purpose architecture; GPUs;
                  graphics processing units; Logic gates; Receivers;
                  reliable computing system; stochastic computing;
                  Stochastic processes; stochastic processes; stochastic
                  representation",
  number-of-cited-references = "18",
  research-areas = "Computer Science",
  times-cited =  "5",
  unique-id =    "Manohar:2015:CSD",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% DRACO: deduplicating flash translation layer that exposes gained
%%% capacity as permanent storage; CAL 14(2), 2015.
@Article{Seo:2015:DDF,
  author =       "Bon-Keun Seo and Seungryoul Maeng and Joonwon Lee and
                  Euiseong Seo",
  title =        "{DRACO}: A Deduplicating {FTL} for Tangible Extra
                  Capacity",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "123--126",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2350984",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                  http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The rapid random access of SSDs enables efficient
                  searching of redundant data and their deduplication.
                  However, the space earned from deduplication cannot be
                  used as permanent storage because it must be reclaimed
                  when deduplication is cancelled as a result of an
                  update to the deduplicated data. To overcome this
                  limitation, we propose a novel FTL scheme that enables
                  the gained capacity to be used as permanent storage
                  space for the file system layer. The proposed approach
                  determines the safe amount of gained capacity that can
                  be provided to the upper layer based on the compression
                  rate prediction scheme. It then secures the required
                  space by compressing cold data when capacity overflow
                  occurs from cancelled deduplication. Our evaluation
                  with a kernel source repository showed that the file
                  system obtained approximately 79 percent additional
                  capacity by the proposed scheme.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seo, BK (Reprint Author), Korea Adv Inst Sci \&
                  Technol, Dept Comp Sci, Taejon 305701, South Korea.
                  Seo, Bon-Keun; Maeng, Seungryoul, Korea Adv Inst Sci \&
                  Technol, Dept Comp Sci, Taejon 305701, South Korea.
                  Lee, Joonwon; Seo, Euiseong, Sungkyunkwan Univ, Coll
                  Informat \& Commun Engn, Suwon 440746, South Korea.",
  author-email = "joonwon@skku.edu euiseong@skku.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Basic Science Research Program through the
                  National Research Foundation of Korea
                  [2012R1A1A2A10038823]",
  funding-text = "This research was supported by Basic Science Research
                  Program through the National Research Foundation of
                  Korea (2012R1A1A2A10038823).",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "capacity overflow; cold data compression; compression;
                  compression rate prediction scheme; data compression;
                  data deduplication; Data structures; deduplicating FTL;
                  deduplication; disc drives; DRACO; Entropy; file system
                  layer; file systems; File systems; file systems; flash
                  memories; flash memory; Flash memory; flash memory;
                  flash translation layer; FTL; kernel source repository;
                  Linux; over-provisioning; permanent storage space;
                  rapid random access; redundant data searching; SDRAM;
                  SSD; storage management; storage reclamation; tangible
                  extra capacity",
  number-of-cited-references = "6",
  research-areas = "Computer Science",
  researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
  times-cited =  "2",
  unique-id =    "Seo:2015:DDF",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

%%% In-DRAM bulk bitwise AND/OR via triple-row activation; CAL 14(2),
%%% 2015.  NOTE(review): third author's surname corrected from the
%%% truncated WoS form ``Boroum'' to ``Boroumand'' (see the IEEE Xplore
%%% record for DOI 10.1109/LCA.2015.2434872), and the abstract typo
%%% ``50.5.X'' corrected to ``50.5X'' (parallel to ``9.7X'').
@Article{Seshadri:2015:FBB,
  author =       "Vivek Seshadri and Kevin Hsieh and Amirali Boroumand
                  and Donghyuk Lee and Michael A. Kozuch and Onur Mutlu
                  and Phillip B. Gibbons and Todd C. Mowry",
  title =        "Fast Bulk Bitwise {AND} and {OR} in {DRAM}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "127--131",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2434872",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Bitwise operations are an important component of
                  modern day programming, and are used in a variety of
                  applications such as databases. In this work, we
                  propose a new and simple mechanism to implement bulk
                  bitwise AND and OR operations in DRAM, which is faster
                  and more efficient than existing mechanisms. Our
                  mechanism exploits existing DRAM operation to perform a
                  bitwise AND/OR of two DRAM rows completely within DRAM.
                  The key idea is to simultaneously connect three cells
                  to a bitline before the sense-amplification. By
                  controlling the value of one of the cells, the sense
                  amplifier forces the bitline to the bitwise AND or
                  bitwise OR of the values of the other two cells. Our
                  approach can improve the throughput of bulk bitwise
                  AND/OR operations by 9.7X and reduce their energy
                  consumption by 50.5X. Since our approach exploits
                  existing DRAM operation as much as possible, it
                  requires negligible changes to DRAM logic. We evaluate
                  our approach using a real-world implementation of a
                  bit-vector based index for databases. Our mechanism
                  improves the performance of commonly-used range queries
                  by 30 percent on average.",
  acknowledgement = ack-nhfb,
  affiliation =  "Seshadri, V (Reprint Author), Carnegie Mellon Univ,
                  Pittsburgh, PA 15213 USA. Seshadri, Vivek; Hsieh,
                  Kevin; Boroumand, Amirali; Lee, Donghyuk; Mutlu, Onur;
                  Mowry, Todd C., Carnegie Mellon Univ, Pittsburgh, PA
                  15213 USA. Kozuch, Michael A.; Gibbons, Phillip B.,
                  Intel Pittsburgh, Pittsburgh, PA USA.",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [0953246, 1212962, 1320531]; Intel
                  Science and Tech. Center; Samsung; Google; Facebook;
                  SRC",
  funding-text = "This work was supported by NSF (awards 0953246,
                  1212962, and 1320531), and Intel Science and Tech.
                  Center, Samsung, Google, Facebook, and SRC.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "bit-vector based index; bitwise AND/OR; bulk-bitwise
                  AND operation; bulk-bitwise OR operation; Capacitors;
                  cell value control; Computer architecture; database
                  indexing; Decoding; DRAM; DRAM chips; DRAM memory; DRAM
                  memory, bitwise AND/OR, performance; DRAM operation;
                  energy consumption reduction; logic gates; performance;
                  performance improvement; Program processors; Random
                  access memory; range queries; sense amplifier;
                  sense-amplification; Throughput; throughput
                  improvement",
  number-of-cited-references = "20",
  research-areas = "Computer Science",
  times-cited =  "21",
  unique-id =    "Seshadri:2015:FBB",
  web-of-science-categories = "Computer Science, Hardware \&
                  Architecture",
}

@Article{Altaf:2015:LPM,
  author =       "Muhammad Shoaib Bin Altaf and David A. Wood",
  title =        "{LogCA}: A Performance Model for Hardware
                 Accelerators",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "132--135",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2360182",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
                 http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "To address the Dark Silicon problem, architects have
                 increasingly turned to special-purpose hardware
                 accelerators to improve the performance and energy
                 efficiency of common computational kernels, such as
                 encryption and compression. Unfortunately, the latency
                 and overhead required to off-load a computation to an
                 accelerator sometimes outweighs the potential benefits,
                 resulting in a net decrease in performance or energy
                 efficiency. To help architects and programmers reason
                 about these trade-offs, we have developed the LogCA
                 model, a simple performance model for hardware
                 accelerators. LogCA provides a simplified abstraction
                 of a hardware accelerator characterized by five key
                 parameters. We have validated the model against a
                 variety of accelerators, ranging from on-chip
                 cryptographic accelerators in Sun's UltraSparc T2 and
                 Intel's Sandy Bridge to both discrete and integrated
                 GPUs.",
  acknowledgement = ack-nhfb,
  affiliation =  "Bin Altaf, MS (Reprint Author), Univ Wisconsin,
                 Madison, WI 53706 USA. Bin Altaf, Muhammad Shoaib;
                 Wood, David A., Univ Wisconsin, Madison, WI 53706
                 USA.",
  author-email = "shoaibbinalt@wisc.edu david@cs.wisc.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [CNS-1117280, CCF-1218323,
                 CNS-1302260]",
  funding-text = "We thank Mark Hill, Michael Swift, Rathijit Sen, and
                 the members of the Wisconsin Multifacet group for their
                 comments on the paper. This work is supported in part
                 with NSF grants CNS-1117280, CCF-1218323, and
                 CNS-1302260. The views expressed herein are not
                 necessarily those of the NSF. Professor Wood has
                 significant financial interests in AMD, Google and
                 Panasas.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accelerators; compression; computational kernel;
                 Computational modeling; cryptography; dark silicon
                 problem; encryption; energy conservation; energy
                 efficiency; GPU; graphics processing units; Hardware
                 accelerators; heterogeneous systems; Intel Sandy
                 Bridge; LogCA model; Modeling; modeling techniques;
                 modeling techniques; on-chip cryptographic
                 accelerator; Performance evaluation; performance model;
                 performance of systems; special-purpose hardware
                 accelerator; UltraSparc T2",
  number-of-cited-references = "12",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Altaf:2015:LPM",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Diamantopoulos:2015:MMI,
  author =       "Dionysios Diamantopoulos and Sotirios Xydis and Kostas
                 Siozios and Dimitrios Soudris",
  title =        "Mitigating Memory-Induced Dark Silicon in
                 Many-Accelerator Architectures",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "136--139",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2410791",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Many-Accelerator (MA) systems have been introduced as
                 a promising architectural paradigm that can boost
                 performance and improve power of general-purpose
                 computing platforms. In this paper, we focus on the
                 problem of resource under-utilization, i.e., Dark
                 Silicon, in FPGA-based MA platforms. We show that
                 except the typically expected peak power budget,
                 on-chip memory resources form a severe
                 under-utilization factor in MA platforms, leading up to
                 75 percent of dark silicon. Recognizing that static
                 memory allocation-the de-facto mechanism supported by
                 modern design techniques and synthesis tools-forms the
                 main source of memory-induced Dark Silicon, we
                 introduce a novel framework that extends conventional
                 high level synthesis (HLS) with dynamic memory
                 management (DMM) features, enabling accelerators to
                 dynamically adapt their allocated memory to the runtime
                 memory requirements, thus maximizing the overall
                 accelerator count through effective sharing of FPGA's
                 memories resources. We show that our technique delivers
                 significant gains in FPGA's accelerators density, i.e.
                 3.8x, and application throughput up to 3.1x and 21.4x
                 for shared and private memory accelerators.",
  acknowledgement = ack-nhfb,
  affiliation =  "Diamantopoulos, D (Reprint Author), Natl Tech Univ
                 Athens, Sch Elect \& Comp Engn, Athens, Greece.
                 Diamantopoulos, Dionysios; Xydis, Sotirios; Siozios,
                 Kostas; Soudris, Dimitrios, Natl Tech Univ Athens, Sch
                 Elect \& Comp Engn, Athens, Greece.",
  author-email = "diamantd@microlab.ntua.gr sxydis@microlab.ntua.gr
                 ksiop@microlab.ntua.gr dsoudris@microlab.ntua.gr",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "E.C. [644906]",
  funding-text = "This research is partially supported by the E.C.
                 funded program AEGLE under H2020 Grant Agreement No:
                 644906.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "de-facto mechanism; DMM feature; dynamic memory
                 management; dynamic memory management feature; Dynamic
                 scheduling; Field programmable gate arrays; field
                 programmable gate arrays; FPGA-based MA platform;
                 high-level synthesis; high-level synthesis tool; HLS
                 tool; MA system; Many-accelerator architectures;
                 many-accelerator architectures; Many-accelerator
                 architectures; Memory management; memory-induced dark
                 silicon source; modern design technique; Network
                 architecture; on-chip memory resource; peak power
                 budget; power aware computing; Resource management;
                 severe under-utilization factor; silicon; static memory
                 allocation; storage management; System-on-chip;
                 Throughput",
  number-of-cited-references = "14",
  ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847 Siozios,
                 Kostas/0000-0002-0285-2202",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/O-8843-2019 Siozios,
                 Kostas/F-9726-2011",
  times-cited =  "1",
  unique-id =    "Diamantopoulos:2015:MMI",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Poremba:2015:NUF,
  author =       "Matthew Poremba and Tao Zhang and Yuan Xie",
  title =        "{NVMain 2.0}: A User-Friendly Memory Simulator to Model
                 (Non-) Volatile Memory Systems",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "140--143",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2402435",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "In this letter, a flexible memory simulator --- NVMain
                 2.0, is introduced to help the community for modeling
                 not only commodity DRAMs but also emerging memory
                 technologies, such as die-stacked DRAM caches,
                 non-volatile memories (e.g., STT-RAM, PCRAM, and ReRAM)
                 including multi-level cells (MLC), and hybrid
                 non-volatile plus DRAM memory systems. Compared to
                 existing memory simulators, NVMain 2.0 features a
                 flexible user interface with compelling simulation
                 speed and the capability of providing sub-array-level
                 parallelism, fine-grained refresh, MLC and data encoder
                 modeling, and distributed energy profiling.",
  acknowledgement = ack-nhfb,
  affiliation =  "Poremba, M (Reprint Author), Penn State Univ, Dept
                 Comp Sci \& Engn, University Pk, PA 16802 USA. Poremba,
                 Matthew; Zhang, Tao; Xie, Yuan, Penn State Univ, Dept
                 Comp Sci \& Engn, University Pk, PA 16802 USA.",
  author-email = "poremba@cse.psu.edu zhangtao@cse.psu.edu
                 yuanxie@cse.psu.edu",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "NSF [1218867, 1213052, 1409798]; Department
                 of Energy [DE-SC0005026]",
  funding-text = "Poremba, Zhang, and Xie were supported in part by NSF
                 1218867, 1213052, 1409798. This material was based on
                 work supported by the Department of Energy under Award
                 Number DE-SC0005026. Matthew Poremba is the
                 corresponding author.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache storage; commodity DRAM; Computational modeling;
                 Computer architecture; die-stacked DRAM cache; DRAM
                 chips; DRAM memory systems; flexible memory simulator;
                 flexible user interface; Memory architecture; memory
                 architecture; Memory architecture, random access
                 memory, nonvolatile memory, phase change memory, SDRAM;
                 Memory management; memory technology; multilevel cells;
                 nonvolatile memory; Nonvolatile memory; nonvolatile
                 memory system; NVMain 2.0; PCRAM; phase change
                 memories; phase change memory; Phase change random
                 access memory; random access memory; ReRAM; SDRAM;
                 STT-RAM; user interfaces; user-friendly memory
                 simulator",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "36",
  unique-id =    "Poremba:2015:NUF",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Vandierendonck:2015:EEB,
  author =       "Hans Vandierendonck and Ahmad Hassan and Dimitrios S.
                 Nikolopoulos",
  title =        "On the Energy-Efficiency of Byte-Addressable
                 Non-Volatile Memory",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "144--147",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2355195",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Non-volatile memory (NVM) technology holds promise to
                 replace SRAM and DRAM at various levels of the memory
                 hierarchy. The interest in NVM is motivated by the
                 difficulty faced in scaling DRAM beyond 22 nm and,
                 long-term, lower cost per bit. While offering higher
                 density and negligible static power (leakage and
                 refresh), NVM suffers increased latency and energy per
                 memory access. This paper develops energy and
                 performance models of memory systems and applies them
                 to understand the energy-efficiency of replacing or
                 complementing DRAM with NVM. Our analysis focusses on
                 the application of NVM in main memory. We demonstrate
                 that NVM such as STT-RAM and RRAM is energy-efficient
                 for memory sizes commonly employed in servers and
                 high-end workstations, but PCM is not. Furthermore, the
                 model is well suited to quickly evaluate the impact of
                 changes to the model parameters, which may be achieved
                 through optimization of the memory architecture, and to
                 determine the key parameters that impact system-level
                 energy and performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Vandierendonck, H (Reprint Author), Queens Univ
                 Belfast, Belfast BT7 1NN, Antrim, North Ireland.
                 Vandierendonck, Hans; Nikolopoulos, Dimitrios S.,
                 Queens Univ Belfast, Belfast BT7 1NN, Antrim, North
                 Ireland. Hassan, Ahmad, SAP Belfast, Belfast, Antrim,
                 North Ireland.",
  author-email = "h.vandierendonck@qub.ac.uk ahmad.hassan@sap.com
                 d.nikolopoulos@qub.ac.uk",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "People Programme (Marie Curie Actions) of
                 the European Union's Seventh Framework Programme
                 [327744]",
  funding-text = "This work was supported by the People Programme (Marie
                 Curie Actions) of the European Union's Seventh
                 Framework Programme (FP7/2007-2013), grant agreement
                 no. 327744.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "byte-addressable nonvolatile memory technology;
                 Computational modeling; DRAM; DRAM chips; energy;
                 energy conservation; energy efficiency; Energy
                 efficiency; impact system-level energy; Main memory
                 systems; Main memory systems, non-volatile memory,
                 energy, modeling; Mathematical model; memory
                 architecture; memory hierarchy; Memory management;
                 memory systems; modeling; non-volatile memory;
                 Nonvolatile memory; NVM technology; PCM; Phase change
                 materials; Random access memory; RRAM; SRAM; SRAM
                 chips; static power; STT-RAM",
  number-of-cited-references = "15",
  oa =           "Green Published",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Vandierendonck:2015:EEB",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Yavits:2015:RAP,
  author =       "Leonid Yavits and Shahar Kvatinsky and Amir Morad and
                 Ran Ginosar",
  title =        "Resistive Associative Processor",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "148--151",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2374597",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Associative Processor (AP) combines data storage and
                 data processing, and functions simultaneously as a
                 massively parallel array SIMD processor and memory.
                 Traditionally, AP is based on CMOS technology, similar
                 to other classes of massively parallel SIMD processors.
                 The main component of AP is a Content Addressable
                 Memory (CAM) array. As CMOS feature scaling slows down,
                 CAM experiences scalability problems. In this work, we
                 propose and investigate an AP based on resistive
                 CAM-the Resistive AP (ReAP). We show that resistive
                 memory technology potentially allows scaling the AP
                 from a few millions to a few hundred millions of
                 processing units on a single silicon die. We compare
                 the performance and power consumption of a ReAP to a
                 CMOS AP and a conventional SIMD accelerator (GPU) and
                 show that ReAP, although exhibiting higher power
                 density, allows better scalability and higher
                 performance.",
  acknowledgement = ack-nhfb,
  affiliation =  "Yavits, L (Reprint Author), Technion Israel Inst
                 Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
                 Yavits, Leonid; Kvatinsky, Shahar; Morad, Amir;
                 Ginosar, Ran, Technion Israel Inst Technol, Dept Elect
                 Engn, IL-3200000 Haifa, Israel.",
  author-email = "yavits@txtechnion.ac.il skva@txtechnion.ac.il
                 amirm@txtechnion.ac.il ran@ee.technion.ac.il",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "Intel Collaborative Research Institute for
                 Computational Intelligence; Hasso-Plattner-Institut",
  funding-text = "The authors would like to thank Uri Weiser for
                 inspiring this research. This work was partially funded
                 by the Intel Collaborative Research Institute for
                 Computational Intelligence and by
                 Hasso-Plattner-Institut.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Associative processing; associative processor;
                 Associative Processor; associative processor; CAM
                 array; CMOS feature scaling; CMOS integrated circuits;
                 CMOS technology; complementary metal oxide
                 semiconductor; Computer aided manufacturing; content
                 addressable memory array; content-addressable storage;
                 data processing; data storage; GPU; graphics processing
                 unit; in-memory computing; In-Memory Computing;
                 in-memory computing; massively parallel array SIMD
                 processor; memory function; memristor; Memristor;
                 memristor; Memristors; parallel processing; Random
                 access memory; ReAP; resistive associative processor;
                 resistive RAM; Resistive RAM; resistive RAM; SIMD; SIMD
                 accelerator",
  number-of-cited-references = "17",
  research-areas = "Computer Science",
  times-cited =  "22",
  unique-id =    "Yavits:2015:RAP",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Kang:2015:SRT,
  author =       "Suk Chan Kang and Chrysostomos Nicopoulos and Ada
                 Gavrilovska and Jongman Kim",
  title =        "Subtleties of Run-Time Virtual Address Stacks",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "152--155",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2337299",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "The run-time virtual address (VA) stack has some
                 unique properties, which have garnered the attention of
                 researchers. The stack one-dimensionally grows and
                 shrinks at its top, and contains data that is seemingly
                 local/private to one thread, or process. Most prior
                 related research has focused on these properties.
                 However, this article aims to demonstrate how
                 conventional wisdom pertaining to the run-time VA stack
                 fails to capture some critical subtleties and
                 complexities. We first explore two widely established
                 assumptions surrounding the VA stack area: (1) Data
                 accesses can be classified as falling either under
                 VA-stack-area accesses, or non-stack-area accesses,
                 with no aliasing; (2) The VA stack data is completely
                 private and invisible to other threads/processes.
                 Subsequently, we summarize a representative selection
                 of related work that pursued the micro-architectural
                 concept of using run-time VA stacks to extend the
                 general-purpose register file. We then demonstrate why
                 these assumptions are invalid, by using examples from
                 prior work to highlight the potential hazards regarding
                 data consistency, shared memory consistency, and cache
                 coherence. Finally, we suggest safeguards against these
                 hazards. Overall, we explore the function-critical
                 issues that future operating systems and compilers
                 should address to effectively reap all the benefits of
                 using run-time VA stacks.",
  acknowledgement = ack-nhfb,
  affiliation =  "Kang, SC (Reprint Author), Georgia Inst Technol,
                 Atlanta, GA 30332 USA. Kang, Suk Chan; Gavrilovska,
                 Ada; Kim, Jongman, Georgia Inst Technol, Atlanta, GA
                 30332 USA. Nicopoulos, Chrysostomos, Univ Cyprus,
                 CY-1678 Nicosia, Cyprus.",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "cache coherence; cache storage; data consistency; data
                 decoupling; data integrity; data privacy;
                 function-critical issue; general-purpose register file;
                 Instruction sets; memory consistency;
                 microarchitectural concept; nonstack-area access;
                 register file; Run time; Run-time stack; run-time VA
                 stack data access; run-time virtual address stack;
                 shared memory; shared memory consistency; shared memory
                 systems; synonym page; VA-stack-area accesses;
                 Virtualization",
  number-of-cited-references = "12",
  ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Kang:2015:SRT",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Rodopoulos:2015:TPV,
  author =       "Dimitrios Rodopoulos and Francky Catthoor and
                 Dimitrios Soudris",
  title =        "Tackling Performance Variability Due to {RAS}
                 Mechanisms with {PID}-Controlled {DVFS}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "156--159",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2014.2385713",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "As technology nodes approach deca-nanometer
                 dimensions, many phenomena threaten the binary
                 correctness of processor operation. Computer architects
                 typically enhance their designs with reliability,
                 availability and serviceability (RAS) schemes to
                 correct such errors, in many cases at the cost of extra
                 clock cycles, which, in turn, leads to processor
                 performance variability. The goal of the current paper
                 is to absorb this variability using Dynamic Voltage and
                 Frequency Scaling (DVFS). A closed-loop implementation
                 is proposed, which configures the clock frequency based
                 on observed metrics that encapsulate performance
                 variability due to RAS mechanisms. That way,
                 performance dependability and predictability is
                 achieved. We simulate the transient and steady state
                 behavior of our approach, reporting responsiveness
                 within less than 1 ms. We also assess our idea using
                 the power model of real processor and report a maximum
                 energy overhead of roughly 10 percent for dependable
                 performance in the presence of RAS temporal
                 overheads.",
  acknowledgement = ack-nhfb,
  affiliation =  "Rodopoulos, D (Reprint Author), Natl Tech Univ Athens,
                 MicroLab, Sch Elect \& Comp Engn, Athens 15780, Greece.
                 Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
                 Univ Athens, MicroLab, Sch Elect \& Comp Engn, Athens
                 15780, Greece. Catthoor, Francky, ESAT KU Leuven,
                 Leuven, Belgium. Catthoor, Francky, SSET IMEC, Leuven,
                 Belgium.",
  author-email = "drodo@microlab.ntua.gr catthoor@imec.be
                 dsoudris@microlab.ntua.gr",
  da =           "2019-06-20",
  doc-delivery-number = "CZ7DC",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  funding-acknowledgement = "HARPA EC project [FP7-612069]",
  funding-text = "The authors thank Prof. Y. Sazeides and Prof. C.
                 Nicopoulos of UCY, Cyprus for the insightful
                 discussions. They also acknowledge the constructive
                 feedback of the reviewers. This work was partially
                 supported by the FP7-612069-HARPA EC project. Dimitrios
                 Rodopoulos is the corresponding author. Finally, the
                 authors acknowledge conversations with Dr. Antonis
                 Papanikolaou.",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "availability; availability and serviceability;
                 Availability and Serviceability; availability and
                 serviceability; binary correctness; closed loop
                 systems; closed-loop implementation; computer
                 architects; computer architecture; deca-nanometer
                 dimensions; Dynamic voltage and frequency scaling;
                 dynamic voltage and frequency scaling; Dynamic voltage
                 and frequency scaling; Dynamic Voltage and Frequency
                 Scaling; Mathematical model; microcomputers;
                 Performance evaluation; performance variability;
                 performance vulnerability factor; Performance
                 Vulnerability Factor; PID-controlled DVFS; Process
                 control; processor operation; RAS mechanisms;
                 reliability; Reliability; reliability; Reliability;
                 serviceability; three-term control; Voltage control",
  number-of-cited-references = "21",
  ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
  research-areas = "Computer Science",
  researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
  times-cited =  "4",
  unique-id =    "Rodopoulos:2015:TPV",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@article{Markovic:2015:TLS,
  author =       {Nikola Markovic and Daniel Nemirovsky and Osman Unsal
                 and Mateo Valero and Adrian Cristal},
  title =        {Thread Lock Section-Aware Scheduling on Asymmetric
                 Single-{ISA} Multi-Core},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {14},
  number =       {2},
  pages =        {160--163},
  month =        jul # {\slash } # dec,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2014.2357805},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
                 http://www.math.utah.edu/pub/tex/bib/multithreading.bib},
  abstract =     {As thread level parallelism in applications has
                 continued to expand, so has research in chip multi-core
                 processors. As more and more applications become
                 multi-threaded we expect to find a growing number of
                 threads executing on a machine. As a consequence, the
                 operating system will require increasingly larger
                 amounts of CPU time to schedule these threads
                 efficiently. Instead of perpetuating the trend of
                 performing more complex thread scheduling in the
                 operating system, we propose a scheduling mechanism
                 that can be efficiently implemented in hardware as
                 well. Our approach of identifying multi-threaded
                 application bottlenecks such as thread synchronization
                 sections complements the Fairness-aware Scheduler
                 method. It achieves an average speed up of 11.5 percent
                 (geometric mean) compared to the state-of-the-art
                 Fairness-aware Scheduler.},
  acknowledgement = ack-nhfb,
  affiliation =  {Markovic, N (Reprint Author), Barcelona Supercomputing
                 Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky,
                 Daniel; Unsal, Osman; Valero, Mateo, Barcelona
                 Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola;
                 Nemirovsky, Daniel; Valero, Mateo, Univ Politecn
                 Cataluna, Barcelona, Spain. Cristal, Adrian, Univ
                 Politecn Cataluna, Barcelona Supercomputing Ctr,
                 E-08028 Barcelona, Spain. Cristal, Adrian, Artificial
                 Intelligence Res Inst Spanish Natl Res, Barcelona,
                 Spain.},
  author-email = {nikola.markovic@bsc.es daniel.nemirovsky@bsc.es
                 osman.unsal@bsc.es mateo.valero@bsc.es
                 adrian.cristal@bsc.es},
  da =           {2019-06-20},
  doc-delivery-number = {CZ7DC},
  eissn =        {1556-6064},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {Asymmetric chip multiprocessor (ACMP); asymmetric
                 single-ISA multicore processor; chip multicore
                 processors; Context modeling; fairness-aware scheduler
                 method; HW/SW thread scheduling; Instruction sets;
                 microprocessor chips; multi-threaded applications;
                 multi-threading; Multicore processing; multiprocessing
                 systems; multithreaded application; operating system;
                 Operating systems; operating systems (computers);
                 scheduling; Scheduling; Synchronization; thread lock
                 section-aware scheduling mechanism; thread
                 synchronization},
  number-of-cited-references = {17},
  ORCID-numbers = {UNSAL, OSMAN/0000-0002-0544-9697 Valero,
                 Mateo/0000-0003-2917-2482},
  research-areas = {Computer Science},
  researcherid-numbers = {UNSAL, OSMAN/B-9161-2016 Valero,
                 Mateo/L-5709-2014},
  times-cited =  {7},
  unique-id =    {Markovic:2015:TLS},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@article{Pekhimenko:2015:TAC,
  author =       {Gennady Pekhimenko and Evgeny Bolotin and Mike
                 O'Connor and Onur Mutlu and Todd C. Mowry and Stephen
                 W. Keckler},
  title =        {Toggle-Aware Compression for {GPUs}},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {14},
  number =       {2},
  pages =        {164--168},
  month =        jul # {\slash } # dec,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2015.2430853},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/datacompression.bib;
                 http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {Memory bandwidth compression can be an effective way
                 to achieve higher system performance and energy
                 efficiency in modern data-intensive applications by
                 exploiting redundancy in data. Prior works studied
                 various data compression techniques to improve both
                 capacity (e.g., of caches and main memory) and
                 bandwidth utilization (e.g., of the on-chip and
                 off-chip interconnects). These works addressed two
                 common shortcomings of compression: (i)
                 compression/decompression overhead in terms of latency,
                 energy, and area, and (ii) hardware complexity to
                 support variable data size. In this paper, we make the
                 new observation that there is another important problem
                 related to data compression in the context of the
                 communication energy efficiency: transferring
                 compressed data leads to a substantial increase in the
                 number of bit toggles (communication channel switchings
                 from 0 to 1 or from 1 to 0). This, in turn, increases
                 the dynamic energy consumed by on-chip and off-chip
                 buses due to more frequent charging and discharging of
                 the wires. Our results, for example, show that the bit
                 toggle count increases by an average of 2.2x with some
                 compression algorithms across 54 mobile GPU
                 applications. We characterize and demonstrate this new
                 problem across a wide variety of 221 GPU applications
                 and six different compression algorithms. To mitigate
                 the problem, we propose two new toggle-aware
                 compression techniques: energy control and Metadata
                 Consolidation. These techniques greatly reduce the bit
                 toggle count impact of the six data compression
                 algorithms we examine, while keeping most of their
                 bandwidth reduction benefits.},
  acknowledgement = ack-nhfb,
  affiliation =  {Pekhimenko, G (Reprint Author), Carnegie Mellon Univ,
                 Dept Comp Sci, Pittsburgh, PA 15206 USA. Pekhimenko,
                 Gennady; Mutlu, Onur; Mowry, Todd C., Carnegie Mellon
                 Univ, Dept Comp Sci, Pittsburgh, PA 15206 USA. Bolotin,
                 Evgeny; O'Connor, Mike; Keckler, Stephen W., NVIDA,
                 Santa Clara, CA USA. O'Connor, Mike; Keckler, Stephen
                 W., Univ Texas Austin, Austin, TX 78712 USA.},
  author-email = {gpekhimento@gmail.com ebolotin@nvidia.com
                 moconnor@nvidia.com omutlu@gmail.com tcm@cs.cmu.edu
                 skeckler@nvidia.com},
  da =           {2019-06-20},
  doc-delivery-number = {CZ7DC},
  eissn =        {1556-6064},
  fjournal =     {IEEE Computer Architecture Letters},
  funding-acknowledgement = {Intel Science and Technology Center for
                 Cloud Computing; US National Science Foundation
                 [1212962, 1409723, 1423172]; US Department of Energy},
  funding-text = {The authors acknowledge the support of Intel Science
                 and Technology Center for Cloud Computing; US National
                 Science Foundation grants 1212962, 1409723, and
                 1423172; and the US Department of Energy.},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {bandwidth utilization; bit toggle count impact; bit
                 toggles; Communication channels; communication energy
                 efficiency; Compression algorithms;
                 compression/decompression overhead; Data compression;
                 data compression; data compression algorithms; data
                 compression techniques; Data compression,
                 interconnected systems, memory; data redundancy;
                 dynamic energy; energy control; graphics processing
                 units; Graphics processing units; hardware complexity;
                 interconnected systems; memory; memory bandwidth
                 compression; metadata consolidation; Mobile
                 communication; mobile GPU applications; modern
                 data-intensive applications; off-chip buses; on-chip
                 buses; power aware computing; System-on-chip;
                 toggle-aware compression; variable data size},
  number-of-cited-references = {29},
  research-areas = {Computer Science},
  times-cited =  {2},
  unique-id =    {Pekhimenko:2015:TAC},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@article{Anonymous:2015:TCb,
  author =       {Anonymous},
  title =        {Table of Contents},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {14},
  number =       {2},
  pages =        {C1--C1},
  month =        jul # {\slash } # dec,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2015.2510172},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

@article{Anonymous:2015:ICAc,
  author =       {Anonymous},
  title =        {{{\booktitle{IEEE Computer Architecture Letters}}
                 Editorial Board}},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {14},
  number =       {2},
  pages =        {C2--C2},
  month =        jul # {\slash } # dec,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2015.2510173},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

@Article{Anonymous:2015:ICAd,
  author =       "Anonymous",
  title =        "{{\booktitle{IEEE Computer Architecture Letters}}
                 Information for Authors}",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "14",
  number =       "2",
  pages =        "C3--C3",
  month =        jul # "\slash " # dec,
  year =         "2015",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2510174",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}

@article{Anonymous:2015:ICSb,
  author =       {Anonymous},
  title =        {{IEEE Computer Society}},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {14},
  number =       {2},
  pages =        {C4--C4},
  month =        jul # {\slash } # dec,
  year =         {2015},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2015.2510176},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  acknowledgement = ack-nhfb,
  fjournal =     {IEEE Computer Architecture Letters},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
}

@article{Wu:2016:MCN,
  author =       {Wo-Tak Wu and Ahmed Louri},
  title =        {A Methodology for Cognitive {NoC} Design},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {15},
  number =       {1},
  pages =        {1--4},
  month =        jan # {\slash } # jun,
  year =         {2016},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2015.2447535},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {The number of cores in a multicore chip design has
                 been increasing in the past two decades. The rate of
                 increase will continue for the foreseeable future. With
                 a large number of cores, the on-chip communication has
                 become a very important design consideration. The
                 increasing number of cores will push the communication
                 complexity level to a point where managing such highly
                 complex systems requires much more than what designers
                 can anticipate for. We propose a new design methodology
                 for implementing a cognitive network-on-chip that has
                 the ability to recognize changes in the environment and
                 to learn new ways to adapt to the changes. This
                 learning capability provides a way for the network to
                 manage itself. Individual network nodes work
                 autonomously to achieve global system goals, e.g., low
                 network latency, higher reliability, power efficiency,
                 adaptability, etc. We use fault-tolerant routing as a
                 case study. Simulation results show that the cognitive
                 design has the potential to outperform the conventional
                 design for large applications. With the great inherent
                 flexibility to adopt different algorithms, the
                 cognitive design can be applied to many applications.},
  acknowledgement = ack-nhfb,
  affiliation =  {Wu, WT (Reprint Author), Univ Arizona, Dept Elect \&
                 Comp Engn, Tucson, AZ 85721 USA. Wu, Wo-Tak; Louri,
                 Ahmed, Univ Arizona, Dept Elect \& Comp Engn, Tucson,
                 AZ 85721 USA.},
  author-email = {wotakwu@email.arizona.edu louri@ece.arizona.edu},
  da =           {2019-06-20},
  doc-delivery-number = {DY1XQ},
  eissn =        {1556-6064},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {adaptive; Algorithm design and analysis; cognitive
                 network-on-chip; cognitive NoC design; cognitive
                 process; communication complexity; communication
                 complexity level; Fault tolerance; fault tolerant
                 computing; Fault tolerant systems; fault-tolerant;
                 fault-tolerant routing; individual network nodes;
                 integrated circuit design; intelligent agent; learning
                 (artificial intelligence); learning capability; machine
                 learning; multicore; multicore chip design; Multicore
                 processing; multiprocessing systems; network routing;
                 network-on-chip; NoC; on-chip communication; Routing;
                 Software},
  number-of-cited-references = {8},
  research-areas = {Computer Science},
  times-cited =  {1},
  unique-id =    {Wu:2016:MCN},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@Article{Anonymous:2016:IICa,
  author =       "Anonymous",
  title =        "2015 {Index} {{\booktitle{IEEE Computer Architecture
                 Letters}}} {Vol.} 14",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "1--6",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2513858",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
}

@Article{Anonymous:2016:IICb,
  author =       "Anonymous",
  title =        "2015 {Index} {{\booktitle{IEEE Computer Architecture
                 Letters}}} {Vol.} 14",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "1--6",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2513858",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Fri Jun 21 08:36:31 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Presents the 2015 author/subject index for this
                 publication.",
  acknowledgement = ack-nhfb,
  fjournal =     "IEEE Computer Architecture Letters",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Indexes",
  remark =       "This entry duplicates Anonymous:2016:IICa (identical
                 DOI, title, volume, number, and pages); the two differ
                 only in bibdate and the presence of this abstract.
                 Candidate for merging; both keys are retained so that
                 existing citations of either key continue to resolve.",
}

@article{Rezaei:2016:DRS,
  author =       {Seyyed Hossein Seyyedaghaei Rezaei and Abbas Mazloumi
                 and Mehdi Modarressi and Pejman Lotfi-Kamran},
  title =        {Dynamic Resource Sharing for High-Performance {$3$-D}
                 Networks-on-Chip},
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       {15},
  number =       {1},
  pages =        {5--8},
  month =        jan # {\slash } # jun,
  year =         {2016},
  CODEN =        {????},
  DOI =          {https://doi.org/10.1109/LCA.2015.2448532},
  ISSN =         {1556-6056 (print), 1556-6064 (electronic)},
  ISSN-L =       {1556-6056},
  bibdate =      {Tue Jun 25 07:41:05 2019},
  bibsource =    {http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib},
  abstract =     {3D logic-on-logic technology is a promising approach
                 for extending the validity of Moore's law when
                 technology scaling stops. 3D technology can also lead
                 to a paradigm shift in on-chip communication design by
                 providing orders of magnitude higher bandwidth and
                 lower latency for inter-layer communication. To turn
                 the 3D technology bandwidth and latency benefits into
                 network latency reductions and performance improvement,
                 we need networks-on-chip (NoCs) that are specially
                 designed to take advantage of what 3D technology has to
                 offer. While in parallel workloads many packets
                 experience blocking in the network due to losing
                 arbitration for crossbars' input/output ports, we
                 observe that in a considerable fraction of these cases
                 in a 3D NoC, the corresponding input and output ports
                 of the crossbar in the above or below router are idle.
                 Given this observation, we propose FRESH, a router
                 microarchitecture with Fine-grained 3D REsource SHaring
                 capability that leverages the ultra-low latency
                 vertical links of a 3D chip to share crossbars and
                 links at a fine granularity between vertically stacked
                 routers. It enables packets that lose arbitration for
                 crossbars' input/output ports to use idle resources of
                 the above or below routers, and effectively eliminates
                 the unnecessary packet blocking time. We will show that
                 our proposal lowers network latency by up to 21 percent
                 over the state-of-the-art 3D NoC.},
  acknowledgement = ack-nhfb,
  affiliation =  {Rezaei, SHS (Reprint Author), Univ Tehran, Coll Engn,
                 Dept Elect \& Comp Engn, Tehran, Iran. Rezaei, Seyyed
                 Hossein Seyyedaghaei; Mazloumi, Abbas; Modarressi,
                 Mehdi, Univ Tehran, Coll Engn, Dept Elect \& Comp Engn,
                 Tehran, Iran. Lotfi-Kamran, Pejman, Inst Res
                 Fundamental Sci IPM, Sch Comp Sci, Tehran, Iran.},
  author-email = {s.hseyyedaghaei@ut.ac.ir y.mazloomi@gmail.com
                 modarressi@ut.ac.ir plotfi@ipm.ir},
  da =           {2019-06-20},
  doc-delivery-number = {DY1XQ},
  eissn =        {1556-6064},
  fjournal =     {IEEE Computer Architecture Letters},
  journal-iso =  {IEEE Comput. Archit. Lett.},
  journal-URL =  {http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208},
  keywords =     {3-D integration; 3D integration; 3D networks-on-chip;
                 3D NoC; Bandwidth; crossbars input-output ports;
                 fine-grained 3D resource sharing capability; FRESH;
                 network latency; network routing; network-on-chip;
                 Ports (Computers); Resource management; Resource
                 sharing; router microarchitecture; Routing; Switches;
                 Three-dimensional displays; Through-silicon vias},
  keywords-plus = {3D; ROUTER},
  number-of-cited-references = {12},
  research-areas = {Computer Science},
  times-cited =  {4},
  unique-id =    {Rezaei:2016:DRS},
  web-of-science-categories = {Computer Science, Hardware \&
                 Architecture},
}

@Article{Gorgues:2016:EPC,
  author =       "Miguel Gorgues and Jose Flich",
  title =        "End-Point Congestion Filter for Adaptive Routing with
                 Congestion-Insensitive Performance",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "9--12",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2429130",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Interconnection networks are a critical component in
                 most modern systems nowadays. Both off-chip networks,
                 in HPC systems, data centers, and cloud servers, and
                 on-chip networks, in chip multiprocessors (CMPs) and
                 multiprocessors system-on-chip (MPSoCs), play an
                 increasing role as their performance is vital for the
                 performance of the whole system. One of the key
                 components of any interconnect is the routing
                 algorithm, which steers packets through the network.
                 Adaptive routing algorithms have demonstrated their
                 superior performance by maximizing network resources
                 utilization. However, as systems increase in size (both
                 in off-chip and on-chip), new problems emerge. One of
                 them is congestion where traffic jams inside the
                 network lead to low throughput and high packet latency,
                 significantly impacting overall system performance. We
                 propose a mechanism to eradicate this phenomena and to
                 allow adaptive routing algorithms to achieve the
                 expected performance even in the presence of congestion
                 situations. End-Point Congestion Filter, EPC, detects
                 congestion formed at the end-points of the network, and
                 prevents the congestion from spreading through the
                 network. Basically, EPC disables adaptivity in
                 congested packets. Preliminary results for mid and high
                 congestion situations show EPC is able to totally
                 decouple congestion from routing.",
  acknowledgement = ack-nhfb,
  affiliation =  "Gorgues, M (Reprint Author), Univ Politecn Valencia,
                 Dept Comp Architecture, E-46022 Valencia, Spain.
                 Gorgues, Miguel; Flich, Jose, Univ Politecn Valencia,
                 Dept Comp Architecture, E-46022 Valencia, Spain.",
  author-email = "migoral@disca.upv.es jflich@disca.upv.es",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Adaptive filters; Adaptive routing algorithms;
                 adaptive routing algorithms; Adaptive systems; chip
                 multiprocessors; cloud servers; CMP; Congestion;
                 congestion; congestion-insensitive performance; data
                 centers; digital filters; end-point congestion filter;
                 EPC; HPC systems; Information filters; interconnection
                 networks; interconnects; MPSoC; multiprocessor
                 interconnection networks; multiprocessors
                 system-on-chip; network resources utilization; network
                 routing; on-chip networks; packet latency; performance
                 evaluation; Ports (Computers); Routing; system-on-chip;
                 Throughput; traffic jams",
  keywords-plus = "NETWORKS",
  number-of-cited-references = "10",
  research-areas = "Computer Science",
  times-cited =  "0",
  unique-id =    "Gorgues:2016:EPC",
  web-of-science-categories = "Computer Science, Hardware \&
                 Architecture",
}

@Article{Panda:2016:EPP,
  author =       "Biswabandan Panda and Shankar Balachandran",
  title =        "Expert Prefetch Prediction: An Expert Predicting the
                 Usefulness of Hardware Prefetchers",
  journal =      j-IEEE-COMPUT-ARCHIT-LETT,
  volume =       "15",
  number =       "1",
  pages =        "13--16",
  month =        jan # "\slash " # jun,
  year =         "2016",
  CODEN =        "????",
  DOI =          "https://doi.org/10.1109/LCA.2015.2428703",
  ISSN =         "1556-6056 (print), 1556-6064 (electronic)",
  ISSN-L =       "1556-6056",
  bibdate =      "Tue Jun 25 07:41:05 2019",
  bibsource =    "http://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
  abstract =     "Hardware prefetching improves system performance by
                 hiding and tolerating the latencies of lower levels of
                 cache and off-chip DRAM. An accurate prefetcher
                 improves system performance whereas an inaccurate
                 prefetcher can cause cache pollution and consume
                 additional bandwidth. Prefetch address filtering
                 techniques improve prefetch accuracy by predicting the
                 usefulness of a prefetch address and based on the
                 outcome of the prediction, the prefetcher decides
                 whether or not to issue a prefetch request. Existing
                 techniques use only one signature to predict the
                 usefulness of a prefetcher but no single predictor
                 works well across all the applications. In this work,
                 we propose weighted-majority filter, an expert way of
                 predicting the usefulness of prefetch addresses. The
                 proposed filter is adaptive in nature and uses the
                 prediction of the best predictor(s) from a pool of
                 predictors. Our filter is orthogonal to the underlying
                 prefetching algorithm. We evaluate the effectiveness of
                 our technique on 22 SPEC-2000/2006 applications. On an
                 average, when employed with three state-of-the-art
                 prefetchers such as AMPM, SMS, and GHB-PC/DC, our
                 filter provides performance improvement of 8.1, 9.3,
                 and 11 percent respectively.",
  acknowledgement = ack-nhfb,
  affiliation =  "Panda, B (Reprint Author), Indian Inst Technol, Dept
                 Comp Sci \& Engn, Madras, Tamil Nadu, India. Panda,
                 Biswabandan; Balachandran, Shankar, Indian Inst
                 Technol, Dept Comp Sci \& Engn, Madras, Tamil Nadu,
                 India.",
  author-email = "biswa.uce@gmail.com",
  da =           "2019-06-20",
  doc-delivery-number = "DY1XQ",
  eissn =        "1556-6064",
  fjournal =     "IEEE Computer Architecture Letters",
  journal-iso =  "IEEE Comput. Archit. Lett.",
  journal-URL =  "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
  keywords =     "Accuracy; AMPM; cache; Cache; cache; Cache; cache;
                 cache storage; fi