@Preamble{ "\ifx \undefined \booktitle \def \booktitle#1{{{\em #1}}} \fi" }
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|https://www.math.utah.edu/~beebe/|"}
@String{j-IEEE-COMPUT-ARCHIT-LETT = "IEEE Computer Architecture Letters"}
@Article{Alvarez:2002:IRF,
author = "C. Alvarez and J. Corbal and E. Salami and M. Valero",
title = "Initial Results on Fuzzy Floating Point Computation
for Multimedia Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "1--1",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "During the recent years the market of mid low end
portable systems such as PDAs or mobile digital phones
have experimented a revolution in both selling volume
and features as handheld devices incorporate Multimedia
applications. This fact brings to an increase in the
computational demands of the devices while still having
the limitation of power and energy consumption.
Instruction memoization is a promising technique to
help alleviate the problem of power consumption of
expensive functional units such as the floating point
one. Unfortunately this technique could be energy
inefficient for low end systems due to the additional
power consumption of the relatively big tables
required. In this paper we present a novel way of
understanding multimedia floating point operations
based on the fuzzy computation paradigm losses in the
computation precision may exchange performance for
negligible errors in the output. Exploiting the
implicit characteristics of media FP computation we
propose a new technique called fuzzy memoization. Fuzzy
memoization expands the capabilities of classic
memoization by attaching entries with similar inputs to
the same output. We present a case of study for a SH
like processor and report good performance and power
delay improvements with feasible hardware
requirements",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Delay; Energy consumption; Fuzzy systems; Handheld
computers; Joining processes; Mobile computing;
Multimedia systems; Performance loss; Personal digital
assistants; Portable computers",
}
@Article{Gordon-Ross:2002:EFP,
author = "A. Gordon-Ross and S. Cotterell and F. Vahid",
title = "Exploiting Fixed Programs in Embedded Systems: a Loop
Cache Example",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "2--2",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Embedded systems commonly execute one program for
their lifetime. Designing embedded system architectures
with configurable components, such that those
components can be tuned to that one program based on a
program pre-analysis, can yield significant power and
performance benefits. We illustrate such benefits by
designing a loop cache specifically with tuning in
mind. Our results show a 70\% reduction in instruction
memory accesses for MIPS and 8051 processors,
representing twice the reduction of a regular loop
cache and translating to good power savings.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecture tuning; Computer architecture; Computer
science; Costs; Digital cameras; Embedded computing;
Embedded system; embedded systems.; fixed program; Loop
cache; low power; Microcomputers; Microprocessor chips;
Portable computers; Power engineering computing",
}
@Article{Choi:2002:LPT,
author = "Jin-Hyuck Choi and Jung-Hoon Lee and Seh-Woong Jeong
and Shin-Dug Kim and C. Weems",
title = "A Low Power {TLB} Structure for Embedded Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "3--3",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present a new two-level TLB (translation look-aside
buffer) architecture that integrates a 2-way banked
filter TLB with a 2-way banked main TLB. The objective
is to reduce power consumption in embedded processors
by distributing the accesses to TLB entries across the
banks in a balanced manner. First, an advanced
filtering technique is devised to reduce access power
by adopting a sub-bank structure. Second, a
bank-associative structure is applied to each level of
the TLB hierarchy. Simulation results show that the
Energy*Delay product can be reduced by about 40.9\%
compared to a fully associative TLB, 24.9\% compared to
a micro-TLB with 4+32 entries, and 12.18\% compared to
a micro-TLB with 16+32 entries.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bank associative structure; CADCAM; Circuits; Computer
aided manufacturing; Degradation; Embedded system;
Energy consumption; Filter bank; filter mechanism;
Filtering; low power design; Power filters; translation
look-aside buffer; Virtual private networks",
}
@Article{Towles:2002:WCT,
author = "B. Towles and W. J. Dally",
title = "Worst-case Traffic for Oblivious Routing Functions",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "4--4",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper presents an algorithm to find a worst-case
traffic pattern for any oblivious routing algorithm on
an arbitrary interconnection network topology. The
linearity of channel loading offered by oblivious
routing algorithms enables the problem to be mapped to
a bipartite maximum-weight matching, which can be
solved in polynomial time for routing functions with a
polynomial number of paths. Finding exact worst case
performance was previously intractable, and we
demonstrate an example case where traditional
characterization techniques overestimate the throughput
of a particular routing algorithm by 47\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bipartite graph; Linearity; Multiprocessor
interconnection networks; Network topology; oblivious
routing; Pattern matching; Polynomials; Routing;
Telecommunication traffic; Throughput; worst-case
throughput",
}
@Article{Unsal:2002:CFC,
author = "O. S. Unsal and C. M. Krishna and C. A. Mositz",
title = "{Cool-Fetch}: Compiler-Enabled Power-Aware Fetch
Throttling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "5--5",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we present an architecture compiler
based approach to reduce energy consumption in the
processor. While we mainly target the fetch unit, an
important side-effect of our approach is that we obtain
energy savings in many other parts in the processor.
The explanation is that the fetch unit often runs
substantially ahead of execution, bringing in
instructions to different stages in the processor that
may never be executed. We have found, that although the
degree of Instruction Level Parallelism (ILP)of a
program tends to vary over time, it can be statically
predicted by the compiler with considerable accuracy.
Our Instructions Per Clock (IPC) prediction scheme is
using a dependence-testing-based analysis and simple
heuristics, to guide a front-end fetch-throttling
mechanism. We develop the necessary architecture
support and include its power overhead. We perform
experiments over a wide number of architectural
configurations, using SPEC2000 applications. Our
results are very encouraging: we obtain up to 15\%total
energy savings in the processor with generally little
performance degradation. In fact, in some cases our
intelligent throttling scheme even increases
performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; compiler architecture interaction;
Degradation; Energy consumption; fetch-throttling;
instruction level parallelism; Low power design;
Program processors",
}
@Article{Shang:2002:PEI,
author = "Li Shang and L. Peh and N. K. Jha",
title = "Power-efficient Interconnection Networks: Dynamic
Voltage Scaling with Links",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "6--6",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power consumption is a key issue in high performance
interconnection network design. Communication links,
already a significant consumer of power now, will take
up an ever larger portion of the power budget as demand
for network bandwidth increases. In this paper, we
motivate the use of dynamic voltage scaling (DVS) for
links, where the frequency and voltage of links are
dynamically adjusted to minimize power consumption. We
propose a history-based DVS algorithm that judiciously
adjusts DVS policies based on past link utilization.
Despite very conservative assumptions about DVS link
characteristics, our approach realizes up to 4.5X power
savings (3.2X average), with just an average 27.4\%
latency increase and 2.5\% throughput reduction. To the
best of our knowledge, this is the first study that
targets dynamic power optimization of interconnection
networks.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Dynamic voltage scaling; Frequency
synthesizers; interconnection network; Multiprocessor
interconnection networks; power optimization.;
Regulators",
}
@Article{KleinOsowski:2002:MNS,
author = "A. J. KleinOsowski and D. J. Lilja",
title = "{MinneSPEC}: a New {SPEC} Benchmark Workload for
Simulation-Based Computer Architecture Research",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "7--7",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Computer architects must determine how to most
effectively use finite computational resources when
running simulations to evaluate new architectural
ideas. To facilitate efficient simulations with a range
of benchmark programs, we have developed the MinneSPEC
input set for the SPEC CPU 2000 benchmark suite. This
new workload allows computer architects to obtain
simulation results in a reasonable time using existing
simulators. While the MinneSPEC workload is derived
from the standard SPEC CPU 2000 workload, it is a
valid benchmark suite in and of itself for
simulation-based research. MinneSPEC also may be used
to run large numbers of simulations to find ``sweet
spots'' in the evaluation parameter space. This small
number of promising design points subsequently may be
investigated in more detail with the full SPEC
reference workload. In the process of developing the
MinneSPEC datasets, we quantify their differences in
terms of function-level execution patterns, instruction
mixes, and memory behaviors compared to the SPEC
programs when executed with the reference inputs. We
find that for some programs, the MinneSPEC profiles
match the SPEC reference dataset program behavior very
closely. For other programs, however, the MinneSPEC
inputs produce significantly different program
behavior. The MinneSPEC workload has been recognized by
SPEC and is distributed with Version 1.2 and higher of
the SPEC CPU 2000 benchmark suite.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; Computer architecture;
Computer simulation",
}
@Article{Vandierendonck:2002:ATC,
author = "H. Vandierendonck and K. {De Bosschere}",
title = "An Address Transformation Combining Block- and
Word-Interleaving",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "8--8",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As future superscalar processors employ higher issue
widths, an increasing number of load/store instructions
needs to be executed each cycle to sustain high
performance. Multi-bank data caches attempt to address
this issue in a cost-effective way. A multi-bank cache
consists of multiple cache banks that each support one
load/store instruction per clock cycle. The
interleaving of cache blocks over the banks is of
primary importance. Two common choices are
block-interleaving and word-interleaving. Although
word-interleaving leads to higher IPC, it is more
expensive to implement than block-interleaving, since it
requires the tag array of the cache to be multi-ported.
By swapping the bits in the effective address that are
used by word-interleaving with those used by
block-interleaving, it is possible to implement a
word-interleaved cache with the same cost, cycle time
and power consumption as a block-interleaved cache.
Because this makes the L1 data cache blocks sparse,
additional costs are incurred at different levels of
the memory hierarchy.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Block-Interleaving; Clocks; Costs; Data cache; Energy
consumption; Interleaved codes; Multi-Banking;
Word-Interleaving.",
}
@Article{Tambat:2002:PLB,
author = "S. Tambat and S. Vajapeyam",
title = "Page-Level Behavior of Cache Contention",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "9--9",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cache misses in small, limited-associativity primary
caches very often replace live cache blocks, given the
dominance of capacity and conflict misses. Towards
motivating novel cache organizations, we study the
comparative characteristics of the virtual memory
address pairs involved in typical primary-cache
contention (block replacements) for the SPEC2000 integer
benchmarks. We focus on the cache tag bits, and results
show that (i) often just a few tag bits differ between
contending addresses, and (ii) accesses to certain
segments or page groups of the virtual address space
(i.e., certain tag-bit groups) contend frequently.
Cache conscious virtual address space allocation can
further reduce the number of conflicting tag bits. We
mention two directions for exploiting such page-level
contention patterns to improve cache cost and
performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Automation; Benchmark testing; Cache Contention; Cache
Tags; Computer science; Data Cache; Libraries; Memory
Access Characterization; Microprocessors; Optimizing
compilers; Traffic control; Workstations",
}
@Article{Juang:2002:IDT,
author = "Philo Juang and P. Diodato and S. Kaxiras and K.
Skadron and Zhigang Hu and M. Martonosi and D. W.
Clark",
title = "Implementing Decay Techniques using {4T} Quasi-Static
Memory Cells",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "10--10",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes the use of four-transistor (4T)
cache and branch predictor array cell designs to
address increasing worries regarding leakage power
dissipation. While 4T designs lose state when
infrequently accessed, they have very low leakage,
smaller area, and no capacitive loads to switch. This
short paper gives an overview of 4T implementation
issues and a preliminary evaluation of leakage-energy
savings that shows improvements of 60--80\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Circuit simulation; Delay; Leakage current; Libraries;
Microarchitecture; Power dissipation; Power generation;
Random access memory; Switches; Transistors",
}
@Article{Sohn:2002:RRE,
author = "YoungChul Sohn and NaiHoon Jung and Seungryoul Maeng",
title = "Request Reordering to Enhance the Performance of
Strict Consistency Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "11--11",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advances in ILP techniques enable strict consistency
models to relax memory order through speculative
execution of memory operations. However, ordering
constraints still hinder the performance because
speculatively executed operations cannot be committed
out of program order because of the possibility of
mis-speculation. In this paper, we propose a new
technique which allows memory operations to be
non-speculatively committed out of order without
violating consistency constraints.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ILP; memory consistency model; multiprocessor",
}
@Article{Shaw:2002:MSC,
author = "K. A. Shaw and W. J. Dally",
title = "Migration in Single Chip Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "1",
number = "1",
pages = "12--12",
month = jan,
year = "2002",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2002.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Global communication costs in future single-chip
multiprocessors will increase linearly with distance.
In this paper, we revisit the issues of locality and
load balance in order to take advantage of these new
costs. We present a technique which simultaneously
migrates data and threads based on vectors specifying
locality and resource usage. This technique improves
performance on applications with distinguishable
locality and imbalanced resource usage. 64\% of the
ideal reduction in execution time was achieved on an
application with these traits while no improvement was
obtained on a balanced application with little
locality.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cost function; Delay; Global communication;
Laboratories; Logic; Monitoring; Multiprocessing
systems; Wire",
}
@Article{Sihn:2003:SCS,
author = "K.-H. Sihn and Joonwon Lee and Jung-Wan Cho",
title = "A Speculative Coherence Scheme using Decoupling
Synchronization for Multiprocessor Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "1--1",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes a new speculative coherence
scheme, SCDS, for hardware distributed shared memory
systems to reduce the overhead of coherence action in
directory-based cache-coherence protocols. SCDS has two
main features, predicting accurate timing of
speculative coherence with synchronization information
and detecting write patterns (migratory and
non-migratory) for exclusive blocks' speculative
coherence action. In our simulation, SCDS outperforms
existing schemes (DSI and LTP) for well-synchronized
applications.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Access protocols; Coherence; Costs; Delay; Hardware;
Multiprocessing systems; Personal communication
networks; Runtime; Timing; Watches",
}
@Article{Kumar:2003:PPR,
author = "R. Kumar and K. Farkas and N. P. Jouppi and P.
Ranganathan and D. M. Tullsen",
title = "Processor Power Reduction Via Single-{ISA}
Heterogeneous Multi-Core Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "2--2",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes a single-ISA heterogeneous
multi-core architecture as a mechanism to reduce
processor power dissipation. It assumes a single chip
containing a diverse set of cores that target different
performance levels and consume different levels of
power. During an application's execution, system
software dynamically chooses the most appropriate core
to meet specific performance and power requirements. It
describes an example architecture with five cores of
varying performance and complexity. Initial results
demonstrate a five-fold reduction in energy at a cost
of only a 25\% loss in performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; chip multiprocessor; Computer
architecture; Computer science; Costs; Energy
consumption; Fans; low-power architecture; Packaging;
Power dissipation; Power engineering and energy; System
software",
}
@Article{Sendag:2003:ACE,
author = "R. Sendag and Peng-fei Chuang and D. J. Lilja",
title = "Address Correlation: Exceeding the Limits of
Locality",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "3--3",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We investigate a program phenomenon, Address
Correlation, which links addresses that reference the
same data. This work shows that different addresses
containing the same data can often be correlated at
run-time to eliminate a load miss or a partial hit. For
ten of the SPEC CPU2000 benchmarks, 57 to 99\% of all
L1 data cache load misses, and 4 to 85\% of all partial
hits, can be supplied from a correlated address already
found in the cache. Our source code-level analysis
shows that semantically equivalent information,
duplicated references, and frequent values are the
major causes of address correlations. We also show
that, on average, 68\% of the potential correlated
addresses that could supply data on a miss of an
address containing the same value can be correlated at
run time. These correlated addresses correspond to an
average of 62\% of all misses in the benchmark programs
tested.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Delay; Electronic mail; Hardware;
History; Microarchitecture; Object oriented modeling;
Out of order; Runtime; Tellurium",
}
@Article{Milenkovic:2003:SBT,
author = "A. Milenkovic and M. Milenkovic",
title = "Stream-Based Trace Compression",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "4--4",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Trace-driven simulation has long been used in both
processor and memory studies. The large size of traces
motivated different techniques for trace reduction.
These techniques often combine standard compression
algorithms with trace-specific solutions, taking into
account the tradeoff between reduction in the trace
size and simulation slowdown due to decompression. This
paper introduces SBC, a new algorithm for instruction
and data address trace compression based on instruction
streams. The proposed technique significantly reduces
trace size and simulation time, and it is orthogonal to
general compression algorithms. When combined with
gzip, SBC reduces the size of SPEC CPU2000 traces
94--71,968 times.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Compression algorithms; Computational
modeling; Computer architecture; Computer simulation;
Data mining; Information analysis; instruction and
address trace; Instruments; Predictive models;
Redundancy; simulation; trace compression",
}
@Article{Zhang:2003:WHC,
author = "Chuanjun Zhang and F. Vahid and Jun Yang and W.
Najjar",
title = "A Way-Halting Cache for Low-Energy High-Performance
Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "5--5",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We have designed a low power four-way set associative
cache that stores the four lowest-order bits of all
ways' tags into a fully associative memory, which we call
the halt tag array. The comparison of the halt tag
array with the desired tag occurs concurrently with the
address decoding that determines which tag and data
ways to read from. The halt tag array predetermines
most tags that cannot match due to their low-order four
bits mismatching. Further accesses to ways with known
mismatching tags are then halted, thus saving power.
Our halt tag array has the additional feature of using
static logic only, rather than dynamic logic used in
highly-associative caches, making our cache consume
even less power. Our results show 55\% savings of
memory-access-related energy over a conventional four-way
set-associative cache. We show nearly 2x energy savings
compared with highly associative caches, while imposing
no performance overhead and only 2\% cache area
overhead.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cams; Circuits; Computer science; Decoding; Design
engineering; Embedded computing; Logic arrays; Power
engineering and energy; Power engineering computing;
Switches",
}
@Article{Cohen:2003:EOP,
author = "A. Cohen and F. Finkelstein and A. Mendelson and R.
Ronen and D. Rudoy",
title = "On Estimating Optimal Performance of {CPU} Dynamic
Thermal Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "6--6",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper we focus on dynamic thermal management
(DTM) strategies that use dynamic voltage scaling
(DVS) for power control. We perform a theoretical
analysis targeted at estimating the optimal strategy,
and show two facts: (1) when there is a gap between the
initial and the limit temperatures, it is best to start
with a high (though not necessarily maximal) frequency
and decrease it exponentially until the limit
temperature is reached; (2) when close to the
limit temperature, the best strategy is to stay there.
We use the patterns exhibited by the optimal strategy
in order to analyze some existing DTM techniques.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Costs; DTM; DVS; Energy management; Frequency
estimation; Microprocessors; optimal control; Pattern
analysis; Performance analysis; Temperature control;
Temperature sensors; Thermal management; Voltage
control",
}
@Article{Cristal:2003:CRC,
author = "A. Cristal and J. F. Martinez and J. Llosa and M.
Valero",
title = "A case for resource-conscious out-of-order
processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "2",
number = "1",
pages = "7--7",
month = jan,
year = "2003",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2003.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Modern out-of-order processors tolerate long-latency
memory operations by supporting a large number of
in-flight instructions. This is achieved in part
through proper sizing of critical resources, such as
register files or instruction queues. In light of the
increasing gap between processor speed and memory
latency, tolerating upcoming latencies in this way
would require impractical sizes of such critical
resources. To tackle this scalability problem, we make
a case for resource-conscious out-of-order processors.
We present quantitative evidence that critical
resources are increasingly underutilized in these
processors. We advocate that better use of such
resources should be a priority in future research in
processor architectures.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bars; checkpointing.; Computer aided instruction;
Delay; instruction-level parallelism; Laboratories;
memory latency; Optimal control; Out of order;
Out-of-order processor; Queueing analysis; Registers;
Resource management; resource utilization; Voltage
control",
}
@Article{Citron:2004:ELE,
author = "D. Citron",
title = "Exploiting Low Entropy to Reduce Wire Delay",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "1--1",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Wires shrink less efficiently than transistors.
Smaller dimensions increase relative delay and the
probability of crosstalk. Solutions to this problem
include adding additional latency with pipelining,
using ``fat wires'' at higher metal levels, and
advances in process and material technology. We propose
a stopgap solution to this problem: applying a
decade-old technique called bus-expanding. By exploiting
the low spatial and temporal entropy of data, it is
possible to transfer m bits of data over an n-bit wide
bus in a single cycle (m > n). High-entropy data will be
routed directly over the bus, while low-entropy data
will be compacted using small lookup tables. A table
index will be transferred in the case of a successful
lookup; otherwise the full value will be transferred in
several cycles. Reducing the number of wires per bus
enables the use of wider wires, which in turn reduces
the wire delay. Examination of projected process
technologies shows that shrinking the number of bits in
a bus (from 64 to 48) instead of shrinking the
individual wires maintains a constant wire delay. Tests
on SPEC CPU2000 have shown that, for the 64-bit buses
leading from the L1 caches to the processor core, it is
possible to transfer all data types (addresses,
integers, instructions and floating-point values) using
40 bits per bus on average.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Area measurement; Compaction; Crosstalk; Delay;
Entropy; Materials science and technology; Pipeline
processing; Power measurement; Transistors; Wire",
}
@Article{Singh:2004:GAL,
author = "A. Singh and W. J. Dally and B. Towles and A. K.
Gupta",
title = "Globally Adaptive Load-Balanced Routing on Tori",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "2--2",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We introduce a new method of adaptive routing on k-ary
n-cubes, Globally Adaptive Load-Balance (GAL). GAL
makes global routing decisions using global
information. In contrast, most previous adaptive
routing algorithms make local routing decisions using
local information (typically channel queue depth). GAL
senses global congestion using segmented injection
queues to decide the directions to route in each
dimension. It further load balances the network by
routing in the selected directions adaptively. Using
global information, GAL achieves the performance
(latency and throughput) of minimal adaptive routing on
benign traffic patterns and performs as well as the
best obliviously load-balanced routing algorithm (GOAL)
on adversarial traffic.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Chaos; Delay; Nearest neighbor searches; Routing;
Stability; Switches; Telecommunication traffic;
Throughput; Tornadoes; Traffic control",
}
@Article{Gomez:2004:EFT,
author = "M. E. Gomez and J. Duato and J. Flich and P. Lopez and
A. Robles and N. A. Nordbotten and O. Lysne and T.
Skeie",
title = "An Efficient Fault-Tolerant Routing Methodology for
Meshes and Tori",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "3--3",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper we present a methodology to design
fault-tolerant routing algorithms for regular direct
interconnection networks. It supports fully adaptive
routing, does not degrade performance in the absence of
faults, and supports a reasonably large number of
faults without significantly degrading performance. The
methodology is mainly based on the selection of an
intermediate node (if needed) for each
source-destination pair. Packets are adaptively routed
to the intermediate node and, at this node, without
being ejected, they are adaptively forwarded to their
destinations. In order to allow deadlock-free minimal
adaptive routing, the methodology requires only one
additional virtual channel (for a total of three), even
for tori. Evaluation results for a 4 x 4 x 4 torus
network show that the methodology is 5-fault tolerant.
Indeed, for up to 14 link failures, the percentage of
fault combinations supported is higher than 99.96\%.
Additionally, network throughput degrades by less than
10\% when injecting three random link faults without
disabling any node. In contrast, a mechanism similar to
the one proposed in the BlueGene/L, which disables some
network planes, would strongly degrade network
throughput by 79\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; Circuit faults;
Degradation; Design methodology; Electronic mail; Fault
tolerance; Multiprocessor interconnection networks;
Routing; Switches; Throughput",
}
@Article{Stine:2004:CAR,
author = "J. M. Stine and N. P. Carter and J. Flich",
title = "Comparing Adaptive Routing and Dynamic Voltage Scaling
for Link Power Reduction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "4--4",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We compare techniques that dynamically scale the
voltage of individual network links to reduce power
consumption with an approach in which all links in the
network are set to the same voltage and adaptive
routing is used to distribute load across the network.
Our results show that adaptive routing with static
network link voltages outperforms dimension-order
routing with dynamic link voltages in all cases,
because the adaptive routing scheme can respond more
quickly to changes in network demand. Adaptive routing
with static link voltages also outperforms adaptive
routing with dynamic link voltages in many cases,
although dynamic link voltage scaling gives better
behavior as the demand on the network grows.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Clocks; Dynamic voltage scaling; Energy
consumption; Frequency; Network-on-a-chip; Routing;
Telecommunication traffic; Traffic control; Voltage
control",
}
@Article{Robatmili:2004:TSI,
author = "B. Robatmili and N. Yazdani and S. Sardashti and M.
Nourani",
title = "Thread-Sensitive Instruction Issue for {SMT}
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "5--5",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Simultaneous Multi Threading (SMT) is a processor
design method in which concurrent hardware threads
share processor resources like functional units and
memory. The scheduling complexity and performance of an
SMT processor depend on the topology used in the fetch
and issue stages. In this paper, we propose a thread
sensitive issue policy for a partitioned SMT processor
which is based on a thread metric. We propose the
number of ready-to-issue instructions of each thread as
the priority metric. To evaluate our method, we have
developed a reconfigurable SMT-simulator on top of the
SimpleScalar Toolset. We simulated our modeled
processor under several workloads composed of SPEC
benchmarks. Experimental results show around 30\%
improvement compared to the conventional OLDEST\_FIRST
mixed topology issue policy. Additionally, the hardware
implementation of our architecture with this metric in
the issue stage is quite simple.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; Delay; Frequency; Intrusion detection;
Laboratories; Logic; Processor scheduling;
Surface-mount technology; Topology",
}
@Article{Luo:2004:EES,
author = "Yue Luo and L. K. John",
title = "Efficiently Evaluating Speedup Using Sampled Processor
Simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "6--6",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cycle accurate simulation of processors is extremely
time consuming. Sampling can greatly reduce simulation
time while retaining good accuracy. Previous research
on sampled simulation has focused on the accuracy
of CPI. However, most simulations are used to evaluate
the benefit of some microarchitectural enhancement, in
which the speedup is a more important metric than CPI.
We employ the ratio estimator from statistical sampling
theory to design efficient sampling to measure speedup
and to quantify its error. We show that to achieve a
given relative error limit for speedup, it is not
necessary to estimate CPI to the same accuracy. In our
experiment, estimating speedup requires about 9X fewer
instructions to be simulated in detail in comparison to
estimating CPI for the same relative error limit.
Therefore using the ratio estimator to evaluate speedup
is much more cost-effective and offers great potential
for reducing simulation time. We also discuss the
reason for this interesting and important result.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; Clocks; Computational modeling;
Computer errors; Computer simulation; Frequency;
Microarchitecture; Sampling methods; Size measurement;
Velocity measurement",
}
@Article{Ceze:2004:CHL,
author = "L. Ceze and K. Strauss and J. Tuck and J. Renau and J.
Torrellas",
title = "{CAVA}: Hiding {L2} Misses with Checkpoint-Assisted
Value Prediction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "7--7",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Load misses in on-chip L2 caches often end up stalling
modern superscalars. To address this problem, we
propose hiding L2 misses with Checkpoint-Assisted VAlue
prediction (CAVA). When a load misses in L2, a
predicted value is returned to the processor. If the
missing load reaches the head of the reorder buffer
before the requested data is received from memory, the
processor checkpoints, consumes the predicted value,
and speculatively continues execution. When the
requested data finally arrives, it is compared to the
predicted value. If the prediction was correct,
execution continues normally; otherwise, execution
rolls back to the checkpoint. Compared to a baseline
aggressive superscalar, CAVA speeds up execution by a
geometric mean of 1.14 for SPECint and 1.34 for SPECfp
applications. Additionally, CAVA is faster than an
implementation of Runahead execution, and Runahead with
value prediction.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; Checkpointing; Costs; Delay;
Hardware; Microarchitecture; Out of order; Pipelines;
Prefetching; Recycling",
}
@Article{Singh:2004:BDB,
author = "A. Singh and W. J. Dally",
title = "Buffer and Delay Bounds in High Radix Interconnection
Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "8--8",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We apply recent results in queueing theory to propose
a methodology for bounding the buffer depth and packet
delay in high radix interconnection networks. While
most work in interconnection networks has been focused
on the throughput and average latency in such systems,
few studies have been done providing statistical
guarantees for buffer depth and packet delays. These
parameters are key in the design and performance of a
network. We present a methodology for calculating such
bounds for a practical high radix network and through
extensive simulations show its effectiveness for both
bursty and non-bursty injection traffic. Our results
suggest that modest speedups and buffer depths enable
reliable networks without flow control to be
constructed.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Convergence; Delay; Intelligent networks;
Multiprocessor interconnection networks; Queueing
analysis; Supercomputers; Switches; Telecommunication
traffic; Throughput; Traffic control",
}
@Article{Holloway:2004:CPS,
author = "A. L. Holloway and G. S. Sohi",
title = "Characterization of Problem Stores",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "3",
number = "1",
pages = "9--9",
month = jan,
year = "2004",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2004.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper introduces the concept of problem stores:
static stores whose dependent loads often miss in the
cache. Accurately identifying problem stores allows the
early determination of addresses likely to cause later
misses, potentially allowing for the development of
novel, proactive prefetching and memory hierarchy
management schemes. We present a detailed empirical
characterization of problem stores using the SPEC2000
CPU benchmarks. The data suggests several key
observations about problem stores. First, we find that
the number of important problem stores is typically
quite small; the worst 100 problem stores write the
values that will lead to about 90\% of non-cold misses
for a variety of cache configurations. We also find
that problem stores only account for 1 in 8 dynamic
stores, though they result in 9 of 10 misses.
Additionally, the problem stores' dependent loads miss
in the L2 cache a larger fraction of the time than
loads not dependent on problem stores. We also observe that
the set of problem stores is stable across a variety of
cache configurations. Finally, we found that the
instruction distance from problem store to miss and
problem store to evict is often greater than one
million instructions, but the value is often needed
within 100,000 instructions of the eviction.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Delay; Hardware; Memory management; Prefetching;
Proposals; Timing",
}
@Article{Sazeides:2005:DIB,
author = "Y. Sazeides and R. Kumar and D. M. Tullsen and T.
Constantinou",
title = "The Danger of Interval-Based Power Efficiency Metrics:
When Worst Is Best",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "4",
number = "1",
pages = "1--1",
month = jan,
year = "2005",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2005.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper shows that if the execution of a program is
divided into distinct intervals, it is possible for one
processor or configuration to provide the best power
efficiency over every interval, and yet have worse
overall power efficiency over the entire execution than
other configurations. This unintuitive behavior is a
result of a seemingly intuitive use of power efficiency
metrics, and can result in suboptimal design and
execution decisions. This behavior may occur when using
the energy-delay product and energy-delay-squared product
metrics but not with the energy metric.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Battery charge measurement; Clocks; Computer science;
Delay; Design optimization; Frequency; Out of order;
Power engineering and energy; Power measurement",
}
@Article{Mutlu:2005:RRP,
author = "O. Mutlu and Hyesoon Kim and J. Stark and Y. N. Patt",
title = "On Reusing the Results of Pre-Executed Instructions in
a Runahead Execution Processor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "4",
number = "1",
pages = "2--2",
month = jan,
year = "2005",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2005.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Previous research on runahead execution took it for
granted as a prefetch-only technique. Even though the
results of instructions independent of an L2 miss are
correctly computed during runahead mode, previous
approaches discarded those results instead of trying to
utilize them in normal mode execution. This paper
evaluates the effect of reusing the results of
preexecuted instructions on performance. We find that,
even with an ideal scheme, it is not worthwhile to
reuse the results of preexecuted instructions. Our
analysis provides insights into why result reuse does
not provide significant performance improvement in
runahead processors and concludes that runahead
execution should be employed as a prefetching mechanism
rather than a full-blown prefetching/result-reuse
mechanism.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computational modeling; Computer aided
instruction; Delay; Energy consumption;
Microprocessors; Performance analysis; Prefetching;
Registers",
}
@Article{Zhang:2006:BIC,
author = "Chuanjun Zhang",
title = "Balanced instruction cache: reducing conflict misses
of direct-mapped caches through balanced subarray
accesses",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "2--5",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "It is observed that the limited memory space of
direct-mapped caches is not used in a balanced manner and
therefore incurs extra conflict misses. We propose a novel cache
organization of a balanced cache, which balances
accesses to cache sets at the granularity of cache
subarrays. The key technique of the balanced cache is a
programmable subarray decoder through which the mapping
of memory reference addresses to cache subarrays can be
optimized, and hence conflict misses of direct-mapped caches
can be resolved. The experimental results show that the
miss rate of balanced cache is lower than that of the
same sized two-way set-associative caches on average
and can be as low as that of the same sized four-way
set-associative caches for particular applications.
Compared with previous techniques, the balanced cache
requires only one cycle to access all cache hits and
has the same access time as direct-mapped caches.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "balanced instruction cache; balanced subarray
accesses; Bridges; Cache memory; cache organization;
cache storage; Clocks; conflict miss reduction;
Decoding; Delay; Frequency; High performance computing;
programmable subarray decoder; storage allocation",
}
@Article{Ottoni:2006:SPC,
author = "G. Ottoni and R. Rangan and A. Stoler and M. J.
Bridges and D. I. August",
title = "From sequential programs to concurrent threads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "6--9",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Chip multiprocessors are of increasing importance due
to difficulties in achieving higher clock frequencies
in uniprocessors, but their success depends on finding
useful work for the processor cores. This paper
addresses this challenge by presenting a simple
compiler approach that extracts non-speculative
thread-level parallelism from sequential codes. We
present initial results from this technique targeting a
validated dual-core processor model, achieving speedups
ranging from 9--48\% with an average of 25\% for
important benchmark loops over their single-threaded
versions. We also identify important next steps found
during our pursuit of higher degrees of automatic
threading.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "automatic threading; Bridges; Clocks; Computer
science; concurrency control; concurrent threads;
Frequency; Hardware; Microprocessors; multi-threading;
nonspeculative thread-level parallelism; Parallel
processing; Pipeline processing; program compiler;
program compilers; Program processors; sequential
programs",
}
@Article{Gupta:2006:TOI,
author = "A. K. Gupta and W. J. Dally",
title = "Topology optimization of interconnection networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "10--13",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes an automatic optimization tool
that searches a family of network topologies to select
the topology that best achieves a specified set of
design goals while satisfying specified packaging
constraints. Our tool uses a model of signaling
technology that relates bandwidth, cost and distance of
links. This model captures the distance-dependent
bandwidth of modern high-speed electrical links and the
cost differential between electrical and optical links.
Using our optimization tool, we explore the design
space of hybrid Clos-torus (C-T) networks. For a
representative set of packaging constraints we
determine the optimal hybrid C-T topology to minimize
cost and the optimal C-T topology to minimize latency
for various packet lengths. We then use the tool to
measure the sensitivity of the optimal topology to
several important packaging constraints such as pin
count and critical distance",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Constraint optimization; Costs; Design
optimization; hybrid Clos-torus networks;
interconnection networks; Multiprocessor
interconnection networks; multistage interconnection
networks; Network topology; Optical fiber
communication; Packaging; signaling technology;
signalling; Space exploration; Space technology;
telecommunication network topology; topology
optimization tool",
}
@Article{Gaudiot:2006:F,
author = "J.-L. Gaudiot and Y. Patt and K. Skadon",
title = "Foreword",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "11--11",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Forward for issue 1 of 2006",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Computer Society; Concrete;
Delay; Footwear; Software libraries; Vehicles",
}
@Article{Morad:2006:PPE,
author = "T. Y. Morad and U. C. Weiser and A. Kolodnyt and M.
Valero and E. Ayguade",
title = "Performance, power efficiency and scalability of
asymmetric cluster chip multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "14--17",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper evaluates asymmetric cluster chip
multiprocessor (ACCMP) architectures as a mechanism to
achieve the highest performance for a given power
budget. ACCMPs execute serial phases of multithreaded
programs on large high-performance cores whereas
parallel phases are executed on a mix of large and many
small simple cores. Theoretical analysis reveals a
performance upper bound for symmetric multiprocessors,
which is surpassed by asymmetric configurations at
certain power ranges. Our emulations show that
asymmetric multiprocessors can reduce power consumption
by more than two thirds with similar performance
compared to symmetric multiprocessors",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ACCMP; Application software; asymmetric cluster chip
multiprocessors; Chip Multiprocessors; Emulation;
Frequency; microprocessor chips; multi-threading;
multiprocessing systems; multithreaded program;
Optimized production technology; Parallel processing;
parallel processing; power consumption reduction; power
efficiency; Power Efficiency; Power system modeling;
Queueing analysis; Scalability; Upper bound; Voltage",
}
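The performance upper bound mentioned in the abstract above can be illustrated with an Amdahl-style back-of-the-envelope comparison of symmetric and asymmetric configurations under a fixed area budget. The sketch below assumes a generic perf(r) = sqrt(r) core model and made-up parameters, in the style later popularized by Hill and Marty; it illustrates why asymmetry can win at certain operating points and is not the authors' model.

/* Back-of-the-envelope Amdahl-style comparison of symmetric vs.
 * asymmetric chip multiprocessors under a fixed area budget.
 * perf(r) = sqrt(r) and every parameter value below are assumptions
 * for illustration, not taken from the letter.  Compile with -lm. */
#include <math.h>
#include <stdio.h>

static double perf(double r) { return sqrt(r); }  /* assumed core performance vs. area */

int main(void)
{
    const double n = 64.0;  /* total budget, in small-core equivalents */
    const double r = 16.0;  /* area of each large core                 */
    const double f = 0.10;  /* serial fraction of the workload         */

    /* Symmetric: n/r identical large cores share the parallel phase. */
    double sym  = 1.0 / (f / perf(r) + (1.0 - f) * r / (n * perf(r)));
    /* Asymmetric: serial phase on one large core; parallel phase on
     * the large core plus n - r unit-size small cores.               */
    double asym = 1.0 / (f / perf(r) + (1.0 - f) / (perf(r) + n - r));

    printf("symmetric speedup  = %.2f\n", sym);   /* about 12.3 */
    printf("asymmetric speedup = %.2f\n", asym);  /* about 23.6 */
    return 0;
}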
@Article{Riley:2006:PCU,
author = "N. Riley and C. Zilles",
title = "Probabilistic counter updates for predictor hysteresis
and bias",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "18--21",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware predictor designers have incorporated
hysteresis and/or bias to achieve desired behavior by
increasing the number of bits per counter. Some
resulting proposed predictor designs are currently
impractical because their counter tables are too large.
We describe a method for dramatically reducing the
amount of storage required for a predictor's counter
table with minimal impact on prediction accuracy.
Probabilistic updates to counter state are implemented
using a hardware pseudo-random number generator to
increment or decrement counters a fraction of the time,
meaning fewer counter bits are required. We demonstrate
the effectiveness of probabilistic updates in the
context of Fields et al.'s critical path predictor,
which employs a biased 6-bit counter. Averaged across
the SPEC CINT2000 benchmarks, our 2-bit and 3-bit
probabilistic counters closely approximate a 6-bit
deterministic one (achieving speedups of 7.75\% and
7.91\% compared to 7.94\%) when used for
criticality-based scheduling in a clustered machine.
Performance degrades gracefully, enabling even a 1-bit
probabilistic counter to outperform the best 3-bit
deterministic counter we found",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; clustered machine; computer architecture;
Computer science; Costs; Counting circuits; critical
path predictor; criticality-based scheduling;
Degradation; Hardware; hardware predictor design;
hardware pseudorandom number generator; Hysteresis;
Microarchitecture; Pipelines; predictor bias; predictor
hysteresis; predictors counter table; probabilistic
counter update; probability; Processor scheduling;
processor scheduling; random number generation",
}
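The probabilistic update mechanism summarized in the abstract above lends itself to a small software model: a narrow saturating counter is incremented or decremented only a fraction of the time, so that in expectation it tracks a wider deterministic counter. The widths and update probability below are assumptions chosen for illustration; the letter implements the update decision with a hardware pseudo-random number generator.

/* Minimal software sketch of a probabilistic saturating counter.
 * An n-bit counter updated with probability 1/2^shift provides,
 * in expectation, the counting resolution of an (n+shift)-bit
 * deterministic counter.  Illustration only, not the hardware design. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct {
    uint8_t value;  /* current counter state               */
    uint8_t max;    /* saturation limit, e.g. 3 for 2 bits  */
    uint8_t shift;  /* update probability is 1 / 2^shift    */
} prob_counter;

static void prob_update(prob_counter *c, int increment)
{
    /* Apply the update only when the low PRNG bits are all zero. */
    if ((rand() & ((1u << c->shift) - 1u)) != 0)
        return;
    if (increment && c->value < c->max)
        c->value++;
    else if (!increment && c->value > 0)
        c->value--;
}

int main(void)
{
    /* A 2-bit counter (max 3) updated with probability 1/16 emulates,
     * in expectation, the resolution of a 6-bit deterministic counter. */
    prob_counter c = { 0, 3, 4 };
    for (int i = 0; i < 1000; i++)
        prob_update(&c, 1);
    printf("counter value after 1000 attempted increments: %u\n", c.value);
    return 0;
}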
@Article{Zhou:2006:CFT,
author = "Huiyang Zhou",
title = "A case for fault tolerance and performance enhancement
using chip multi-processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "22--25",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper makes a case for using multi-core
processors to simultaneously achieve transient-fault
tolerance and performance enhancement. Our approach is
extended from a recent latency-tolerance proposal,
dual-core execution (DCE). In DCE, a program is
executed twice in two processors, named the front and
back processors. The front processor pre-processes
instructions in a very fast yet highly accurate way and
the back processor re-executes the instruction stream
retired from the front processor. The front processor
runs faster as it has no correctness constraints
whereas its results, including timely prefetching and
prompt branch misprediction resolution, help the back
processor make faster progress. In this paper, we
propose to trust the speculative results of the front
processor and use them to check the non-speculative
results of the back processor. A discrepancy, either
due to a transient fault or a misspeculation, is then
handled with the existing misspeculation recovery
mechanism. In this way, both transient-fault tolerance
and performance improvement can be delivered
simultaneously with little hardware overhead",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "back processor; chip multiprocessors; Computer aided
software engineering; dual-core execution; Error
analysis; Fault tolerance; fault tolerant computing;
front processor; Hardware; latency-tolerance proposal;
microprocessor chips; mispeculation recovery mechanism;
Multicore processing; multiprocessing systems;
prefetching; Prefetching; prompt branch misprediction
resolution; Proposals; Redundancy; storage management;
Throughput; transient-fault tolerance; Transistors",
}
@Article{Lee:2006:ASC,
author = "Moon-Sang Lee and Sang-Kwon Lee and Joonwon Lee and
Seung-Ryoul Maeng",
title = "Adopting system call based address translation into
user-level communication",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "26--29",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "User-level communication alleviates the software
overhead of the communication subsystem by allowing
applications to access the network interface directly.
For that purpose, efficient address translation of
virtual address to physical address is critical. In
this study, we propose a system call based address
translation scheme where every translation is done by
the kernel instead of a translation cache on a network
interface controller as in the previous cache based
address translation. According to our experiments, our
scheme achieves up to 4.5\% reduction in application
execution time compared to the previous cache based
approach",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; cache based approach; cache
storage; Communication system software; Control
systems; Costs; Delay; Electronic mail; Hardware;
Kernel; network interface controller; network
interfaces; Network interfaces; operating system
kernels; Protocols; software overhead; system call
based address translation; user-level communication",
}
@Article{Ahn:2006:DPA,
author = "Jung Ho Ahn and W. J. Dally",
title = "Data parallel address architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "30--33",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Data parallel memory systems must maintain a large
number of outstanding memory references to fully use
increasing DRAM bandwidth in the presence of increasing
latency. At the same time, the throughput of modern
DRAMs is very sensitive to access patterns due to the
time required to precharge and activate banks and to
switch between read and write access. To achieve memory
reference parallelism a system may simultaneously issue
references from multiple reference threads.
Alternatively multiple references from a single thread
can be issued in parallel. In this paper, we examine
this tradeoff and show that allowing only a single
thread to access DRAM at any given time significantly
improves performance by increasing the locality of the
reference stream and hence reducing precharge/activate
operations and read/write turnaround. Simulations of
scientific and multimedia applications show that
generating multiple references from a single thread
gives, on average, 17\% better performance than
generating references from two parallel threads",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computer architecture; data parallel
address architecture; data parallel memory systems;
Delay; DRAM bandwidth; DRAM chips; Memory management;
parallel architectures; parallel memories; Parallel
processing; Random access memory; read access;
Scheduling; Streaming media; Switches; write access",
}
@Article{Eisley:2006:NCC,
author = "N. Eisley and Li-Shiuan Peh and Li Shang",
title = "In-network cache coherence",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "34--37",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We propose implementing cache coherence protocols
within the network, demonstrating how an in-network
implementation of the MSI directory-based protocol
allows for in-transit optimizations of read and write
delay. Our results show 15\% and 24\% savings on
average in memory access latency for SPLASH-2 parallel
benchmarks running on a $4 \times 4$ and a $16 \times 16$
multiprocessor, respectively",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Access protocols; benchmark testing; cache coherence;
cache storage; Coherence; Delay; delays; Fabrics;
interconnection network; memory access latency; Memory
architecture; memory architecture; memory protocols;
Moore's Law; MSI directory-based protocol;
Multiprocessor interconnection networks; network cache
coherence protocols; parallel processing; read delay;
SPLASH-2 parallel benchmarks; write delay",
}
@Article{Srinivasan:2006:PMU,
author = "R. Srinivasan and J. Cook and O. Lubeck",
title = "Performance modeling using {Monte Carlo} simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "1",
pages = "38--41",
month = jan,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/intel-ia-64.bib",
abstract = "Cycle accurate simulation has long been the primary
tool for micro-architecture design and evaluation.
Though accurate, it is slow, which often constrains
the extent of design exploration. In
this work, we propose a fast, accurate Monte-Carlo
based model for predicting processor performance. We
apply this technique to predict the CPI of in-order
architectures and validate it against the Itanium-2.
The Monte Carlo model uses microarchitecture-independent
application characteristics together with cache and
branch predictor statistics to predict CPI with an
average error of less than 7\%. Since prediction is
achieved in a few seconds, the model can be used for
fast design space exploration that can efficiently cull
the space for cycle-accurate simulations. Besides
accurately predicting CPI, the model also breaks down
CPI into various components, where each component
quantifies the effect of a particular stall condition
(branch misprediction, cache miss, etc.) on overall
CPI. Such a CPI decomposition can help processor
designers quickly identify and resolve critical
performance bottlenecks",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "branch predictor statistics; Computational modeling;
Computer architecture; CPI decomposition; design space
exploration; Error analysis; Itanium-2; Laboratories;
Mathematical analysis; memory architecture;
microarchitecture design; microarchitecture evaluation;
Monte Carlo methods; Monte Carlo simulation;
performance evaluation; Predictive models; Process
design; processor performance modeling; program
processors; Sampling methods; Space exploration",
}
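The Monte Carlo flavor of CPI modeling summarized above can be sketched in a few lines: for each simulated instruction, independent stall events are sampled from measured probabilities and their penalties are added to a base CPI, so the per-event terms also yield a CPI breakdown. Every probability and penalty in the sketch below is an assumed, illustrative value, not a statistic from the letter.

/* Sketch of Monte Carlo CPI estimation: sample per-instruction stall
 * events from assumed probabilities and accumulate their penalties
 * onto a base CPI.  All parameter values are illustrative only. */
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    const long   n_samples    = 1000000;  /* simulated instructions */
    const double base_cpi     = 1.0;      /* ideal in-order CPI     */
    const double p_l1_miss    = 0.05, l1_penalty     = 10.0;
    const double p_mispredict = 0.01, branch_penalty = 15.0;
    double total_cycles = 0.0;

    srand(42);
    for (long i = 0; i < n_samples; i++) {
        double cycles = base_cpi;
        if ((double)rand() / RAND_MAX < p_l1_miss)
            cycles += l1_penalty;        /* cache-miss component      */
        if ((double)rand() / RAND_MAX < p_mispredict)
            cycles += branch_penalty;    /* misprediction component   */
        total_cycles += cycles;
    }
    /* Expected value here is about 1.0 + 0.05*10 + 0.01*15 = 1.65. */
    printf("estimated CPI = %.3f\n", total_cycles / n_samples);
    return 0;
}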
@Article{Ergin:2006:ENV,
author = "O. Ergin and O. Unsal and X. Vera and A. Gonzalez",
title = "Exploiting Narrow Values for Soft Error Tolerance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "12--12",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Soft errors are an important challenge in contemporary
microprocessors. Particle hits on the components of a
processor are expected to create an increasing number
of transient errors with each new microprocessor
generation. In this paper we propose simple mechanisms
that effectively reduce the vulnerability to soft
errors in a processor. Our designs are generally
motivated by the fact that many of the values produced and
consumed in the processor are narrow and their
upper order bits are meaningless. Soft errors caused by
any particle strike to these higher order bits can be
avoided by simply identifying these narrow values.
Alternatively, soft errors can be detected or corrected
on the narrow values by replicating the vulnerable
portion of the value inside the storage space provided
for the upper order bits of these operands. We offer a
variety of schemes that make use of narrow values and
analyze their efficiency in reducing soft error
vulnerability of the level-1 data cache of the processor",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Cache storage; contemporary
microprocessors; data cache; Data Cache; Error
correction; error correction; Error Correction; error
correction; error detection; Hardware; Impurities;
Manufacturing; microprocessor chips; Microprocessors;
Multithreading; Narrow Values; narrow values; Neutrons;
particle strike; Process design; radiation effects;
Random access memory; soft error tolerance; Soft
Errors; system recovery; transient errors; transients",
}
@Article{Li:2006:PBH,
author = "W. Li and S. Mohanty and K. Kavi",
title = "A Page-based Hybrid (Software--Hardware) Dynamic
Memory Allocator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "13--13",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/java2000.bib",
abstract = "Modern programming languages often include complex
mechanisms for dynamic memory allocation and garbage
collection. These features drive the need for more
efficient implementation of memory management
functions, both in terms of memory usage and execution
performance. In this paper, we introduce a software and
hardware co-design to improve the speed of the software
allocator used in FreeBSD systems. The hardware
complexity of our design is independent of the dynamic
memory size, thus making the allocator suitable for any
memory size. Our design improves the performance of
memory management intensive benchmarks by as much as
43\%. To our knowledge, this is the first-ever work of
this kind, introducing a ``hybrid memory allocator''",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; Computer languages; Computer
science; Costs; Delay; Dynamic programming; garbage
collection; Hardware; hardware complexity;
hardware-software codesign; hybrid dynamic memory
allocator; Java; memory allocator; memory architecture;
memory management; Memory management; modern
programming languages; software allocator; Software
performance; software-hardware co-design;
software/hardware co-design; storage allocation;
storage management",
}
@Article{Donald:2006:EPP,
author = "J. Donald and M. Martonosi",
title = "An Efficient, Practical Parallelization Methodology
for Multicore Architecture Simulation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "14--14",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Multiple core designs have become commonplace in the
processor market, and are hence a major focus in modern
computer architecture research. Thus, for both product
development and research, multiple core processor
simulation environments are necessary. A well-known
positive feedback property of computer design is that
we use today's computers to design tomorrow's. Thus,
with the emergence of chip multiprocessors, it is
natural to re-examine simulation environments written
to exploit parallelism. In this paper we present a
programming methodology for directly converting
existing uniprocessor simulators into parallelized
multiple-core simulators. Our method not only takes
significantly less development effort compared to some
prior used programming techniques, but also possesses
advantages by retaining a modular and comprehensible
programming structure. We demonstrate our case with
actual developed products after applying this method to
two different simulators, one developed from IBM
Turandot and the other from the SimpleScalar tool set.
Our SimpleScalar-based framework achieves a parallel
speedup of $2.2 \times$ on a dual-CPU dual-core (4-way)
Opteron server",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "chip multiprocessors; comprehensible programming
structure; Computational modeling; Computer
architecture; Computer simulation; Feedback; IBM
Turandot; logic simulation; microcomputers; modern
computer architecture; modular programming structure;
multicore; multicore architecture simulation; Multicore
processing; multiple core processor simulation;
multiprocessing systems; Object oriented modeling;
parallel architectures; Parallel processing; Parallel
programming; parallelism; parallelization method;
parallelized multiple-core simulators; positive
feedback property; Process planning; Product
development; programming methodology; SimpleScalar tool
set; simulation",
}
@Article{Bracy:2006:DAC,
author = "A. Bracy and K. Doshi and Q. Jacobson",
title = "Disintermediated Active Communication",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "15--15",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Disintermediated active communication (DAC) is a new
paradigm of communication in which a sending thread
actively engages a receiving thread when sending it a
message via shared memory. DAC is different than
existing approaches that use passive communication
through shared-memory --- based on intermittently
checking for messages --- or that use preemptive
communication but must rely on intermediaries such as
the operating system or dedicated interrupt channels.
An implementation of DAC builds on existing cache
coherency support and exploits light-weight user-level
interrupts. Inter-thread communication occurs via
monitored memory locations where the receiver thread
responds to invalidations of monitored addresses with a
light-weight user-level software-defined handler.
Address monitoring is supported by cache line
user-bits, or CLUbits. CLUbits reside in the cache next
to the coherence state, are private per thread, and
maintain user-defined per-cache-line state. A light
weight software library can demultiplex asynchronous
notifications and handle exceptional cases. In
DAC-based programs threads coordinate with one another
by explicit signaling and implicit resource monitoring.
With the simple and direct communication primitives of
DAC, multi-threaded workloads synchronize at a finer
granularity and more efficiently utilize the hardware
of upcoming multi-core designs. This paper introduces
DAC, presents several signaling models for DAC-based
programs, and describes a simple memory-based framework
that supports DAC by leveraging existing
cache-coherency models. Our framework is general enough
to support uses beyond DAC",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address monitoring; cache coherency; cache line
user-bits; cache storage; CLUbits; Computer aided
instruction; Concurrent computing; disintermediated
active communication; Hardware; High performance
computing; interrupts; interthread communication;
memory locations; Monitoring; multi-threading;
multicore designs; Operating systems; Processor
scheduling; Programming profession; resource
monitoring; shared memory; shared memory systems;
signaling models; software libraries; Software
libraries; software library; storage allocation;
user-level interrupts",
}
@Article{Mallik:2006:UDF,
author = "A. Mallik and B. Lin and G. Memik and P. Dinda and R.
P. Dick",
title = "User-Driven Frequency Scaling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "16--16",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We propose and evaluate user-driven frequency scaling
(UDFS) for improved power management on processors that
support dynamic voltage and frequency scaling (DVFS),
e.g., those used in current laptop and desktop
computers. UDFS dynamically adapts CPU frequency to the
individual user and the workload through a simple user
feedback mechanism, unlike currently-used DVFS methods
which rely only on CPU utilization. Our UDFS algorithms
dramatically reduce typical operating frequencies while
maintaining performance at satisfactory levels for each
user. We evaluated our techniques through user studies
conducted on a Pentium M laptop running Windows
applications. The UDFS scheme reduces measured system
power by 22.1\%, averaged across all our users and
applications, compared to the Windows XP DVFS scheme",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Central Processing Unit; computer power supplies; CPU
frequency; DVFS; dynamic frequency scaling; Dynamic
voltage scaling; dynamic voltage scaling; Energy
consumption; Energy management; Engineering management;
Feedback; Frequency control; improved power management;
microprocessor chips; Pentium M laptop; Portable
computers; power aware computing; Power engineering
computing; Power Management; Power measurement; user
feedback mechanism; User-aware computing; user-driven
frequency scaling; Windows XP DVFS scheme",
}
@Article{Blundell:2006:STM,
author = "C. Blundell and E. C. Lewis and M. M. K. Martin",
title = "Subtleties of transactional memory atomicity
semantics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "17--17",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Transactional memory has great potential for
simplifying multithreaded programming by allowing
programmers to specify regions of the program that must
appear to execute atomically. Transactional memory
implementations then optimistically execute these
transactions concurrently to obtain high performance.
This work shows that the same atomic guarantees that
give transactions their power also have unexpected and
potentially serious negative effects on programs that
were written assuming narrower scopes of atomicity. We
make four contributions: (1) we show that a direct
translation of lock-based critical sections into
transactions can introduce deadlock into otherwise
correct programs, (2) we introduce the terms strong
atomicity and weak atomicity to describe the
interaction of transactional and non-transactional
code, (3) we show that code that is correct under weak
atomicity can deadlock under strong atomicity, and (4)
we demonstrate that sequentially composing
transactional code can also introduce deadlocks. These
observations invalidate the intuition that transactions
are strictly safer than lock-based critical sections,
that strong atomicity is strictly safer than weak
atomicity, and that transactions are always
composable",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer languages; Computer Systems Organization;
Concurrent distributed and parallel languages;
deadlock; direct translation; Hardware; Information
science; Interference; Interleaved codes; Language
Classifications; Law; lock-based critical sections;
Multi-core/single-chip multiprocessors;
multi-threading; Multiple Data Stream Architectures
(Multiprocessors); multithreaded programming;
nontransactional code; operating systems (computers);
Parallel Architectures; Processor Architectures;
program verification; Programming Languages;
Programming profession; sequentially composing
transactional code; Software performance;
Software/Software Engineering; strong atomicity; System
recovery; Transaction databases; transaction
processing; transactional memory atomicity semantics;
weak atomicity",
}
@Article{Price:2006:CCT,
author = "G. Price and M. Vachharajani",
title = "A Case for Compressing Traces with {BDDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "5",
number = "2",
pages = "18--18",
month = feb,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2006.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Instruction-level traces are widely used for program
and hardware analysis. However, program traces for just
a few seconds of execution are enormous, up to several
terabytes in size, uncompressed. Specialized
compression can shrink traces to a few gigabytes, but
trace analyzers typically stream the decompressed trace
through the analysis engine. Thus, the complexity of
analysis depends on the decompressed trace size (even
though the decompressed trace is never stored to disk).
This makes many global or interactive analyses
infeasible. This paper presents a method to compress
program traces using binary decision diagrams (BDDs).
BDDs intrinsically support operations common to many
desirable program analyses and these analyses operate
directly on the BDD. Thus, they are often polynomial in
the size of the compressed representation. The paper
presents mechanisms to represent a variety of trace
data using BDDs and shows that BDDs can store, in 1 GB
of RAM, the entire data-dependence graph of traces with
over 1 billion instructions. This allows rapid
computation of global analyses such as heap-object
liveness and dynamic slicing",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "binary decision diagrams; Binary decision diagrams;
Boolean functions; Data analysis; Data structures;
data-dependence graph; dynamic slicing; Engines; global
analyses; Hardware; hardware analysis; heap-object
liveness; instruction-level traces; Performance
analysis; Polynomials; program analysis; program
slicing; program traces; rapid computation; Read-write
memory; Software Engineering; Software Processor
validation Engineering; Software/Program Verification;
Software/Software; Software/Software Engineering;
specialized compression; Testing and Debugging; trace
analyzers; traces compression; Tracing; Validation;
Visualization",
}
@Article{MoretoPlanas:2007:EDC,
author = "M. {Moreto Planas} and F. Cazorla and A. Ramirez and
M. Valero",
title = "Explaining Dynamic Cache Partitioning Speed Ups",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "1--4",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cache partitioning has been proposed as an interesting
alternative to traditional eviction policies of shared
cache levels in modern CMP architectures: throughput is
improved at a reasonable cost. However,
these new policies present different behaviors
depending on the applications that are running in the
architecture. In this paper, we introduce some metrics
that characterize applications and allow us to give a
clear and simple model to explain final throughput
speed ups.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.b Cache memories; B.3.3 Performance
Analysis and Design Aids; C Computer Systems
Organization; C.1 Processor Architectures; C.1.4
Parallel Architectures; C.1.4.e Multi-core/single-chip
multiprocessors; C.1.5 Micro-architecture
implementation considerations; C.1.5.e Memory
hierarchy; C.4 Performance of Systems; C.4.d Modeling
techniques; cache storage; chip multiprocessing;
Computer architecture; Counting circuits; dynamic cache
partitioning; microprocessor chips; Parallel
processing; Process design; Resource management; shared
cache levels; Streaming media; Surface-mount
technology; Throughput; Uninterruptible power systems",
}
@Article{Jerger:2007:CSC,
author = "N. Enright Jerger and M. Lipasti and L. Peh",
title = "Circuit-Switched Coherence",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "5--8",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Circuit-switched networks can significantly lower the
communication latency between processor cores, when
compared to packet-switched networks, since once
circuits are set up, communication latency approaches
pure interconnect delay. However, if circuits are not
frequently reused, the long set up time and poorer
interconnect utilization can hurt overall performance.
To combat this problem, we propose a hybrid router
design which intermingles packet-switched flits with
circuit-switched flits. Additionally, we co-design a
prediction-based coherence protocol that leverages the
existence of circuits to optimize pair-wise sharing
between cores. The protocol allows pair-wise sharers to
communicate directly with each other via circuits and
drives up circuit reuse. Circuit-switched coherence
provides overall system performance improvements of up
to 17\% with an average improvement of 10\% and reduces
network latency by up to 30\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; C Computer Systems Organization; C.1
Processor Architectures; C.1.4 Parallel Architectures;
C.1.4.e Multi-core/single-chip multiprocessors; C.1.4.g
On-chip interconnection networks; C.1.5
Micro-architecture implementation considerations;
C.1.5.e Memory hierarchy; circuit switching;
circuit-switched network; Coupling circuits; Delay;
Fabrics; hybrid router design; Integrated circuit
interconnections; multiprocessor interconnection
networks; network latency; Network-on-a-chip; packet
switching; Packet switching; packet switching;
pair-wise sharing; Pipelines; prediction-based
coherence protocol; processor core; Protocols; routing
protocols; System performance",
}
@Article{Kodakara:2007:CRM,
author = "S. Kodakara and J. Kim and D. Lilja and D. Hawkins and
W. Hsu and P. Yew",
title = "{CIM}: a Reliable Metric for Evaluating Program Phase
Classifications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "9--12",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We propose the use of the confidence interval of
estimated mean (CIM), a metric based on statistical
sampling theory, to evaluate the quality of a given
phase classification and for comparing different phase
classification schemes. Previous research on phase
classification used the weighted average of coefficient
of variation (CoVwa) to estimate phase classification
quality. We found that the phase quality indicated by
CoVwa could be inconsistent across different phase
classifications. We explain the reasons behind this
inconsistency and demonstrate the inconsistency using
data from several SPEC CPU2000 benchmark programs. We
show that the confidence interval of estimated mean
(CIM) correctly estimates the quality of phase
classification with a meaningful statistical
interpretation.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; Benchmark Analysis; Clustering
algorithms; Computer architecture; computer
architecture; Computer integrated manufacturing;
confidence interval; estimated mean; estimation theory;
pattern classification; Phase Classification; Phase
detection; Phase estimation; Phase measurement; phase
quality estimation; program compilers; program
diagnostics; program phase classification; Quality
Metric; reliable metric; Sampling methods; sampling
methods; SPEC CPU2000 benchmark program; statistical
interpretation; Statistical Sampling; statistical
sampling theory; Statistics; Surges",
}
@Article{Dieter:2007:LCM,
author = "W. R. Dieter and A. Kaveti and H. G. Dietz",
title = "Low-Cost Microarchitectural Support for Improved
Floating-Point Accuracy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "13--16",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Some processors designed for consumer applications,
such as graphics processing units (GPUs) and the CELL
processor, promise outstanding floating-point
performance for scientific applications at commodity
prices. However, IEEE single precision is the most
precise floating-point data type these processors
directly support in hardware. Pairs of native
floating-point numbers can be used to represent a base
result and a residual term to increase accuracy, but
the resulting order of magnitude slowdown dramatically
reduces the price/performance advantage of these
systems. By adding a few simple microarchitectural
features, acceptable accuracy can be obtained with
relatively little performance penalty. To reduce the
cost of native-pair arithmetic, a residual register is
used to hold information that would normally have been
discarded after each floating-point computation. The
residual register dramatically simplifies the code,
providing both lower latency and better
instruction-level parallelism.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; B Hardware; B.2 Arithmetic and
Logic Structures; B.2.4 High-Speed Arithmetic; B.2.4.b
Cost/performance; C Computer Systems Organization; C.0
General; C.0.b Hardware/software interfaces; C.1
Processor Architectures; C.1.5 Micro-architecture
implementation considerations; CELL processor; computer
architecture; Costs; floating point arithmetic;
floating-point accuracy; Floating-point arithmetic; G
Mathematics of Computing; G.1 Numerical Analysis; G.1.0
General; G.1.0.e Multiple precision arithmetic;
Graphics; graphics processing units; Hardware; I
Computing Methodologies; I.3 Computer Graphics; I.3.1
Hardware Architecture; I.3.1.a Graphics processors;
IEEE single precision; instruction-level parallelism;
microarchitectural support; Microarchitecture; parallel
processing; Pipelines; Registers; Software algorithms;
Software performance",
}
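The native-pair representation mentioned in the abstract above pairs a base value with a residual term; in software this corresponds to the classic two-sum error-free transformation, whose error term is exactly the kind of information the proposed residual register would retain in hardware. The sketch below is a generic software illustration assuming strict IEEE-754 rounding, not the microarchitecture described in the letter.

/* Generic sketch of native-pair ("head + residual") addition using
 * Knuth's two-sum error-free transformation.  The lo field holds what
 * a residual register would capture in hardware.  Assumes strict
 * IEEE-754 rounding (do not compile with -ffast-math). */
#include <stdio.h>

typedef struct { float hi; float lo; } float_pair;

/* Returns a + b exactly as hi + lo, where hi = round(a + b). */
static float_pair two_sum(float a, float b)
{
    float_pair r;
    r.hi = a + b;
    float b_virtual = r.hi - a;
    float a_virtual = r.hi - b_virtual;
    r.lo = (a - a_virtual) + (b - b_virtual);
    return r;
}

int main(void)
{
    /* 1.0f is rounded away when added to 1.0e8f in single precision;
     * the residual term recovers it. */
    float_pair s = two_sum(1.0e8f, 1.0f);
    printf("hi = %g, lo = %g\n", (double)s.hi, (double)s.lo);
    return 0;
}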
@Article{Etsion:2007:PPT,
author = "Y. Etsion and D. G. Feitelson",
title = "Probabilistic Prediction of Temporal Locality",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "17--20",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The increasing gap between processor and memory
speeds, as well as the introduction of multi-core CPUs,
have exacerbated the dependency of CPU performance on
the memory subsystem. This trend motivates the search
for more efficient caching mechanisms, enabling both
faster service of frequently used blocks and decreased
power consumption. In this paper we describe a novel,
random sampling based predictor that can distinguish
transient cache insertions from non-transient ones. We
show that this predictor can identify a small set of
data cache resident blocks that service most of the
memory references, thus serving as a building block for
new cache designs and block replacement policies.
Although we only discuss the L1 data cache, we have
found this predictor to be efficient also when handling
L1 instruction caches and shared L2 caches.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.b Cache memories; B.3.3 Performance
Analysis and Design Aids; cache storage; Computer
science; Data analysis; data cache; Distributed
computing; Energy consumption; Extraterrestrial
phenomena; memory subsystem; multi-core CPU; power
aware computing; probabilistic prediction; random
sampling; Sampling methods; temporal locality;
transient cache insertions; Visualization",
}
@Article{Guz:2007:NCO,
author = "Z. Guz and I. Keidar and A. Kolodny and U. Weiser",
title = "{Nahalal}: Cache Organization for Chip
Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "1",
pages = "21--24",
month = jan,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper addresses cache organization in chip
multiprocessors (CMPs). We show that in CMP systems it
is valuable to distinguish between shared data, which
is accessed by multiple cores, and private data
accessed by a single core. We introduce Nahalal, an
architecture whose novel floorplan topology partitions
cached data according to its usage (shared versus
private data), and thus enables fast access to shared
data for all processors while preserving the vicinity
of private data to each processor. Nahalal exhibits
significant improvements in cache access latency
compared to a traditional cache design.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Cache memories; cache organization; cache
storage; chip multiprocessors; circuit layout; CMP
systems; Computer integrated manufacturing; Computer
Systems Organization; Design Styles; floorplan topology
partitions; Hardware; Memory Structures; microprocessor
chips; Multi-core/single-chip multiprocessors; Nahalal;
Parallel Architectures; Processor Architectures;
Writing",
}
@Article{Joao:2007:DPI,
author = "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
title = "Dynamic Predication of Indirect Jumps",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "25--28",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Indirect jumps are used to implement
increasingly-common programming language constructs
such as virtual function calls, switch-case statements,
jump tables, and interface calls. Unfortunately, the
prediction accuracy of indirect jumps has remained low
because many indirect jumps have multiple targets that
are difficult to predict even with specialized
hardware. This paper proposes a new way of handling
hard-to-predict indirect jumps: dynamically predicating
them. The compiler identifies indirect jumps that are
suitable for predication along with their control-flow
merge (CFM) points. The microarchitecture predicates
the instructions between different targets of the jump
and its CFM point if the jump turns out to be
hard-to-predict at run time. We describe the new
indirect jump predication architecture, provide code
examples showing why it could reduce the performance
impact of jumps, derive an analytical cost-benefit
model for deciding which jumps and targets to
predicate, and present preliminary evaluation
results.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; Analytical models; and statically-scheduled
implementation; Computer languages; Computer Systems
Organization; control-flow merge point;
dynamically-scheduled; dynamically-scheduled and
statically-scheduled implementation; hard-to-predict
indirect jump handling; Hardware; Instruction fetch;
Instruction sets; interface call; jump table;
Micro-architecture implementation considerations;
Microarchitecture; microarchitecture dynamic
predication; Object oriented modeling; parallel
architectures; Performance analysis; Pipeline
processors; Pipelines; Processor Architectures; program
compiler; program compilers; program control
structures; programming language construct; Single Data
Stream Architectures; Superscalar; switch-case
statement; Switches; system monitoring; virtual
function call",
}
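For readers unfamiliar with the construct the letter above targets, the following sketch shows source code whose dispatch compilers typically lower to an indirect jump or indirect call with multiple data-dependent targets, which is exactly the hard-to-predict case motivating dynamic predication. The example only illustrates the problem; it does not reproduce the predication mechanism or the cost-benefit model.

/* Illustration of indirect-jump-heavy code: the handler chosen each
 * iteration depends on the data, so the call below becomes an indirect
 * call whose target is hard to predict.  Example of the problem only. */
#include <stdio.h>

typedef int (*handler_t)(int);

static int add_one(int x) { return x + 1; }
static int negate(int x)  { return -x; }
static int square(int x)  { return x * x; }

int main(void)
{
    handler_t table[3] = { add_one, negate, square };  /* jump-table analogue    */
    int opcodes[8]     = { 0, 2, 1, 0, 2, 2, 1, 0 };   /* data-dependent targets */
    int acc = 3;

    for (int i = 0; i < 8; i++)
        acc = table[opcodes[i]](acc);  /* indirect call: multiple targets */

    printf("result = %d\n", acc);
    return 0;
}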
@Article{Das:2007:MMC,
author = "A. Das and S. Ozdemir and G. Memik and J. Zambreno and
A. Choudhary",
title = "Microarchitectures for Managing Chip Revenues under
Process Variations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "29--32",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As transistor feature sizes continue to shrink into
the sub-90 nm range and beyond, the effects of process
variations on critical path delay and chip yields have
amplified. A common concept to remedy the effects of
variation is speed-binning, by which chips from a
single batch are rated by a discrete range of
frequencies and sold at different prices. In this
paper, we discuss strategies to modify the number of
chips in different bins and hence enhance the profits
obtained from them. Particularly, we propose a scheme
that introduces a small Substitute Cache associated
with each cache way to replicate the data elements that
will be stored in the high latency lines. Assuming a
fixed pricing model, this method increases the revenue
by as much as 13.8\% without any impact on the
performance of the chips.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cache Memories; cache memory; cache storage; Circuits;
Computer Architecture; computer architecture; Computer
Architecture; Computer architecture; critical path
delay; Fabrication; Fault-tolerant Computing.; fixed
pricing model; Frequency; Logic arrays;
Microarchitecture; microarchitecture chip;
microprocessor chips; Microprocessors; optimisation;
process variation; Process Variations; Registers; Size
control; Voltage control",
}
@Article{Zebchuk:2007:BBC,
author = "J. Zebchuk and A. Moshovos",
title = "A Building Block for Coarse-Grain Optimizations in the
On-Chip Memory Hierarchy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "33--36",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Current on-chip block-centric memory hierarchies
exploit access patterns at the fine-grain scale of
small blocks. Several recently proposed memory
hierarchy enhancements for coherence traffic reduction
and prefetching suggest that additional useful patterns
emerge with a macroscopic, coarse-grain view. This
paper presents RegionTracker, a dual-grain, on-chip
cache design that exposes coarse-grain behavior while
maintaining block-level communication. RegionTracker
eliminates the extraneous, often imprecise coarse-grain
tracking structures of previous proposals. It can be
used as the building block for coarse-grain
optimizations, reducing their overall cost and easing
their adoption. Using full-system simulation of a
quad-core chip multiprocessor and commercial workloads,
we demonstrate that RegionTracker overcomes the
inefficiencies of previous coarse-grain cache designs.
We also demonstrate how RegionTracker boosts the
benefits and reduces the cost of a previously proposed
snoop reduction technique.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access patterns; Bandwidth; cache storage; Cache
storage; coarse-grain optimizations; coherence traffic
reduction; Cost function; Design optimization;
Explosions; Information management; Memory management;
Multithreading; on-chip memory hierarchy; optimising
compilers; Prefetching; prefetching; Proposals;
quad-core chip multiprocessor; RegionTracker dual-grain
on-chip cache design; system-on-chip",
}
@Article{Kim:2007:FBT,
author = "J. Kim and J. Balfour and W. J. Dally",
title = "Flattened Butterfly Topology for On-Chip Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "37--40",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "With the trend towards increasing number of cores in a
multicore processors, the on-chip network that connects
the cores needs to scale efficiently. In this work, we
propose the use of high-radix networks in on-chip
networks and describe how the flattened butterfly
topology can be mapped to on-chip networks. By using
high-radix routers to reduce the diameter of the
network, the flattened butterfly offers lower latency
and energy consumption than conventional on-chip
topologies. In addition, by properly using bypass
channels in the flattened butterfly network,
non-minimal routing can be employed without increasing
latency or energy consumption.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computer networks; Delay; Energy
consumption; flattened butterfly; flattened butterfly
topology; high-radix networks; high-radix routers;
Laboratories; Multicore processing; multicore
processors; Multiprocessor interconnection networks;
Network topology; network topology; Network-on-a-chip;
network-on-chip; on-chip networks; Routing; topology",
}
@Article{Xiao:2007:NPD,
author = "X. Xiao and J. Lee",
title = "A Novel Parallel Deadlock Detection Algorithm and
Hardware for Multiprocessor System-on-a-Chip",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "41--44",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Given the projected dramatic increase in the number of
processors and resources in a system-on-a-chip, a
quadratic increase in the likelihood of deadlock is
predicted due to complex system behavior. To deal with
this issue, we here present a novel parallel
hardware-oriented deadlock detection algorithm with $
O(1) $ deadlock detection and $ O(\min (m, n)) $
preparation, where $m$ and $n$ are the numbers of
processes and resources, respectively. Our
contributions are (i) the first $ O(1)$ deadlock
detection hardware implementation and (ii) a new
algorithmic method of achieving $ O(\min (m, n))$
overall run-time complexity. We implement our algorithm
in Verilog HDL and demonstrate that deadlock detection
always takes only two clock cycles regardless of the
size of a system (i.e., $m$ and $n$).",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithms implemented in hardware; computational
complexity; deadlock detection hardware; Deadlocks;
Detection algorithms; Hardware design languages;
microprocessor chips; Multiprocessing systems;
multiprocessing systems; multiprocessor
system-on-a-chip; operating systems (computers);
Parallel algorithms; parallel algorithms; parallel
deadlock detection algorithm; Processor scheduling;
Real time systems; Real-time and embedded systems;
Resource management; run-time complexity; Runtime;
Software performance; System recovery; system-on-chip",
}
@Article{August:2007:UOS,
author = "D. August and J. Chang and S. Girbal and D.
Gracia-Perez and G. Mouchard and D. A. Penry and O.
Temam and N. Vachharajani",
title = "{UNISIM}: an Open Simulation Environment and Library
for Complex Architecture Design and Collaborative
Development",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "45--48",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Simulator development is already a huge burden for
many academic and industry research groups; future
complex or heterogeneous multi-cores, as well as the
multiplicity of performance metrics and required
functionality, will make matters worse. We present a
new simulation environment, called UNISIM, which is
designed to rationalize simulator development by making
it possible and efficient to distribute the overall
effort over multiple research groups, even without
direct cooperation. UNISIM achieves this goal with a
combination of modular software development,
distributed communication protocols, multilevel
abstract modeling, interoperability capabilities, a set
of simulator services APIs, and an open
library/repository for providing a consistent set of
simulator modules.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "API; application program interfaces; Collaboration;
collaborative development; complex architecture design;
Computational modeling; Computer architecture; Computer
industry; Computer science; Design engineering;
distributed communication protocols; groupware;
interoperability capability; Libraries; Measurement;
modular software development; multilevel abstract
modeling; open library; open repository; open
simulation environment; open systems; Operating
systems; Performance and Reliability; Processor
Architectures; Programming; simulator development;
simulator modules; simulator services; software
architecture; UNISIM",
}
@Article{Sendag:2007:BMP,
author = "R. Sendag and J. Yi and P. Chuang",
title = "Branch Misprediction Prediction: Complementary Branch
Predictors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "49--52",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we propose a new class of branch
predictors, complementary branch predictors, which can
be easily added to any branch predictor to improve the
overall prediction accuracy. This mechanism differs
from conventional branch predictors in that it focuses
only on mispredicted branches. As a result, this
mechanism has the advantages of scalability and
flexibility (it can be implemented with any branch
predictor) and is not on the critical path. More
specifically, this mechanism improves the branch
prediction accuracy by predicting which future branch
will be mispredicted next and when that will occur, and
then it changes the predicted direction at the
predicted time. Our results show that a branch
predictor with the branch misprediction predictor
achieves the same prediction accuracy as a conventional
branch predictor that is 4 to 16 times larger, but
without significantly increasing the overall complexity
or lengthening the critical path.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; branch misprediction prediction; branch
predictor; computational complexity; Computer networks;
Costs; Delay; Emerging technologies; History; parallel
architectures; Performance loss; Pipeline processors;
Pipelines; Prediction algorithms; Scalability;
Testing",
}
@Article{Yalcin:2007:UTM,
author = "G. Yalcin and O. Ergin",
title = "Using tag-match comparators for detecting soft
errors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "6",
number = "2",
pages = "53--56",
month = feb,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Soft errors caused by high energy particle strikes are
becoming an increasingly important problem in
microprocessor design. With increasing transistor
density and die sizes, soft errors are expected to be a
larger problem in the near future. Recovering from
these unexpected faults may be possible by reexecuting
some part of the program only if the error can be
detected. Therefore it is important to come up with new
techniques to detect soft errors and increase the
number of errors that are detected. Modern
microprocessors employ out-of-order execution and
dynamic scheduling logic. Comparator circuits, which
are used to keep track of data dependencies, are
usually idle. In this paper, we propose various schemes
to exploit on-chip comparators to detect transient
faults. Our results show that around 50\% of the errors
in the wakeup logic can be detected with minimal
hardware overhead by using the proposed techniques.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "and Fault-Tolerance; Broadcasting; Circuit faults;
comparators (circuits); Computer errors; Control
Structure Reliability; dynamic scheduling logic;
Electrical fault detection; Fault detection;
identification technology; Logic; logic design; logic
testing; microprocessor chips; microprocessor design;
Microprocessors; Out of order; out-of-order execution;
Pipelines; Processor Architectures; Registers;
scheduling; soft error detection; tag-match comparator;
Testing; Testing and Fault-Tolerance",
}
@Article{Joao:2008:DPI,
author = "J. A. Joao and O. Mutlu and H. Kim and Y. N. Patt",
title = "Dynamic Predication of Indirect Jumps",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "1--4",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Indirect jumps are used to implement increasingly
common programming language constructs such as virtual
function calls, switch-case statements, jump tables,
and interface calls. Unfortunately, the prediction
accuracy of indirect jumps has remained low because
many indirect jumps have multiple targets that are
difficult to predict even with specialized hardware.
This paper proposes a new way of handling
hard-to-predict indirect jumps: dynamically predicating
them. The compiler identifies indirect jumps that are
suitable for predication along with their control-flow
merge (CFM) points. The microarchitecture predicates
the instructions between different targets of the jump
and its CFM point if the jump turns out to be
hard-to-predict at run time. We describe the new
indirect jump predication architecture, provide code
examples showing why it could reduce the performance
impact of jumps, derive an analytical cost-benefit
model for deciding which jumps and targets to
predicate, and present preliminary evaluation
results.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; Analytical models; B Hardware; B.3 Memory
Structures; Cache memories; Computer languages;
Computer Systems Organization; Design Styles; Hardware;
Instruction sets; Microarchitecture;
Multi-core/single-chip multiprocessors; Object oriented
modeling; Parallel Architectures; Performance analysis;
Pipelines; Processor Architectures; Switches",
}
@Article{Das:2008:MMC,
author = "A. Das and S. Ozdemir and G. Memik and J. Zambreno and
A. Choudhary",
title = "Microarchitectures for Managing Chip Revenues under
Process Variations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "5--8",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As transistor feature sizes continue to shrink into
the sub-90nm range and beyond, the effects of process
variations on critical path delay and chip yields have
amplified. A common approach to mitigating the effects of
variation is speed-binning, in which chips from a
single batch are rated at a discrete set of
frequencies and sold at different prices. In this
paper, we discuss strategies to modify the number of
chips in different bins and hence enhance the profits
obtained from them. Particularly, we propose a scheme
that introduces a small substitute cache associated
with each cache way to replicate the data elements that
will be stored in the high latency lines. Assuming a
fixed pricing model, this method increases the revenue
by as much as 13.8\% without any impact on the
performance of the chips.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Cache Memories; Computer Architecture; Computer
architecture; Cost function; Delay effects; Design
optimization; Fabrication; Fault-tolerant Computing.;
Frequency; Manufacturing; Microarchitecture; Pricing;
Process Variations; Transistors",
}
@Article{Roth:2008:PRR,
author = "A. Roth",
title = "Physical register reference counting",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "9--12",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Several proposed techniques including CPR (checkpoint
processing and recovery) and NoSQ (no store queue) rely
on reference counting to manage physical registers.
However, the register reference counting mechanism
itself has received surprisingly little attention. This
paper fills this gap by describing potential register
reference counting schemes for NoSQ, CPR, and a
hypothetical NoSQ/CPR hybrid. Although previously
described in terms of binary counters, we find that
reference counts are actually more naturally
represented as matrices. Binary representations can be
used as an optimization in specific situations.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "and statically-scheduled implementation; binary
representations; checkpoint processing; checkpointing;
Counting circuits; dynamically-scheduled;
dynamically-scheduled and statically-scheduled
implementation; Engines; Information science; matrices;
Micro-architecture implementation considerations;
Microarchitecture; no store queue; physical register
reference counting; Physics computing; Proposals;
recovery technique; Registers; shift registers;
Superscalar",
}
@Article{Flich:2008:LBD,
author = "J. Flich and J. Duato",
title = "Logic-Based Distributed Routing for {NoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "13--16",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "The design of scalable and reliable interconnection
networks for multicore chips (NoCs) introduces new
design constraints like power consumption, area, and
ultra low latencies. Although 2D meshes are usually
proposed for NoCs, heterogeneous cores, manufacturing
defects, hard failures, and chip virtualization may
lead to irregular topologies. In this context,
efficient routing becomes a challenge. Although
switches can be easily configured to support most
routing algorithms and topologies by using routing
tables, this solution does not scale in terms of
latency and area. We propose a new circuit that removes
the need for using routing tables. The new mechanism,
referred to as logic-based distributed routing (LBDR),
enables the implementation in NoCs of many routing
algorithms for most of the practical topologies we
might find in the near future in a multicore chip. From
an initial topology and routing algorithm, a set of
three bits per switch output port is computed. By using
a small logic block, LBDR mimics (as demonstrated by
our evaluation) the behavior of routing algorithms
implemented with routing tables. This result is
achieved both in regular and irregular topologies.
Therefore, LBDR removes the need for using routing
tables for distributed routing, thus enabling flexible,
fast and power-efficient routing in NoCs.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "chip virtualization; circuit reliability; Circuit
topology; Delay; Energy consumption; heterogeneous
cores; interconnection network reliability;
interconnections; logic circuits; logic-based
distributed routing; Manufacturing; manufacturing
defects; Multi-core/single-chip multiprocessors;
Multicore processing; Multiprocessor interconnection
networks; network routing; network topology; Network
topology; Network-on-a-chip; network-on-chip; networks
for multicore chips; NoC; On-chip interconnection
networks; Routing; Switches",
}
@Article{Yoon:2008:CHP,
author = "J. H. Yoon and E. H. Nam and Y. J. Seong and H. Kim
and B. Kim and S. L. Min and Y. Cho",
title = "{Chameleon}: a High Performance Flash\slash {FRAM}
Hybrid Solid State Disk Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "17--20",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Flash memory solid state disk (SSD) is gaining
popularity and replacing hard disk drive (HDD) in
mobile computing systems such as ultra mobile PCs
(UMPCs) and notebook PCs because of lower power
consumption, faster random access, and higher shock
resistance. One of the key challenges in designing a
high-performance flash memory SSD is an efficient
handling of small random writes to non-volatile data
whose performance suffers from the inherent limitation
of flash memory that prohibits in-place update. In this
paper, we propose a high performance Flash/FRAM hybrid
SSD architecture called Chameleon. In Chameleon,
metadata used by the flash translation layer (FTL), a
software layer in the flash memory SSD, is maintained
in a small FRAM since this metadata is a target of
intensive small random writes, whereas the bulk data is
kept in the flash memory. Performance evaluation based
on an FPGA implementation of the Chameleon architecture
shows that the use of FRAM in Chameleon improves the
performance by 21.3\%. The results also show that even
for bulk data that cannot be maintained in FRAM because
of the size limitation, the use of fine-grained write
buffering is critically important because of the
inability of flash memory to perform in-place update of
data.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Chameleon; Computer architecture; Design studies; disc
drives; Energy consumption; Ferroelectric films; field
programmable gate arrays; flash memories; Flash memory;
flash memory solid state disk; flash translation layer;
flash-FRAM hybrid SSD architecture; FPGA
implementation; FTL; hard discs; hard disk drive; Hard
disks; HDD; Mass storage; memory architecture; Mobile
computing; mobile computing systems; Nonvolatile
memory; notebook PCs; Personal communication networks;
Random access memory; random-access storage; Solid
state circuits; SSD; ultra mobile PCs; UMPC",
}
@Article{Biswas:2008:CAA,
author = "A. Biswas and P. Racunas and J. Emer and S.
Mukherjee",
title = "Computing Accurate {AVFs} using {ACE} Analysis on
Performance Models: a Rebuttal",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "21--24",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "ACE (architecturally correct execution) analysis
computes AVFs (architectural vulnerability factors) of
hardware structures. AVF expresses the fraction of
radiation-induced transient faults that result in
user-visible errors. Architects usually perform this
analysis on a high-level performance model to quickly
compute per-structure AVFs. If, however, low-level
details of a microarchitecture are not modeled
appropriately, then their effects may not be reflected
in the per-structure AVFs. In this paper we refute
Wang, et al.'s (2007) claim that this detail is
difficult to model and imposes a practical threshold on
ACE analysis that forces its estimates to have a high
error margin. We show that carefully choosing a small
amount of additional detail can result in a much
tighter AVF bound than Wang, et al. were able to
achieve in their refined ACE analysis. Even the
inclusion of small details, such as read/write pointers
and appropriate inter-structure dependencies, can
increase the accuracy of the AVF computation by 40\% or
more. We argue that this is no different than modeling
the IPC (instructions per cycle) of a microprocessor
pipeline. A less detailed performance model will
provide less accurate IPCs. AVFs are no different.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "and Fault-Tolerance; architectural vulnerability
factors; architecturally correct execution analysis;
Computational modeling; Hardware; hardware structures;
High performance computing; instructions per cycle;
inter-structure dependencies; Microarchitecture;
microprocessor pipeline; Microprocessors; Performance
analysis; Performance and Reliability; performance
evaluation; performance models; Pipelines; Protection;
radiation-induced transient faults; read pointers;
Reliability; Target tracking; Testing; Testing and
Fault-Tolerance; user-visible errors; write pointers",
}
@Article{Cho:2008:CAL,
author = "S. Cho and R. Melhem",
title = "Corollaries to {Amdahl's Law} for Energy",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "25--28",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2007.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper studies the important interaction between
parallelization and energy consumption in a
parallelizable application. Given the ratio of serial
and parallel portion in an application and the number
of processors, we first derive the optimal frequencies
allocated to the serial and parallel regions in the
application to minimize the total energy consumption,
while the execution time is preserved (i.e., speedup =
1). We show that the dynamic energy improvement due to
parallelization grows faster with the number of
processors than the speedup function given by the
well-known Amdahl's Law. Furthermore, we determine the
conditions under
which one can obtain both energy and speed improvement,
as well as the amount of improvement. The formulas we
obtain capture the fundamental relationship between
parallelization, speedup, and energy consumption and
can be directly utilized in energy aware processor
resource management. Our results form a basis for
several interesting research directions in the area of
power and energy aware parallel processing.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Amdahl's Law; Application software; Computer science;
Concurrent computing; dynamic energy improvement;
energy aware processor resource management; Energy
capture; energy consumption; Energy consumption; energy
consumption; Energy management; Equations; Hardware;
Parallel Architectures; parallel processing; Parallel
processing; parallelization; Power Management; Radio
spectrum management; Resource management",
}
@Article{Balfour:2008:EEP,
author = "J. Balfour and W. Dally and D. Black-Schaffer and V.
Parikh and J. Park",
title = "An Energy-Efficient Processor Architecture for
Embedded Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "1",
pages = "29--32",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present an efficient programmable architecture for
compute-intensive embedded applications. The processor
architecture uses instruction registers to reduce the
cost of delivering instructions, and a hierarchical and
distributed data register organization to deliver data.
Instruction registers capture instruction reuse and
locality in inexpensive storage structures that are
located near the functional units. The data register
organization captures reuse and locality in different
levels of the hierarchy to reduce the cost of
delivering data. Exposed communication resources
eliminate pipeline registers and control logic, and
allow the compiler to schedule efficient instruction
and data movement. The architecture keeps a significant
fraction of instruction and data bandwidth local to the
functional units, which reduces the cost of supplying
instructions and data to large numbers of functional
units. This architecture achieves an energy efficiency
that is 23x greater than an embedded RISC processor.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Communication system control; compute-intensive
embedded applications; Computer applications; computer
architecture; Computer architecture; Costs; data
movement; distributed data register organization;
Embedded computing; embedded RISC processor; Embedded
system; embedded systems; Energy efficiency;
energy-efficient processor architecture; hierarchical
organization; inexpensive storage structures;
instruction registers; instruction sets; Logic; Mobile
processors; pipeline processing; pipeline registers;
Pipelines; Registers",
}
@Article{Anonymous:2008:FC,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c1--c1",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Presents the front cover for this issue of the
publication.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2008:EBC,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c2--c2",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Pao:2008:PAM,
author = "D. Pao and W. Lin and B. Liu",
title = "Pipelined Architecture for Multi-String Matching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "33--36",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This letter presents a new oblivious routing algorithm
for 3D mesh networks called randomized
partially-minimal (RPM) routing that provably achieves
optimal worst- case throughput for 3D meshes when the
network radix fc is even and within a factor of 1/k2 of
optimal when k is odd. Although this optimality result
has been achieved with the minimal routing algorithm
OITURN for the 2D case, the worst-case throughput of
OITURN degrades tremendously in higher dimensions.
Other existing routing algorithms suffer from either
poor worst-case throughput (DOR, ROMM) or poor latency
(VAL). RPM on the other hand achieves near optimal
worst-case and good average-case throughput as well as
good latency performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D mesh networks; Automata; computer architecture;
Computer architecture; Computer science; Costs;
deterministic finite automaton; Hardware; Intrusion
detection; network intrusion detection; network radix;
OITURN; Partial response channels; pipelined
processing; Pipelines; randomized partially-minimal
routing; string matching; Table lookup;
three-dimensional mesh networks; Throughput",
}
@Article{Ramanujam:2008:RPM,
author = "R. Sunkam Ramanujam and B. Lin",
title = "Randomized Partially-Minimal Routing on
Three-Dimensional Mesh Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "37--40",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This letter presents a new oblivious routing algorithm
for 3D mesh networks called Randomized Partially-Minimal
(RPM) routing that provably achieves optimal worst-case
throughput for 3D meshes when the network radix $k$ is
even and within a factor of $ 1 / k^2 $ of optimal
when $k$ is odd. Although this optimality result has been
achieved with the minimal routing algorithm O1TURN [9]
for the 2D case, the worst-case throughput of O1TURN
degrades tremendously in higher dimensions. Other
existing routing algorithms suffer from either poor
worst-case throughput (DOR [10], ROMM [8]) or poor
latency (VAL [14]). RPM on the other hand achieves near
optimal worst-case and good average-case throughput as
well as good latency performance.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Degradation; Delay; Emerging technologies; Fabrics;
Interconnection architectures; Mesh networks; Network
communications; Network topology; On-chip
interconnection networks; Packet-switching networks;
Routing; Silicon; Technological innovation;
Telecommunication traffic; Throughput",
}
@Article{Black-Schaffer:2008:HIR,
author = "D. Black-Schaffer and J. Balfour and W. Dally and V.
Parikh and J. Park",
title = "Hierarchical Instruction Register Organization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "41--44",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper analyzes a range of architectures for
efficient delivery of VLIW instructions for embedded
media kernels. The analysis takes an efficient filter
cache as a baseline and examines the benefits from (1)
removing the tag overhead, (2) distributing the
storage, (3) adding indirection, (4) adding efficient
NOP generation, and (5) sharing instruction memory. The
result is a hierarchical instruction register
organization that provides a 56\% energy and 40\% area
savings over an already efficient filter cache.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; Cache storage; Computer aided
instruction; Computer architecture; Computer integrated
manufacturing; distributed shared memory systems;
Embedded computing; embedded media kernel; embedded
processor architecture; embedded systems; filter cache;
Filters; hierarchical instruction register
organization; Instruction fetch; instruction memory
sharing; instruction sets; Kernel; Laboratories;
Low-power design; NOP generation; parallel
architectures; Registers; RISC/CISC; VLIW; VLIW
architectures; VLIW instruction delivery",
}
@Article{Lee:2008:PDD,
author = "J. Lee and X. Xiao",
title = "A Parallel Deadlock Detection Algorithm with {$ O(1)
$} Overall Run-time Complexity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "45--48",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This article proposes a novel parallel,
hardware-oriented deadlock detection algorithm for
multiprocessor system-on-chips. The proposed algorithm
takes full advantage of hardware parallelism in
computation and maintains information needed by
deadlock detection through classifying all resource
allocation events and performing class specific
operations, which together make the overall run-time
complexity of the new method $ O(1) $. We implement the
proposed algorithm in Verilog HDL and demonstrate in
the simulation that each algorithm invocation takes at
most four clock cycles in hardware.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithms implemented in hardware; clock cycle;
Computational modeling; Concurrent computing;
Deadlocks; Detection algorithms; Event detection;
hardware description languages; Hardware design
languages; hardware-oriented deadlock detection;
Multiprocessing systems; multiprocessing systems;
multiprocessor system-on-chips; operating systems
(computers); parallel deadlock detection; Parallel
processing; Real-time and embedded systems; resource
allocation; Resource management; run-time complexity;
Runtime; System recovery; system-on-chip; Verilog HDL",
}
@Article{GomezRequena:2008:BFT,
author = "C. {Gomez Requena} and F. Gilabert Villamon and M.
Gomez and P. Lopez and J. Duato",
title = "Beyond Fat-tree: Unidirectional Load--Balanced
Multistage Interconnection Network",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "49--52",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
note = "See comment \cite{Antelo:2009:CBF}.",
abstract = "The fat-tree is one of the most widely-used topologies
by interconnection network manufacturers. Recently, it
has been demonstrated that a deterministic routing
algorithm that optimally balances the network traffic
can not only achieve almost the same performance as
an adaptive routing algorithm but can even outperform it.
On the other hand, fat-trees require a high number of
switches with a non-negligible wiring complexity. In
this paper, we propose replacing the fat-tree by a
unidirectional multistage interconnection network
(UMIN) that uses a traffic balancing deterministic
routing algorithm. As a consequence, the switch hardware is
almost halved, which in turn reduces the power consumption,
the arbitration complexity, the switch size itself, and the
network cost. Preliminary
evaluation results show that the UMIN with the load
balancing scheme obtains lower latency than fat-tree
for low and medium traffic loads. Furthermore, in
networks with a high number of stages or with high
radix switches, it obtains the same, or even higher,
throughput than fat-tree.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adaptive routing algorithm; Butterfly Network;
computational complexity; Cost-efficiency; Costs;
Deterministic Routing; Energy consumption; Fat-trees;
Hardware; interconnection network manufacturers;
Manufacturing; Multiprocessor interconnection networks;
Multistage Interconnection Networks; Network
Architecture and Design; Network topology; network
traffic; nonnegligible wiring complexity; power
consumption; radix switches; Routing; Switches;
telecommunication network routing; telecommunication
switching; Telecommunication traffic; telecommunication
traffic; Traffic Balancing; traffic balancing
deterministic routing algorithm; trees (mathematics);
unidirectional load-balanced multistage interconnection
network; Wiring",
}
@Article{Li:2008:TAN,
author = "Z. Li and C. Zhu and L. Shang and R. Dick and Y. Sun",
title = "Transaction-Aware Network-on-Chip Resource
Reservation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "53--56",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Performance and scalability are critically-important
for on-chip interconnect in many-core
chip-multiprocessor systems. Packet-switched
interconnect fabric, widely viewed as the de facto
on-chip data communication backplane in the many-core
era, offers high throughput and excellent scalability.
However, these benefits come at the price of router
latency due to run-time multi-hop data buffering and
resource arbitration. The network accounts for a
majority of on-chip data transaction latency. In this
work, we propose dynamic in-network resource
reservation techniques to optimize run-time on-chip
data transactions. This idea is motivated by the need
to preserve existing abstraction and general-purpose
network performance while optimizing for
frequently-occurring network events such as data
transactions. Experimental studies using multithreaded
benchmarks demonstrate that the proposed techniques can
reduce on-chip data access latency by 28.4\% on average
in a 16-node system and 29.2\% on average in a 36-node
system.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Backplanes; buffer storage; Computer buffers; data
communication; Data communication; de facto on-chip
data communication backplane; Delay; dynamic in-network
resource reservation techniques; Fabrics;
frequently-occurring network events; Interconnection
architectures; Interconnections (Subsystems); many-core
chip-multiprocessor systems; multiprocessor
interconnection networks; Network-on-a-chip; on-chip
data transaction latency; On-chip interconnection
networks; packet switching; packet-switched
interconnect fabric; Parallel Architectures; resource
allocation; router latency; run-time multihop data
buffering; Runtime; Scalability; System-on-a-chip;
telecommunication network routing; Throughput;
transaction-aware network-on-chip resource
reservation",
}
@Article{Fide:2008:PUS,
author = "S. Fide and S. Jenks",
title = "Proactive Use of Shared {L3} Caches to Enhance Cache
Communications in Multi-Core Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "57--60",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "The software and hardware techniques to exploit the
potential of multi-core processors are falling behind,
even though the number of cores and cache levels per
chip is increasing rapidly. There is no explicit
communications support available, and hence inter-core
communications depend on cache coherence protocols,
resulting in demand-based cache line transfers with
their inherent latency and overhead. In this paper, we
present software controlled eviction (SCE) to improve
the performance of multithreaded applications running
on multi-core processors by moving shared data to
shared cache levels before it is demanded from remote
private caches. Simulation results show that SCE offers
significant performance improvement (8--28\%) and
reduces L3 cache misses by 88--98\%.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache coherence protocol; cache communication; cache
storage; Concurrent computing; Control systems;
Degradation; Delay; demand-based cache line transfer;
Hardware; intercore communications; microprocessor
chips; Multi-core/single-chip multiprocessors;
multi-threading; Multicore processing; multicore
processors; multithreaded application; Parallel
processing; Protocols; shared L3 cache; shared memory
systems; software controlled eviction; Software
performance; Support for multi-threaded execution",
}
@Article{Walter:2008:BBE,
author = "I. Walter and I. Cidon and A. Kolodny",
title = "{BENoC}: a Bus-Enhanced Network on-Chip for a Power
Efficient {CMP}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "61--64",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Network-on-chips (NoCs) outperform buses in terms of
scalability, parallelism and system modularity and
therefore are considered as the main interconnect
infrastructure in future chip multi-processor (CMP).
However, while NoCs are very efficient for delivering
high throughput point-to-point data from sources to
destinations, their multi-hop operation is too slow for
latency-sensitive signals. In addition, current NoCs
are inefficient for broadcast operations and
centralized control of CMP resources. Consequently,
state-of-the-art NoCs may not meet the needs of
future CMP systems. In this paper, the benefit of
adding a low latency, customized shared bus as an
internal part of the NoC architecture is explored.
BENoC (bus-enhanced network on-chip) possesses two main
advantages: First, the bus is inherently capable of
performing broadcast transmission in an efficient
manner. Second, the bus has lower and more predictable
propagation latency. In order to demonstrate the
potential benefit of the proposed architecture, an
analytical comparison of the power saving in BENoC
versus a standard NoC providing similar services is
presented. Then, simulation is used to evaluate BENoC
in a dynamic non-uniform cache access (DNUCA)
multiprocessor system.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "broadcast transmission; Broadcasting; bus-enhanced
network-on-chip; Centralized control; chip
multiprocessor; Delay; dynamic nonuniform cache access;
integrated circuit interconnections; interconnect
infrastructure; Interconnection architectures;
low-power electronics; microprocessor chips;
multiprocessing systems; Multiprocessing systems;
Multiprocessor interconnection networks;
Network-on-a-chip; network-on-chip; NoC; On-chip
interconnection networks; power efficient CMP; Power
system interconnection; propagation latency;
Scalability; system buses; System-on-a-chip;
Throughput",
}
@Article{Golander:2008:DDS,
author = "A. Golander and S. Weiss and R. Ronen",
title = "{DDMR}: Dynamic and Scalable Dual Modular Redundancy
with Short Validation Intervals",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "65--68",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "DMR (dual modular redundancy) was suggested for
increasing reliability. Classical DMR consists of pairs
of cores that check each other and are pre-connected
during manufacturing by dedicated links. In this paper
we introduce the dynamic dual modular redundancy (DDMR)
architecture. DDMR supports run-time scheduling of
redundant threads, which has significant benefits
relative to static binding. To allow dynamic pairing,
DDMR replaces the special links with a novel ring
architecture. DDMR uses short instruction sequences for
validation, smaller than the processor reorder buffer.
Such short sequences reduce latencies in parallel
programs and save resources needed to buffer
uncommitted data. DDMR scales with the number of cores
and may be used in large multicore architectures.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "buffer storage; DDMR; Delay; dynamic dual modular
redundancy; Job shop scheduling; Joining processes;
Manufacturing; Multi-core/single-chip multiprocessors;
multicore architectures; Multicore processing; parallel
architectures; parallel programs; processor reorder
buffer; processor scheduling; Processor scheduling;
Proposals; Redundancy; Redundant design; ring
architecture; run-time scheduling; scalable dual
modular redundancy; short validation intervals;
Transistors",
}
@Article{Anonymous:2008:IA,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c3--c3",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides instructions and guidelines to prospective
authors who wish to submit manuscripts.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2008:ICS,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover 4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "7",
number = "2",
pages = "c4--c4",
month = jul,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Fri Jun 21 05:49:19 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Ramanujam:2009:WRR,
author = "Rohit Sunkam Ramanujam and Bill Lin",
title = "Weighted Random Routing on Torus Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we introduce a new closed-form
oblivious routing algorithm called W2TURN that is
worst-case throughput optimal for 2D-torus networks.
W2TURN is based on a weighted random selection of paths
that contain at most two turns. In terms of average hop
count, W2TURN outperforms the best previously known
closed-form worst-case throughput optimal routing
algorithm called IVAL [7]. In addition, we present a
new optimal weighted random routing algorithm for rings
called WRD.",
acknowledgement = ack-nhfb,
affiliation = "Ramanujam, RS (Reprint Author), Univ Calif San Diego,
San Diego, CA 92103 USA. Ramanujam, Rohit Sunkam; Lin,
Bill, Univ Calif San Diego, San Diego, CA 92103 USA.",
author-email = "rsunkamr@ucsd.edu billlin@ucsd.edu",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "2D-torus networks; Algorithm design and analysis;
closed-form oblivious routing algorithm; Data
communications; Delay; Interconnection network;
internetworking; IVAL; latency; Measurement;
Multiprocessor interconnection networks;
Network-on-a-chip; oblivious routing; Oblivious
Routing; On-chip interconnection networks; optimal
weighted random routing algorithm; Routing; Runtime;
System recovery; telecommunication network routing;
throughput; Throughput; torus network; Torus Network;
W2TURN; weighted random path selection",
number-of-cited-references = "8",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009",
times-cited = "2",
unique-id = "Ramanujam:2009:WRR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Ahn:2009:MDE,
author = "Jung Ho Ahn and Jacob Leverich and Robert S. Schreiber
and Norman P. Jouppi",
title = "Multicore {DIMM}: an Energy Efficient Memory Module
with Independently Controlled {DRAMs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2008.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Demand for memory capacity and bandwidth keeps
increasing rapidly in modern computer systems, and
memory power consumption is becoming a considerable
portion of the system power budget. However, the
current DDR DIMM standard is not well suited to
effectively serve CMP memory requests from both a power
and performance perspective. We propose a new memory
module called a Multicore DIMM, where DRAM chips are
grouped into multiple virtual memory devices, each of
which has its own data path and receives separate
commands (address and control signals). The Multicore
DIMM is designed to improve the energy efficiency of
memory systems with small impact on system performance.
Dividing each memory module into four virtual memory
devices brings a simultaneous 22\%, 7.6\%, and 18\%
improvement in memory power, IPC, and system
energy-delay product respectively on a set of
multithreaded applications and consolidated
workloads.",
acknowledgement = ack-nhfb,
affiliation = "Ahn, JH (Reprint Author), Hewlett Packard Labs,
Mississauga, ON, Canada. Ahn, Jung Ho; Schreiber,
Robert S.; Jouppi, Norman P., Hewlett Packard Labs,
Mississauga, ON, Canada. Leverich, Jacob, Stanford
Univ, Stanford, CA 94305 USA.",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; CMP memory requests; Control systems; DDR
DIMM standard; DRAM; DRAM chips; Energy consumption;
Energy efficiency; energy efficiency; energy efficient
memory module; Energy-aware systems; Error correction
codes; independently controlled DRAM; Jacobian
matrices; memory capacity; memory module; memory power
consumption; Memory Structures; memory system;
microprocessor chips; Multicore; multicore DIMM;
Multicore processing; Proposals; Random access memory;
System performance; system power budget; virtual memory
devices",
number-of-cited-references = "16",
ORCID-numbers = "Ahn, Jung Ho/0000-0003-1733-1394",
research-areas = "Computer Science",
researcherid-numbers = "Ahn, Jung Ho/D-1298-2013",
times-cited = "26",
unique-id = "Ahn:2009:MDE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2009:PST,
author = "Po-Han Wang and Yen-Ming Chen and Chia-Lin Yang and
Yu-Jung Cheng",
title = "A Predictive Shutdown Technique for {GPU} Shader
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As technology continues to shrink, reducing leakage is
critical to achieve energy efficiency. Previous works
on low-power GPU (Graphics Processing Unit) focus on
techniques for dynamic power reduction, such as DVFS
(Dynamic Voltage/Frequency Scaling) and clock gating.
In this paper, we explore the potential of adopting
architecture-level power gating techniques for leakage
reduction on GPU. In particular, we focus on the most
power-hungry components, shader processors. We observe
that, due to different scene complexity, the required
shader resources to satisfy the target frame rate
actually vary across frames. Therefore, we propose the
Predictive Shader Shutdown technique to exploit
workload variation across frames for leakage reduction
on shader processors. The experimental results show
that Predictive Shader Shutdown achieves up to 46\%
leakage reduction on shader processors with negligible
performance degradation.",
acknowledgement = ack-nhfb,
affiliation = "Wang, PH (Reprint Author), Natl Taiwan Univ, Dept Comp
Sci \& Informat Engn, Taipei 10764, Taiwan. Wang,
Po-Han; Chen, Yen-Ming; Yang, Chia-Lin, Natl Taiwan
Univ, Dept Comp Sci \& Informat Engn, Taipei 10764,
Taiwan. Cheng, Yu-Jung, Natl Taiwan Univ, Grad Inst
Networking \& Multimedia, Taipei 10764, Taiwan.",
author-email = "r96002@csie.ntu.edu.tw r95125@csie.ntu.edu.tw
yangc@csie.ntu.edu.tw d96944002@ntu.edu.tw",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Institute for Information Industry of
Taiwan [97-FS-C03]; National Taiwan University
[97R0062-05]",
funding-text = "This work was partially supported by the Institute for
Information Industry of Taiwan under project No.
97-FS-C03, and by the Excellent Research Projects of
National Taiwan University, 97R0062-05.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecture-level power gating techniques; Central
Processing Unit; Circuits; clock gating; Clocks;
computer architecture; computer graphic equipment;
Computer science; coprocessors; Degradation; dynamic
power reduction; Dynamic voltage scaling; dynamic
voltage-frequency scaling; Energy efficiency;
Energy-aware systems; Frequency; GPU; GPU shader
processors; Graphics; graphics processing unit; Layout;
leakage; Low-power design; power aware computing; power
gating; predictive shader shutdown technique",
number-of-cited-references = "15",
ORCID-numbers = "YANG, CHIA-LIN/0000-0003-0091-5027",
research-areas = "Computer Science",
times-cited = "10",
unique-id = "Wang:2009:PST",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Barnes:2009:XBA,
author = "Christopher Barnes and Pranav Vaidya and Jaehwan John
Lee",
title = "An {XML}-Based {ADL} Framework for Automatic
Generation of Multithreaded Computer Architecture
Simulators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Computer architecture simulation has always played a
pivotal role in continuous innovation of computers.
However, constructing or modifying a high quality
simulator is time consuming and error-prone. Thus,
often Architecture Description Languages (ADLs) are
used to provide an abstraction layer for describing the
computer architecture and automatically generating
corresponding simulators. Along this line of
research, we present a novel XML-based ADL, its
compiler, and a generation methodology to automatically
generate multithreaded simulators for computer
architecture. We utilize the industry-standard
extensible markup language XML to describe the
functionality and architecture of a modeled processor.
Our ADL framework allows users to easily and quickly
modify the structure, register set, and execution of a
modeled processor. To prove its validity, we have
generated several multithreaded simulators with
different configurations based on the MIPS five-stage
processor, and successfully tested them with two programs.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "IUPUI RSFG",
funding-text = "This research was funded by the IUPUI RSFG grant.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstraction layer; Architecture description languages;
automatic generation; C.0.d Modeling of computer
architecture; C.1.1.b Pipeline processors;
Computational modeling; computer architecture; Computer
architecture; Computer simulation; Concurrent
computing; extensible markup language-architecture
description language; Kernel; MIPS five-stage
processor; Modeling of computer architecture;
multi-threading; multithreaded computer architecture
simulator; Object oriented modeling; Pipeline
processors; Pipelines; program compilers; program
verification; Testing; validity testing; XML; XML-based
ADL framework",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Barnes:2009:XBA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Luque:2009:CAC,
author = "Carlos Luque and Miquel Moreto and Francisco J.
Cazorla and Roberto Gioiosa and Alper Buyuktosunoglu
and Mateo Valero",
title = "{CPU} Accounting in {CMP} Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Chip-MultiProcessors (CMP) introduce complexities when
accounting CPU utilization to processes, because the
progress made by a process during an interval of time
depends strongly on the activity of the other processes
it is co-scheduled with. We propose a new hardware
accounting mechanism to improve the accuracy when
measuring the CPU utilization in CMPs and compare it
with the previous accounting mechanisms. Our results
show that currently known mechanisms could lead to a
12\% average error when it comes to CPU utilization
accounting. Our proposal reduces this error to less
than 1\% in a modeled 4-core processor system.",
acknowledgement = ack-nhfb,
affiliation = "Luque, C (Reprint Author), Univ Politecn Cataluna,
E-08028 Barcelona, Spain. Luque, Carlos; Moreto,
Miquel; Valero, Mateo, Univ Politecn Cataluna, E-08028
Barcelona, Spain. Cazorla, Francisco J.; Valero, Mateo,
Barcelona Supercomp Ctr, Barcelona, Spain.",
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Ministry of Science and Technology of Spain
[TIN-2007-60625, BES-2008-003683, AP-2005-3318]; HiPEAC
Network of Excellence [IST-004408]; IBM Research; IBM
Deep Computing organizations",
funding-text = "This work has been supported by the Ministry of
Science and Technology of Spain under contract
TIN-2007-60625 and grants BES-2008-003683 and
AP-2005-3318, by the HiPEAC Network of Excellence
(IST-004408) and a Collaboration Agreement between IBM
and BSC with funds from IBM Research and IBM Deep
Computing organizations. The authors would like to
thank Pradip Bose and Chen-Yong Cher from IBM for their
technical support.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "4-core processor system; Bandwidth; Cache memory;
chip-multiprocessor architecture; Clocks; CMP processor
system; CPU utilization accounting; data center;
General; Hardware; hardware accounting mechanism;
Hardware/software interfaces; Kernel; microprocessor
chips; Multi-core/single-chip multiprocessors;
multiprocessing systems; operating system task
scheduling; Operating systems; process scheduling;
processor scheduling; Proposals; resource allocation;
Semiconductor device measurement; Switches",
number-of-cited-references = "11",
oa = "Green Published",
ORCID-numbers = "Moreto Planas, Miquel/0000-0002-9848-8758 Cazorla,
Francisco/0000-0002-3344-376X Luque,
Carlos/0000-0003-0442-0785 Valero,
Mateo/0000-0003-2917-2482 Gioiosa,
Roberto/0000-0001-9430-2656",
research-areas = "Computer Science",
researcherid-numbers = "Moreto Planas, Miquel/C-1823-2016 Cazorla,
Francisco/D-7261-2016 Luque, Carlos/E-2110-2019 Valero,
Mateo/L-5709-2014",
times-cited = "5",
unique-id = "Luque:2009:CAC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Soteriou:2009:HTD,
author = "Vassos Soteriou and Rohit Sunkam Ramanujam and Bill
Lin and Li-Shiuan Peh",
title = "A High-Throughput Distributed Shared-Buffer {NoC}
Router",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Microarchitectural configurations of buffers in
routers have a significant impact on the overall
performance of an on-chip network (NoC). This buffering
can be at the inputs or the outputs of a router,
corresponding to an input-buffered router (IBR) or an
output-buffered router (OBR). OBRs are attractive
because they have higher throughput and lower queuing
delays under high loads than IBRs. However, a direct
implementation of OBRs requires a router speedup equal
to the number of ports, making such a design
prohibitive given the aggressive clocking and power
budgets of most NoC applications. In this letter, we
propose a new router design that aims to emulate an OBR
practically based on a distributed shared-buffer (DSB)
router architecture. We introduce innovations to
address the unique constraints of NoCs, including
efficient pipelining and novel flow control. Our DSB
design can achieve significantly higher bandwidth at
saturation, with an improvement of up to 20\% when
compared to a state-of-the-art pipelined IBR with the
same amount of buffering, and our proposed
microarchitecture can achieve up to 94\% of the ideal
saturation throughput.",
acknowledgement = ack-nhfb,
affiliation = "Ramanujam, Rohit Sunkam; Lin, Bill, Univ Calif San
Diego, San Diego, CA 92103 USA. Peh, Li-Shiuan,
Princeton Univ, Princeton, NJ 08544 USA.",
author-email = "vassos.soteriou@cut.ac.cy rsunkamr@ucsd.edu
billlin@ucsd.edu peh@princeton.edu",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; buffer circuits; Clocks; Computer
architecture; configuration management; Delay;
distributed shared-buffer; Interconnection
architectures; Internet; microarchitectural
configurations; Microarchitecture; network routing;
Network-on-a-chip; network-on-chip; NoC router; On-chip
interconnection networks; output-buffered router;
Pipeline processing; router architecture; Router
micro-architecture; Technological innovation;
Throughput",
keywords-plus = "ARCHITECTURE",
number-of-cited-references = "16",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X Soteriou,
Vassos/0000-0002-2818-0459",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009 Soteriou,
Vassos/H-4603-2014",
times-cited = "15",
unique-id = "Soteriou:2009:HTD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Guz:2009:MCV,
author = "Zvika Guz and Evgeny Bolotin and Idit Keidar and
Avinoam Kolodny and Avi Mendelson and Uri C. Weiser",
title = "Many-Core vs. Many-Thread Machines: Stay Away From the
Valley",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We study the tradeoffs between Many-Core machines like
Intel's Larrabee and Many-Thread machines like Nvidia
and AMD GPGPUs. We define a unified model describing a
superposition of the two architectures, and use it to
identify operation zones for which each machine is more
suitable. Moreover, we identify an intermediate zone in
which both machines deliver inferior performance. We
study the shape of this ``performance valley'' and
provide insights on how it can be avoided.",
acknowledgement = ack-nhfb,
affiliation = "Guz, Z (Reprint Author), Technion Israel Inst Technol,
EE Dept, IL-32000 Haifa, Israel. Guz, Zvika; Keidar,
Idit; Kolodny, Avinoam; Weiser, Uri C., Technion Israel
Inst Technol, EE Dept, IL-32000 Haifa, Israel. Bolotin,
Evgeny, Intel Corp, Santa Clara, CA 95051 USA.
Mendelson, Avi, Microsoft Corp, Redmond, WA 98052
USA.",
author-email = "zguz@tx.technion.ac.il evgeny.bolotin@intel.com
idish@ee.technion.ac.il kolodny@ee.technion.ac.il
avim@microsoft.com uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "V17GC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Semiconductors Research Corporation (SRC);
Intel; Israeli Ministry of Science Knowledge Center on
Chip MultiProcessors",
funding-text = "We thank Ronny Ronen, Michael Behar, and Roni Rosner.
This work was partially supported by Semiconductors
Research Corporation (SRC), Intel, and the Israeli
Ministry of Science Knowledge Center on Chip
MultiProcessors.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AMD GPGPU; architecture superposition; Bandwidth; Chip
Multiprocessors; Computer Systems; coprocessors; Delay;
Engines; Equations; GPGPU; Graphics; Intel's
Larrabee; many-core machines; many-thread machines;
Multi-core/single-chip multiprocessors;
multi-threading; multiprocessing systems; Nvidia GPGPU;
Parallel Architectures; parallel architectures;
Parallel processing; performance valley; Processor
Architectures; Shape",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "27",
unique-id = "Guz:2009:MCV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Desai:2009:AIC,
author = "Aniruddha Desai and Jugdutt Singh",
title = "Architecture Independent Characterization of Embedded
{Java} Workloads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/java2000.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "This paper presents architecture independent
characterization of embedded Java workloads based on
the industry standard GrinderBench benchmark which
includes different classes of real world embedded Java
applications. This work is based on a custom built
embedded Java Virtual Machine (JVM) simulator
specifically designed for embedded JVM modeling and
embodies domain specific details such as thread
scheduling, algorithms used for native CLDC APIs and
runtime data structures optimized for use in embedded
systems. The results presented include dynamic
execution characteristics, dynamic bytecode instruction
mix, application and API workload distribution, Object
allocation statistics, instruction-set coverage, memory
usage statistics and method code and stack frame
characteristics.",
acknowledgement = ack-nhfb,
affiliation = "Desai, A (Reprint Author), La Trobe Univ, Bundoora,
Vic 3086, Australia. Desai, Aniruddha; Singh, Jugdutt,
La Trobe Univ, Bundoora, Vic 3086, Australia.",
author-email = "desai@ieee.org",
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; application program
interfaces; architecture independent characterization;
CLDC API; custom built embedded Java virtual machine
simulator; data structures; Data structures; Design
optimization; dynamic bytecode instruction mix; dynamic
execution characteristics; embedded Java workload;
Embedded Systems; embedded systems; Embedded Systems;
industry standard GrinderBench benchmark; instruction
sets; instruction-set coverage; Java; Java bytecode;
Job shop scheduling; JVM; memory usage statistics;
method code characteristics; multi-threading; object
allocation statistics; Runtime; runtime data structure;
scheduling; Scheduling algorithm; stack frame
characteristics; Statistical distributions; storage
allocation; thread scheduling; virtual machines;
Virtual machining; Workload Characterization",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Desai:2009:AIC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Antelo:2009:CBF,
author = "Elisardo Antelo",
title = "A Comment on {``Beyond Fat-tree: Unidirectional
Load-Balanced Multistage Interconnection Network''}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "33--34",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
note = "See \cite{GomezRequena:2008:BFT}.",
abstract = "A recent work proposed to simplify fat-trees with
adaptive routing by means of a load-balancing
deterministic routing algorithm. The resultant network
has performance figures comparable to the more complex
adaptive routing fat-trees when packets need to be
delivered in order. In a second work by the same
authors published in IEEE CAL, they propose to simplify
the fat-tree to a unidirectional multistage
interconnection network (UMIN), using the same
load-balancing deterministic routing algorithm. They
show that comparable performance figures are achieved
with much lower network complexity. In this comment we
show that the proposed load-balancing deterministic
routing is in fact the routing scheme used by the
butterfly network. Moreover, we show that the properties
of the simplified UMIN network proposed by them are
intrinsic to the standard butterfly and other existing
UMINs.",
acknowledgement = ack-nhfb,
affiliation = "Antelo, E (Reprint Author), Univ Santiago de
Compostela, Dept Elect \& Comp Sci, Santiago De
Compostela, Spain. Univ Santiago de Compostela, Dept
Elect \& Comp Sci, Santiago De Compostela, Spain.",
da = "2019-06-20",
doc-delivery-number = "V17GC",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adaptive routing fat-trees; Bismuth; butterfly
network; Computer science; deterministic algorithms;
fat-tree; hypercube networks; Interconnection networks;
Interconnections (Subsystems); load balancing
deterministic routing algorithm; Logic functions;
Multiprocessor interconnection networks; Multistage
Interconnection networks; network complexity; Network
topology; packets; resource allocation; Routing;
Switches; Technological innovation; Topology;
unidirectional load-balanced multistage interconnection
network; unidirectional multistage interconnection
network",
number-of-cited-references = "7",
ORCID-numbers = "Antelo, Elisardo/0000-0003-3743-3689",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Antelo:2009:CBF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
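Annotation: the routing scheme Antelo identifies is classical destination-tag
routing in a butterfly network. The short Python sketch below is an
illustration of that general scheme, not code from the comment: in a radix-k,
n-stage butterfly, the switch traversed at stage i simply uses the i-th base-k
digit of the destination address as its output port, which is what makes the
routing deterministic and oblivious to the source.

    def destination_tag_route(dest, radix, stages):
        """Output port used at each stage of a radix-`radix`, `stages`-stage
        butterfly for a packet addressed to terminal `dest`. Which end of the
        digit string stage 0 consumes depends on the wiring convention; here
        stage 0 uses the most significant digit."""
        digits = []
        for _ in range(stages):
            digits.append(dest % radix)
            dest //= radix
        return digits[::-1]

    # Example: an 8-terminal (2-ary, 3-stage) butterfly, destination 5 = 0b101
    print(destination_tag_route(5, 2, 3))   # -> [1, 0, 1]
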
@Article{Anonymous:2009:Aa,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "35--35",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.38",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:AIC,
author = "Anonymous",
title = "Ad --- {IEEE Computer Society Digital Library}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "36--36",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.39",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:EBCa,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.41",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:FCa,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.40",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:IAa,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.42",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.43",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Gaudiot:2009:INE,
author = "Jean-Luc Gaudiot",
title = "Introducing the New {Editor-in-Chief} of
{{\booktitle{IEEE Computer Architecture Letters}}}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "37--38",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.60",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Gaudiot:2009:INE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Skadron:2009:LE,
author = "K. Skadron",
title = "Letter from the {Editor}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "39--39",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.61",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2009:U,
author = "Kevin Skadron",
title = "Untitled",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "39--39",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.61",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2009:U",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Xin:2009:ELI,
author = "Jing Xin and Russ Joseph",
title = "Exploiting Locality to Improve Circuit-level Timing
Speculation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "40--43",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.50",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Circuit-level timing speculation has been proposed as
a technique to reduce dependence on design margins,
eliminating power and performance overheads. Recent
work has proposed microarchitectural methods to
dynamically detect and recover from timing errors in
processor logic. This work has not evaluated or
exploited the disparity of error rates at the level of
static instructions. In this paper, we demonstrate
pronounced locality in error rates at the level of
static instructions. We propose timing error prediction
to dynamically anticipate timing errors at the
instruction-level and reduce the costly recovery
penalty. This allows us to achieve 43.6\% power savings
compared to a baseline policy while incurring only a
6.9\% performance penalty.",
acknowledgement = ack-nhfb,
affiliation = "Xin, J (Reprint Author), Northwestern Univ, Evanston,
IL 60208 USA. Xin, Jing; Joseph, Russ, Northwestern
Univ, Evanston, IL 60208 USA.",
da = "2019-06-20",
doc-delivery-number = "V17GD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0644332, CNS-0720820]",
funding-text = "Manuscript submitted: 17-Sep-2009. Manuscript
accepted: 08-Oct-2009. Final manuscript received:
15-Oct-2009. We thank the anonymous reviewers for their
constructive feedback. This work was supported by NSF
awards CAREER CCF-0644332 and CNS-0720820.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Circuit faults; circuit reliability; circuit-level
timing speculation; Costs; Delay; Dynamic voltage
scaling; Error analysis; Error locality; Frequency;
Hardware; instruction sets; Logic; logic design;
low-power design; Low-power design; microarchitectural
methods; microprocessor chips; Pipelines; power
elimination; processor logic; reliability; Reliability;
static instruction level; Testing and Fault-Tolerance;
Timing; timing error prediction; timing speculation",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Xin:2009:ELI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
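Annotation: the letter's central observation is that timing-error rates
cluster on a small set of static instructions, which makes errors predictable
per program counter. The sketch below is a hypothetical illustration of
instruction-level timing-error prediction with a PC-indexed table of 2-bit
saturating counters; the table size, indexing, and threshold are assumptions
for illustration, not necessarily the mechanism evaluated in the paper.

    class TimingErrorPredictor:
        """Hypothetical PC-indexed table of 2-bit saturating counters."""

        def __init__(self, entries=1024, threshold=2):
            self.entries = entries
            self.threshold = threshold
            self.table = [0] * entries       # counter values in [0, 3]

        def _index(self, pc):
            return (pc >> 2) % self.entries  # assumes 4-byte instruction words

        def predict(self, pc):
            # True: expect a timing violation, so run this instruction
            # conservatively instead of paying the recovery penalty.
            return self.table[self._index(pc)] >= self.threshold

        def update(self, pc, error_observed):
            i = self._index(pc)
            if error_observed:
                self.table[i] = min(3, self.table[i] + 1)
            else:
                self.table[i] = max(0, self.table[i] - 1)
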
@Article{Sudarsanam:2009:PPD,
author = "Arvind Sudarsanam and Ramachandra Kallam and Aravind
Dasu",
title = "{PRR--PRR} Dynamic Relocation",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "44--47",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.49",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Partial bitstream relocation (PBR) on FPGAs has been
gaining attention in recent years as a potentially
promising technique to scale parallelism of accelerator
architectures at run time, enhance fault tolerance,
etc. PBR techniques to date have focused on reading
inactive bitstreams stored in memory, on-chip or
off-chip, whose contents are generated for a specific
partial reconfiguration region (PRR) and modified on
demand for configuration into a PRR at a different
location. As an alternative, we propose a PRR-PRR
relocation technique to generate source and destination
addresses, read the bitstream from an active PRR
(source) in a non-intrusive manner, and write it to
destination PRR. We describe two options of realizing
this on Xilinx Virtex 4 FPGAs: (a) hardware-based
accelerated relocation circuit (ARC) and (b) a software
solution executed on Microblaze. A comparative
performance analysis to highlight the speed-up obtained
using ARC is presented. For real test cases,
the performance of our implementations is compared to
the estimated performance of two state-of-the-art
methods.",
acknowledgement = ack-nhfb,
affiliation = "Sudarsanam, A (Reprint Author), Utah State Univ, Dept
Elect \& Comp Engn, Logan, UT 84321 USA. Sudarsanam,
Arvind; Kallam, Ramachandra; Dasu, Aravind, Utah State
Univ, Dept Elect \& Comp Engn, Logan, UT 84321 USA.",
author-email = "arvind.sudarsanam@aggiemail.usu.edu
ramachandra.kallam@aggiemail.usu.edu
dasu@engineering.usu.edu",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NASA; Micron Research Center",
funding-text = "Manuscript submitted: 03-Aug-2009. Manuscript
accepted: 16-Sep-2009. Final manuscript received:
24-Sep-2009. This work was supported by NASA and Micron
Research Center.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; Accelerator architectures; accelerator
architectures; Bioreactors; Circuits; destination
address; Emerging technologies; Fault tolerance; fault
tolerance; field programmable gate arrays; Field
programmable gate arrays; Filters; FPGAs; Hardware;
hardware-based accelerated relocation circuit; parallel
architecture; parallel architectures; Parallel
processing; partial bitstream relocation; Partial
dynamic reconfiguration; Partial dynamic relocation;
partial reconfiguration region; PBR techniques;
Performance analysis; Performance Analysis and Design
Aids; PRR-PRR dynamic relocation technique; PRR-PRR
relocation technique; Reconfigurable computing;
Reconfigurable hardware; source address; Xilinx Virtex
4 FPGA",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Sudarsanam:2009:PPD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Leverich:2009:PMD,
author = "Jacob Leverich and Matteo Monchiero and Vanish Talwar
and Partha Ranganathan and Christos Kozyrakis",
title = "Power Management of Datacenter Workloads Using
Per-Core Power Gating",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "48--51",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.46",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "While modern processors offer a wide spectrum of
software-controlled power modes, most datacenters only
rely on Dynamic Voltage and Frequency Scaling (DVFS,
a.k.a. P-states) to achieve energy efficiency. This
paper argues that, in the case of datacenter workloads,
DVFS is not the only option for processor power
management. We make the case for per-core power gating
(PCPG) as an additional power management knob for
multi-core processors. PCPG is the ability to cut the
voltage supply to selected cores, thus reducing to
almost zero the leakage power for the gated cores.
Using a testbed based on a commercial 4-core chip and a
set of real-world application traces from enterprise
environments, we have evaluated the potential of PCPG.
We show that PCPG can significantly reduce a
processor's energy consumption (up to 40\%) without
significant performance overheads. When compared to
DVFS, PCPG is highly effective, saving up to 30\% more
energy than DVFS. When DVFS and PCPG operate together,
they can save up to almost 60\%.",
acknowledgement = ack-nhfb,
affiliation = "Leverich, J (Reprint Author), Hewlett Packard Labs,
Mississauga, ON, Canada. Leverich, Jacob; Monchiero,
Matteo; Talwar, Vanish; Ranganathan, Partha, Hewlett
Packard Labs, Mississauga, ON, Canada. Leverich, Jacob;
Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
USA.",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; computer centres; Costs; data
center workloads; dynamic voltage and frequency
scaling; Dynamic voltage scaling; Energy consumption;
energy efficiency; Energy management; Energy-aware
systems; enterprise environments; Frequency;
integration and modeling; Jacobian matrices; leakage
power; microprocessor chips; Multicore processing;
multicore processors; per-core power gating; power
consumption; Power supplies; processor energy
consumption; processor power management;
software-controlled power modes; System architectures;
Testing",
number-of-cited-references = "10",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "43",
unique-id = "Leverich:2009:PMD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Musoll:2009:PVA,
author = "Enric Musoll",
title = "A Process-Variation Aware Technique for Tile-Based,
Massive Multicore Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "52--55",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.48",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Process variations in advanced nodes introduce
significant core-to-core performance differences in
single-chip multicore architectures. Isolating each
core with its own frequency and voltage island helps
improve the performance of the multi-core
architecture by operating at the highest frequency
possible rather than operating all the cores at the
frequency of the slowest core. However, inter-core
communication suffers from additional
cross-clock-domain latencies that can offset the
performance benefits. This work proposes the concept of
the configurable, variable-size frequency and voltage
domain, and describes it in the context of a
tile-based, massive multi-core architecture.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Clocks; computer architecture; Context;
cross-clock-domain latency; Delay; Frequency; intercore
communication; massive multi-core; massive multicore
processors; Multi-core/single-chip multiprocessors;
multicore architecture; Multicore processing;
Network-on-a-chip; network-on-chip; On-chip
interconnection networks; Performance gain; Process
design; process-variation aware architecture;
process-variation aware technique; Runtime; single-chip
multicore architectures; tile-base architecture;
tile-based multicore processors; variable-size
frequency domain; Voltage; voltage domain",
number-of-cited-references = "5",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Musoll:2009:PVA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Baldassin:2009:CEC,
author = "Alexandro Baldassin and Felipe Klein and Guido Araujo
and Rodolfo Azevedo and Paulo Centoducatte",
title = "Characterizing the Energy Consumption of Software
Transactional Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "56--59",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.47",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The well-known drawbacks imposed by lock-based
synchronization have forced researchers to devise new
alternatives for concurrent execution, of which
transactional memory is a promising one. Extensive
research has been carried out on Software Transaction
Memory (STM), most of all concentrated on program
performance, leaving unattended other metrics of great
importance like energy consumption. This letter
presents a thorough evaluation of energy consumption in
a state-of-the-art STM. We show that energy and
performance results do not always follow the same trend
and, therefore, it might be appropriate to consider
different strategies depending on the focus of the
optimization. We also introduce a novel strategy based
on dynamic voltage and frequency scaling for contention
managers, revealing important energy and energy-delay
product improvements in high-contended scenarios. This
work is a first study towards a better understanding of
the energy consumption behavior of STM systems, and
could prompt STM designers to research new
optimizations in this area, paving the way for an
energy-aware transactional memory.",
acknowledgement = ack-nhfb,
affiliation = "Baldassin, A (Reprint Author), Univ Estadual Campinas,
Inst Comp, Campinas, SP, Brazil. Baldassin, Alexandro;
Klein, Felipe; Araujo, Guido; Azevedo, Rodolfo;
Centoducatte, Paulo, Univ Estadual Campinas, Inst Comp,
Campinas, SP, Brazil.",
author-email = "alebal@ic.unicamp.br klein@ic.unicamp.br
guido@ic.unicamp.br rodolfo@ic.unicamp.br
ducatte@ic.unicamp.br",
da = "2019-06-20",
doc-delivery-number = "V17GD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "FAPESP [2005/02565-9]",
funding-text = "Manuscript submitted: 02-Jul-2009. Manuscript
accepted: 23-Jul-2009. Final manuscript received:
05-Aug-2009. This work was supported in part by FAPESP
(2005/02565-9).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Concurrent computing; Concurrent Programming; Content
management; Costs; Dynamic voltage scaling; Energy
Consumption; Energy consumption; energy consumption;
Energy management; Energy-aware systems; energy-delay
product improvements; frequency scaling; Frequency
synchronization; Hardware; lock-based synchronization;
Measurement techniques; Memory management;
multiprocessing systems; Multiprocessor Systems;
multiprocessor systems; Multiprocessor Systems;
Parallel Architectures; parallel architectures; Power
Management; Software performance; software
transactional memory; synchronisation; transaction
processing; Transactional Memory",
number-of-cited-references = "13",
ORCID-numbers = "Azevedo, Rodolfo/0000-0002-8803-0401",
research-areas = "Computer Science",
researcherid-numbers = "Azevedo, Rodolfo/F-3008-2012",
times-cited = "3",
unique-id = "Baldassin:2009:CEC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Balfour:2009:ORE,
author = "James Balfour and R. Curtis Harting and William J.
Dally",
title = "Operand Registers and Explicit Operand Forwarding",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "60--63",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.45",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Operand register files are small, inexpensive register
files that are integrated with function units in the
execute stage of the pipeline, effectively extending
the pipeline operand registers into register files.
Explicit operand forwarding lets software
opportunistically orchestrate the routing of operands
through the forwarding network to avoid writing
ephemeral values to registers. Both mechanisms let
software capture short-term reuse and locality close to
the function units, improving energy efficiency by
allowing a significant fraction of operands to be
delivered from inexpensive registers that are
integrated with the function units. An evaluation shows
that capturing operand bandwidth close to the function
units allows operand registers to reduce the energy
consumed in the register files and forwarding network
of an embedded processor by 61\%, and allows explicit
forwarding to reduce the energy consumed by 26\%.",
acknowledgement = ack-nhfb,
affiliation = "Balfour, J (Reprint Author), Stanford Univ, Comp Syst
Lab, Stanford, CA 94305 USA. Balfour, James; Harting,
R. Curtis; Dally, William J., Stanford Univ, Comp Syst
Lab, Stanford, CA 94305 USA.",
author-email = "jbalfour@cva.stanford.edu dally@cva.stanford.edu",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Code generation; Computer aided
instruction; Computer System Implementation; Computer
Systems Organizat; embedded processor; Energy capture;
energy consumption; energy efficient register
organization; explicit operand forwarding; explicit
operand forwarding network; Fixed-point arithmetic;
impact of technology trends; Impact of VLSI on system
design; Laboratories; Logic; low-power programmable
processors; Memory hierarchy; microprocessor chips;
operand bandwidth; operand register files; operand
registers; Optimization; Physically aware
micro-architecture: power; Pipelines; Real-time and
embedded systems; Registers; Routing; software
reusability; thermal; VLSI Systems; Writing",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Balfour:2009:ORE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Chiou:2009:AFF,
author = "Derek Chiou and Hari Angepat and Nikhil A. Patil and
Dam Sunwoo",
title = "Accurate Functional-First Multicore Simulators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "64--67",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.44",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Fast and accurate simulation of multicore systems
requires a parallelized simulator. This paper describes
a novel method to build parallelizable and
cycle-accurate-capable functional-first simulators of
multicore targets.",
acknowledgement = ack-nhfb,
affiliation = "Chiou, D (Reprint Author), Univ Texas Austin, Dept
Elect \& Comp Engn, Austin, TX 78712 USA. Chiou, Derek;
Angepat, Hari; Patil, Nikhil A.; Sunwoo, Dam, Univ
Texas Austin, Dept Elect \& Comp Engn, Austin, TX 78712
USA.",
author-email = "derek@ece.utexas.edu angepat@ece.utexas.edu
npatil@ece.utexas.edu sunwoo@ece.utexas.edu",
da = "2019-06-20",
doc-delivery-number = "V17GD",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [0615352,
0747438]",
funding-text = "This material is based upon work supported by the
National Science Foundation under Grants No. 0615352
and No. 0747438 and gifts from Intel and IBM. We thank
the anonymous reviewers for their comments.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "circuit simulation; Computational modeling; Computer
simulation; field programmable gate arrays;
FPGA-accelerated simulation technologies;
functional-first multicore simulators; Instruction
sets; integration and modeling; Microarchitecture;
Modeling and Visualization; Modeling of computer
architecture; Modeling techniques;
Multi-core/single-chip multiprocessors; Multicore
processing; multicore system simulation; Parallel;
Parallel Architectures; parallelized simulator;
Performance Analysis and Design Aids; Predictive
models; Simulation; Software prototyping; System
architectures; Timing; Virtual machining; Virtual
prototyping",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Chiou:2009:AFF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2009:Ab,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "68--68",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.52",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Ac,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "69--69",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.53",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Ad,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "70--70",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.55",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Ae,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "71--71",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.54",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:Af,
author = "Anonymous",
title = "{[Advertisement]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "72--72",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.51",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:EBCb,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.57",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:FCb,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.56",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:IAb,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.58",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2009:ICSb,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "8",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2009.59",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Patil:2010:URT,
author = "Shruti Patil and David J. Lilja",
title = "Using Resampling Techniques to Compute Confidence
Intervals for the Harmonic Mean of Rate-Based
Performance Metrics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Rate-based metrics such as floating point operations
per second, instructions per cycle and so forth are
commonly used to measure computer performance. In
addition to the average or mean performance of the
metric, indicating the precision of the mean using
confidence intervals helps to make informed decisions
and comparisons with the data. In this paper, we
discuss the determination of confidence intervals for
the harmonic mean of rate-based metrics using two
statistical resampling techniques, Jackknife and
Bootstrap. We show using Monte Carlo simulations that
resampling indeed works as expected, and can be used
for generating confidence intervals for the harmonic
mean.",
acknowledgement = ack-nhfb,
affiliation = "Patil, S (Reprint Author), Univ Minnesota Twin Cities,
Dept Elect \& Comp Engn, St Paul, MN USA. Patil,
Shruti; Lilja, David J., Univ Minnesota Twin Cities,
Dept Elect \& Comp Engn, St Paul, MN USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-0541162]",
funding-text = "This work was supported in part by the National
Science Foundation grant no. CCF-0541162. Any opinions,
findings and conclusions or recommendations expressed
in this material are those of the authors and do not
necessarily reflect the views of the NSF. The authors
also thank the University of Minnesota Statistical
Consulting Service for their helpful insights.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arithmetic; bootstrap; bootstrap technique; Cities and
towns; Computer errors; Computer performance; computer
performance measurement; Confidence intervals;
confidence intervals; Electric variables measurement;
Equations; floating point operations; Harmonic
analysis; harmonic mean; jackknife; jackknife
technique; Monte Carlo methods; Monte Carlo
simulations; Nonparametric statistics; Performance
analysis; performance evaluation; Performance of
Systems; Probability distribution; rate-based
performance metrics; resampling; statistical analysis;
statistical resampling techniques; Statistics",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Patil:2010:URT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
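Annotation: the sketch below illustrates the two resampling techniques the
letter applies to the harmonic mean of rate-based metrics: a percentile
bootstrap confidence interval and a jackknife standard-error estimate. The
sample IPC values, the number of bootstrap replicates, and the confidence
level are placeholders, not data from the paper.

    import random

    def harmonic_mean(xs):
        return len(xs) / sum(1.0 / x for x in xs)

    def bootstrap_ci(xs, stat=harmonic_mean, n_boot=10000, alpha=0.05, seed=0):
        """Percentile bootstrap: resample with replacement, recompute the
        statistic, and take the alpha/2 and 1-alpha/2 quantiles."""
        rng = random.Random(seed)
        reps = sorted(stat([rng.choice(xs) for _ in xs]) for _ in range(n_boot))
        return reps[int(alpha / 2 * n_boot)], reps[int((1 - alpha / 2) * n_boot) - 1]

    def jackknife_se(xs, stat=harmonic_mean):
        """Jackknife standard error: leave one observation out at a time."""
        n = len(xs)
        loo = [stat(xs[:i] + xs[i + 1:]) for i in range(n)]
        mean_loo = sum(loo) / n
        return ((n - 1) / n * sum((v - mean_loo) ** 2 for v in loo)) ** 0.5

    # Placeholder instructions-per-cycle measurements from eight benchmark runs
    ipc = [1.8, 2.1, 0.9, 1.4, 2.5, 1.1, 1.7, 2.0]
    print(harmonic_mean(ipc), bootstrap_ci(ipc), jackknife_se(ipc))
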
@Article{Seznec:2010:PCM,
author = "Andre Seznec",
title = "A Phase Change Memory as a Secure Main Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/prng.bib",
abstract = "Phase change memory (PCM) technology appears as more
scalable than DRAM technology. As PCM exhibits access
time slightly longer but in the same range as DRAMs,
several recent studies have proposed to use PCMs for
designing main memory systems. Unfortunately PCM
technology suffers from a limited write endurance;
typically each memory cell can only be written a
large but still limited number of times (10^7 to 10^9
writes are reported for current technology). Until now,
research proposals have essentially focused their
attention on designing memory systems that will survive
the average behavior of conventional applications.
However, PCM memory systems should be designed to
survive worst-case applications, i.e., malicious
attacks targeting the physical destruction of the
memory through overwriting a limited number of memory
cells.",
acknowledgement = ack-nhfb,
affiliation = "Seznec, A (Reprint Author), INRIA Rennes Bretagne
Atlantique, Ctr Rech, Campus Beaulieu, F-35042 Rennes,
France. INRIA Rennes Bretagne Atlantique, Ctr Rech,
F-35042 Rennes, France.",
author-email = "seznec@irisa.fr",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Commission [27648]",
funding-text = "This work was partially supported by the European
Commission in the context of the SARC integrated
project \#27648 (FP6).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application software; DRAM technology; Energy
consumption; memory cells; Memory Structures; PCM
memory systems; Phase change materials; phase change
memories; phase change memory; Phase change memory;
Physics computing; Proposals; Random access memory;
Random number generation; Random processes;
Scalability; secure PCM-based main memory;
Semiconductor Memories",
keywords-plus = "TECHNOLOGY",
number-of-cited-references = "8",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "17",
unique-id = "Seznec:2010:PCM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Park:2010:EIP,
author = "Seon-yeong Park and Euiseong Seo and Ji-Yong Shin and
Seungryoul Maeng and Joonwon Lee",
title = "Exploiting Internal Parallelism of Flash-based
{SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "For the last few years, the major driving force behind
the rapid performance improvement of SSDs has been the
increase in the number of parallel bus channels between a flash
controller and flash memory packages inside the
solid-state drives (SSDs). However, there are other
internal parallelisms inside SSDs yet to be explored.
In order to improve performance further by utilizing
the parallelism, this paper suggests request
rescheduling and dynamic write request mapping.
Simulation results with real workloads have shown that
the suggested schemes improve the performance of the
SSDs by up to 15\% without any additional hardware
support.",
acknowledgement = ack-nhfb,
affiliation = "Park, SY (Reprint Author), Korea Adv Inst Sci \&
Technol, Taejon, South Korea. Park, Seon-yeong; Shin,
Ji-Yong; Maeng, Seungryoul, Korea Adv Inst Sci \&
Technol, Taejon, South Korea. Seo, Euiseong, Ulsan Natl
Inst Sci \& Technol, Ulsan, South Korea. Lee, Joonwon,
Sungkyunkwan Univ, Seoul, South Korea.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Korea government(MEST) [2009-0080381]",
funding-text = "This work was supported by the Korea Science and
Engineering Foundation (KOSEF) grant funded by the
Korea government (MEST), (No. 2009-080381)",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Delay; Drives; exploiting internal parallelism; flash
based SSD; flash controller; flash memories; Flash
memory; flash memory packages; Force control; Hard
disks; I/O scheduling; Input/Output Devices; Packaging;
parallel bus channels; parallel processing; Parallel
systems; parallelism; pipeline processing; Pipeline
processing; Secondary storage; Simulation; Solid state
circuits; solid state drives; Solid-State Drives
(SSDs); Space technology; Storage Management; system
buses; Throughput",
number-of-cited-references = "6",
research-areas = "Computer Science",
researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
times-cited = "35",
unique-id = "Park:2010:EIP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
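
Beyond channel-level parallelism, the two proposals named in the Park et al. abstract (request rescheduling and dynamic write request mapping) both aim to keep otherwise idle flash resources busy. The sketch below shows only the second idea in a toy form, sending each incoming write to whichever channel currently has the shortest queue; the channel count and the queue model are assumptions, not details from the letter.

# Toy model of dynamic write-request mapping across flash channels
# (channel count and queue model are assumed).

class DynamicWriteMapper:
    def __init__(self, num_channels=4):
        self.queues = [[] for _ in range(num_channels)]

    def submit_write(self, logical_page, data):
        # Map the write to the channel with the shortest pending queue,
        # instead of a fixed (e.g., address-striped) channel.
        channel = min(range(len(self.queues)), key=lambda c: len(self.queues[c]))
        self.queues[channel].append((logical_page, data))
        return channel

if __name__ == "__main__":
    mapper = DynamicWriteMapper()
    placements = [mapper.submit_write(page, b"x") for page in range(6)]
    print(placements)        # writes spread across channels: [0, 1, 2, 3, 0, 1]
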
@Article{Subramoni:2010:ISI,
author = "Hari Subramoni and Fabrizio Petrini and Virat Agarwal
and Davide Pasetto",
title = "Intra-Socket and Inter-Socket Communication in
Multi-core Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The increasing computational and communication demands
of the scientific and industrial communities require a
clear understanding of the performance trade-offs
involved in multi-core computing platforms. Such
analysis can help application and toolkit developers in
designing better, topology aware, communication
primitives intended to suit the needs of various high
end computing applications. In this paper, we take on
the challenge of designing and implementing a portable
intra-core communication framework for streaming
computing and evaluate its performance on some popular
multi-core architectures developed by Intel, AMD and
Sun. Our experimental results, obtained on the Intel
Nehalem, AMD Opteron and Sun Niagara 2 platforms, show
that we are able to achieve an intra-socket small
message latency between 120 and 271 nanoseconds, while
the inter-socket small message latency is between 218
and 320 nanoseconds. The maximum intra-socket
communication bandwidth ranges from 0.179 (Sun Niagara
2) to 6.5 (Intel Nehalem) Gbytes/second. We were also
able to obtain an inter-socket communication
performance of 1.2 and 6.6 Gbytes/second on the AMD
Opteron and Intel Nehalem, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Subramoni, H (Reprint Author), IBM TJ Watson, Yorktown
Hts, NY 10598 USA. Subramoni, Hari; Petrini, Fabrizio;
Agarwal, Virat, IBM TJ Watson, Yorktown Hts, NY 10598
USA. Pasetto, Davide, IBM Computat Sci Ctr, Dublin,
Ireland. Subramoni, Hari, Ohio State Univ, Columbus, OH
43210 USA.",
author-email = "subramon@cse.ohio-state.edu fpetrin@us.ibm.com
viratagarwal@us.ibm.com pasetto\_davide@ie.ibm.com",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AMD Opteron; Bandwidth; Communication industry;
communication primitives; Communication Protocols;
Computer applications; Computer architecture; Computer
industry; Delay; General; Hardware; High Performance
Computing; industrial communities; Intel Nehalem;
intersocket communication; Intrasocket communication;
multicore architectures; Multicore Processors;
multicore systems; multiprocessing systems; parallel
architectures; Performance of Systems; Portable
computers; streaming computing; Sun; toolkit
developers; Topology; topology aware",
keywords-plus = "NETWORK",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Subramoni:2010:ISI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hoang:2010:CAN,
author = "Giang Hoang and Chang Bae and John Lange and Lide
Zhang and Peter Dinda and Russ Joseph",
title = "A Case for Alternative Nested Paging Models for
Virtualized Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Address translation often emerges as a critical
performance bottleneck for virtualized systems and has
recently been the impetus for hardware paging
mechanisms. These mechanisms apply similar translation
models for both guest and host address translations. We
make an important observation that the model employed
to translate from guest physical addresses (GPAs) to
host physical addresses (HPAs) is in fact orthogonal to
the model used to translate guest virtual addresses
(GVAs) to GPAs. Changing this model requires VMM
cooperation, but has no implications for guest OS
compatibility. As an example, we consider a hashed page
table approach for GPA -> HPA translation. Nested
paging, widely considered the most promising approach,
uses unhashed multi-level forward page tables for both
GVA -> GPA and GPA -> HPA translations, resulting in a
potential $O(n^2)$ page walk cost on a TLB miss, for
n-level page tables. In contrast, the hashed page table
approach results in an expected $O(n)$ cost. Our
simulation results show that when a hashed page table
is used in the nested level, the performance of the
memory system is not worse, and sometimes even better
than a nested forward-mapped page table due to reduced
page walks and cache pressure. This showcases the
potential for alternative paging mechanisms.",
acknowledgement = ack-nhfb,
affiliation = "Hoang, GA (Reprint Author), Northwestern Univ,
Evanston, IL 60208 USA. Hoang, Giang; Bae, Chang;
Lange, John; Dinda, Peter; Joseph, Russ, Northwestern
Univ, Evanston, IL 60208 USA. Zhang, Lide, Univ
Michigan, Ann Arbor, MI 48109 USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address translation; Computer Architecture; Computer
architecture; Computer Architecture; Computer displays;
Control systems; Costs; Emerging technologies; file
organisation; guest physical addresses; guest virtual
addresses; Hardware; hardware paging mechanisms;
Hardware/software interfaces; host physical addresses;
Instruction sets; Nested Paging; nested paging models;
Operating systems; OS compatibility; paged storage;
Platform virtualization; Software performance; storage
allocation; unhashed multilevel forward page tables;
virtual machine monitors; Virtual machine monitors;
virtual machines; Virtual Memory; Virtualization;
virtualized systems; VMM cooperation",
number-of-cited-references = "11",
research-areas = "Computer Science",
researcherid-numbers = "Joseph, Russell/B-7230-2009 Dinda,
Peter/B-7142-2009",
times-cited = "5",
unique-id = "Hoang:2010:CAN",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
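
The quadratic versus linear page-walk cost mentioned in the Hoang et al. abstract can be made concrete with a small counting model. The sketch below is a simplification, not the authors' simulator: it assumes an n-level guest page table, that in the nested forward-mapped case every guest-level pointer is itself translated by a full n-level host walk, and that a hashed GPA to HPA table resolves each translation in roughly one probe.

# Counting model for TLB-miss page-walk cost (illustrative assumptions only).

def nested_walk_refs(n):
    """Forward-mapped nested paging: each of the n guest levels plus the
    final guest physical address needs a full n-level host walk, so the
    cost grows roughly as O(n^2)."""
    return (n + 1) * n + n          # host-walk reads plus guest table reads

def hashed_nested_refs(n, probes=1):
    """Hashed GPA->HPA table: each guest level (and the final GPA) is
    resolved with ~1 hash probe, so the cost grows roughly as O(n)."""
    return n + (n + 1) * probes

if __name__ == "__main__":
    for levels in (2, 3, 4):
        print(levels, "levels:",
              nested_walk_refs(levels), "refs (nested forward-mapped) vs",
              hashed_nested_refs(levels), "refs (hashed nested)")
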
@Article{Krimer:2010:SNT,
author = "Evgeni Krimer and Robert Pawlowski and Mattan Erez and
Patrick Chiang",
title = "{Synctium}: a Near-Threshold Stream Processor for
Energy-Constrained Parallel Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "While Moore's law scaling continues to double
transistor density every technology generation, supply
voltage reduction has essentially stopped, increasing
both power density and total energy consumed in
conventional microprocessors. Therefore, future
processors will require an architecture that can: (a)
take advantage of the massive amount of transistors
that will be available; and (b) operate these
transistors in the near-threshold supply domain,
thereby achieving near optimal energy/computation by
balancing the leakage and dynamic energy consumption.
Unfortunately, this optimality is typically achieved
while running at very low frequencies (i.e.,
0.1--10MHz) and with only one computation executing per
cycle, such that performance is limited. Further,
near-threshold designs suffer from severe process
variability that can introduce extremely large delay
variations. In this paper, we propose a
near-energy-optimal stream processor family that relies on
massively parallel, near-threshold VLSI circuits and
interconnect, incorporating cooperative
circuit/architecture techniques to tolerate the
expected large delay variations. Initial estimations
from circuit simulations show that it is possible to
achieve greater than 1 Giga-Operations per second
(1GOP/s) with less than 1mW total power consumption,
enabling a new class of energy-constrained,
high-throughput computing applications.",
acknowledgement = ack-nhfb,
affiliation = "Krimer, E (Reprint Author), UT Austin, ECE, Austin, TX
USA. Krimer, Evgeni; Erez, Mattan, UT Austin, ECE,
Austin, TX USA. Pawlowski, Robert; Chiang, Patrick,
Oregon State Univ, EECS, Corvallis, OR 97331 USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Circuits; Computer architecture; conventional
microprocessors; Delay; double transistor density;
dynamic energy consumption; energy constrained parallel
applications; Energy consumption; etc.; Frequency;
impact of technology trends; Low-power design;
Microprocessors; Mobile processors; Moore's Law; near
threshold stream processor; optimisation; parallel
programming; Physically aware micro-architecture:
power; pipeline processing; Power generation; SIMD
processors; supply voltage reduction; Synctium;
thermal; Very large scale integration; VLSI circuits;
Voltage",
keywords-plus = "CIRCUITS; TOLERANCE; CMOS",
number-of-cited-references = "19",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "22",
unique-id = "Krimer:2010:SNT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hilton:2010:SDE,
author = "Andrew Hilton and Amir Roth",
title = "{SMT-Directory}: Efficient Load-Load Ordering for
{SMT}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Memory models like SC, TSO, and PC enforce load-load
ordering, requiring that loads from any single thread
appear to occur in program order to all other threads.
Out-of-order execution can violate load-load ordering.
Conventional multi-processors with out-of-order cores
detect load-load ordering violations by snooping an
age-ordered load queue on cache invalidations or
evictions, events that act as proxies for the completion
of remote stores. This mechanism becomes less efficient
in an SMT processor, as every completing store must
search the load queue segments of all other threads.
This inefficiency exists because store completions from
other threads in the same core are not filtered by the
cache and coherence protocol: thread 0 observes all of
thread 1's stores, not only the first store to every
cache line. SMT-Directory eliminates this overhead by
implementing the filtering traditionally provided by
the cache in the cache itself. SMT-Directory adds a
per-thread ``read'' bit to every data cache line. When
a load executes, it sets the bit corresponding to its
thread. When a store completes and writes to the cache,
it checks the SMT-Directory bits of its cache line and
searches the load queue segments only of those threads
whose bits are set. As a result, local store
completions trigger searches only for data that is
actually shared.",
acknowledgement = ack-nhfb,
affiliation = "Hilton, A (Reprint Author), Univ Penn, Philadelphia,
PA 19104 USA. Hilton, Andrew; Roth, Amir, Univ Penn,
Philadelphia, PA 19104 USA.",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0541292]",
funding-text = "We thank Arun Raghavan for the address traces and Milo
Martin for comments on early versions of this work. The
anonymous reviewers provided valuable feedback. This
work was supported by NSF award CCF-0541292.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "age-ordered load queue; Buffer storage; cache
invalidations; cache protocol; cache storage; coherence
protocol; consistency models; data cache line;
directory; Filtering; Load modeling; load queue search;
load queue segments; load-load ordering; Memory
hierarchy; multi-threading; multiprocessing systems;
Multithreaded processors; Multithreading; Out of order;
Protocols; Read-write memory; Simultaneous
multithreading; SMT processor; Surface-mount
technology; Writing",
keywords-plus = "CONSISTENCY",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Hilton:2010:SDE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
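
The SMT-Directory filter described in the Hilton--Roth abstract can be paraphrased in a few lines: each data cache line carries one ``read'' bit per hardware thread, a load sets its thread's bit, and a completing store searches only the load-queue segments of threads whose bits are set. The Python sketch below is a behavioral paraphrase of that description, not the authors' hardware design; the data structures, the thread count, and the choice to skip the storing thread are assumptions.

# Behavioral sketch of per-thread "read" bits on cache lines (assumed structures).

NUM_THREADS = 4

class CacheLine:
    def __init__(self):
        self.read_bits = [False] * NUM_THREADS   # one "read" bit per SMT thread

class SMTDirectoryCache:
    def __init__(self):
        self.lines = {}                           # line address -> CacheLine
        self.load_queues = [[] for _ in range(NUM_THREADS)]

    def execute_load(self, tid, line_addr):
        line = self.lines.setdefault(line_addr, CacheLine())
        line.read_bits[tid] = True                # remember this thread read the line
        self.load_queues[tid].append(line_addr)

    def complete_store(self, storer_tid, line_addr):
        # Search only the load-queue segments of threads whose read bit is
        # set for this line (skipping the storing thread is an assumption).
        line = self.lines.get(line_addr)
        if line is None:
            return []
        return [t for t in range(NUM_THREADS)
                if t != storer_tid and line.read_bits[t]
                and line_addr in self.load_queues[t]]

if __name__ == "__main__":
    cache = SMTDirectoryCache()
    cache.execute_load(0, 0x100)
    cache.execute_load(2, 0x100)
    print(cache.complete_store(1, 0x100))         # -> [0, 2]; other segments skipped
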
@Article{Hammoud:2010:DPA,
author = "Mohammad Hammoud and Sangyeun Cho and Rami G. Melhem",
title = "A Dynamic Pressure-Aware Associative Placement
Strategy for Large Scale Chip Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes dynamic pressure-aware
associative placement (DPAP), a novel distributed cache
management scheme for large-scale chip multiprocessors.
Our work is motivated by the large non-uniform
distribution of memory accesses across cache sets in
different L2 banks. DPAP decouples the physical
locations of cache blocks from their addresses for the
sake of reducing misses caused by destructive
interferences. Temporal pressure at the on-chip
last-level cache is continuously collected at a group
(comprised of local cache sets) granularity, and
periodically recorded at the memory controller(s) to
guide the placement process. An incoming block is
consequently placed at a cache group that exhibits the
minimum pressure. Simulation results using a
full-system simulator demonstrate that DPAP outperforms
the baseline shared NUCA scheme by an average of 8.3\%
and by as much as 18.9\% for the benchmark programs we
examined. Furthermore, evaluations showed that DPAP
outperforms related cache designs.",
acknowledgement = ack-nhfb,
affiliation = "Hammoud, M (Reprint Author), Univ Pittsburgh, Dept
Comp Sci, Pittsburgh, PA 15260 USA. Hammoud, Mohammad;
Cho, Sangyeun; Melhem, Rami G., Univ Pittsburgh, Dept
Comp Sci, Pittsburgh, PA 15260 USA.",
author-email = "mhh@cs.pitt.edu cho@cs.pitt.edu melhem@cs.pitt.edu",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0952273]",
funding-text = "This work was supported in part by NSF grant
CCF-0952273.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Aggregate Cache Sets; Aggregates; Associative
Placement; cache storage; Chip Multiprocessors;
Computer architecture; Computer science; destructive
interferences; distributed cache management; DPAP;
dynamic pressure aware associative placement strategy;
Interference; large scale chip multiprocessors;
Large-scale systems; Local Cache Sets; memory access
distribution; memory controllers; microprocessor chips;
Network-on-a-chip; NUCA scheme; Pressure control;
Pressure-Aware Placement; Random access memory",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Hammoud:2010:DPA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
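
DPAP's placement decision, as summarized in the Hammoud et al. abstract, reduces to choosing the cache-set group with the minimum recorded pressure for each incoming block. The following sketch is only a schematic restatement of that policy; the group count, the pressure decay step, and the data structures are assumptions, not details from the letter.

# Schematic restatement of pressure-aware placement (assumed parameters).

class PressureAwarePlacer:
    def __init__(self, num_groups=16):
        self.pressure = [0] * num_groups   # periodically collected temporal pressure

    def record_access(self, group):
        self.pressure[group] += 1          # pressure grows with accesses to the group

    def decay(self):
        # Periodic halving so stale pressure does not dominate (an assumption).
        self.pressure = [p // 2 for p in self.pressure]

    def place_block(self):
        # An incoming block goes to the group currently under minimum pressure.
        return min(range(len(self.pressure)), key=lambda g: self.pressure[g])

if __name__ == "__main__":
    placer = PressureAwarePlacer(num_groups=4)
    for g in (0, 0, 1, 3, 0, 1):
        placer.record_access(g)
    print("place incoming block in group", placer.place_block())   # -> group 2
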
@Article{Kim:2010:LUC,
author = "Hyungjun Kim and Paul V. Gratz",
title = "Leveraging Unused Cache Block Words to Reduce Power in
{CMP} Interconnect",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power is of paramount importance in modern computer
system design. In particular, the cache interconnect in
future CMP designs is projected to consume up to half
of the system power for cache fills and spills [8].
Despite the power consumed by spills and fills, a
significant percentage of each cache line is unused
prior to eviction from the cache. If unused cache block
words can be identified, this information can be used
to improve CMP interconnect power and energy
consumption. We propose a new method of CMP
interconnect packet composition, leveraging unused data
to reduce power. These methods are well suited to
interconnection networks with high-bandwidth wires, and
do not require expensive multi-ported memory systems.
Assuming perfect prediction, our techniques achieve an
average of approximately 37\% savings in total dynamic
link power consumption. With our current best
prediction mechanism, our techniques reduce dynamic
power consumption by approximately 23\% on average.",
acknowledgement = ack-nhfb,
affiliation = "Kim, H (Reprint Author), Texas A\&M Univ, Dept Elect
\& Comp Engn, College Stn, TX 77843 USA. Kim, Hyungjun;
Gratz, Paul V., Texas A\&M Univ, Dept Elect \& Comp
Engn, College Stn, TX 77843 USA.",
author-email = "hyungjuk@tamu.edu pgratz@tamu.edu",
da = "2019-06-20",
doc-delivery-number = "731BP",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; cache fills; cache interconnect; Cache
memories; cache spills; cache storage; CMP
interconnect; computer system design; Delay; dynamic
power; Energy consumption; energy consumption; flit
encoding; integrated circuit design; Interconnection
architectures; Low-power design; memory system;
microprocessor chips; Multicore; Multiprocessor
interconnection networks; Network-on-a-chip; NoC; power
aware computing; Power engineering computing; power
reduction; Power system interconnection; Random access
memory; total dynamic link power consumption; unused
cache block words; Very large scale integration;
Wires",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2010:LUC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
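
The packet-composition idea in the Kim--Gratz abstract (send only the words of a cache block that are predicted to be used) can be pictured with a word-valid mask. The sketch below illustrates that idea under assumed parameters (64-byte blocks, 8-byte words); it is not the flit encoding or the prediction mechanism from the letter.

# Illustration: compose an interconnect payload from predicted-used words only.

WORDS_PER_BLOCK = 8     # assumed: 64-byte block, 8-byte words

def compose_packet(block_words, used_mask):
    """Pack only the words whose mask bit is set; the mask itself travels
    in the header so the receiver can re-expand the block."""
    assert len(block_words) == WORDS_PER_BLOCK
    payload = [w for i, w in enumerate(block_words) if used_mask >> i & 1]
    return used_mask, payload

def expand_packet(used_mask, payload, fill=0):
    words, it = [], iter(payload)
    for i in range(WORDS_PER_BLOCK):
        words.append(next(it) if used_mask >> i & 1 else fill)
    return words

if __name__ == "__main__":
    block = list(range(100, 108))
    mask, payload = compose_packet(block, used_mask=0b00010011)
    print(len(payload), "of", WORDS_PER_BLOCK, "words sent")   # 3 of 8
    print(expand_packet(mask, payload))
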
@Article{Anonymous:2010:EBCa,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:FCa,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:IAa,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2010:ELE,
author = "K. Skadron",
title = "Editorial: Letter from the {Editor-in-Chief}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "37--44",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2010:U,
author = "Kevin Skadron",
title = "Untitled",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "37--44",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2010:U",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Iqbal:2010:POS,
author = "Syed Muhammad Zeeshan Iqbal and Yuchen Liang and Hakan
Grahn",
title = "{ParMiBench} --- an Open-Source Benchmark for Embedded
Multiprocessor Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "45--48",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/gnu.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract =     "Multicore processors are the main computing platform
in laptops, desktops, and servers today, and are also
making their way into the embedded systems market. Using
benchmarks is a common approach to evaluate the
performance of a system. However, benchmarks for
embedded systems have so far been either targeted for a
uni-processor environment, e.g., MiBench, or have been
commercial, e.g., MultiBench by EEMBC. In this paper,
we propose and implement an open source benchmark,
ParMiBench, targeted for multiprocessor-based embedded
systems. ParMiBench consists of parallel
implementations of seven compute intensive algorithms
from the uni-processor benchmark suite MiBench. The
applications are selected from four domains: Automation
and Industry Control, Network, Office, and Security.",
acknowledgement = ack-nhfb,
affiliation = "Iqbal, SMZ (Reprint Author), Blekinge Inst Technol,
Sch Comp, SE-37179 Karlskrona, Sweden. Iqbal, Syed
Muhammad Zeeshan; Liang, Yuchen; Grahn, Hakan, Blekinge
Inst Technol, Sch Comp, SE-37179 Karlskrona, Sweden.",
author-email = "mzeeshan01@gmail.com yuchen9760@gmail.com
hakan.grahn@bth.se",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "benchmark testing; Benchmark testing; Concurrent
Programming; desktop; embedded multiprocessor system;
Embedded system; embedded system market; embedded
systems; intensive algorithm; laptop; Load management;
Multicore processing; multiprocessing systems;
Multiprocessor Systems; open-source benchmark; parallel
architectures; parallel implementation; ParMiBench;
Performance Evaluation; Performance evaluation;
Performance Evaluation; Program processors; public
domain software; Security; uniprocessor benchmark
suite",
number-of-cited-references = "9",
ORCID-numbers = "Grahn, Hakan/0000-0001-9947-1088",
research-areas = "Computer Science",
researcherid-numbers = "Grahn, Hakan/G-9720-2011",
times-cited = "32",
unique-id = "Iqbal:2010:POS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Fang:2010:BRP,
author = "Zhen Fang and Erik G. Hallnor and Bin Li and Michael
Leddige and Donglai Dai and Seung Eun Lee and Srihari
Makineni and Ravi Iyer",
title = "{Boomerang}: Reducing Power Consumption of Response
Packets in {NoCs} with Minimal Performance Impact",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Most power reduction mechanisms for NoC channel
buffers rely on on-demand wakeup to transition from a
low-power state to the active state. Two drawbacks of
on-demand wakeup limit its effectiveness: (1)
performance impact caused by wakeup delays, and (2)
energy and area cost of sleep circuitry itself. What
makes the problem harder to solve is that solutions to
either problem tend to exacerbate the other. For
example, faster wakeup from a power-gated state
requires greater charge/discharge current for the sleep
transistors while using nimbler sleep transistors
implies long wakeup delays. As a result, powerdowns
have to be conservatively prescribed, missing many
power-saving opportunities. We propose Boomerang, a
novel power-saving method that overcomes the above
drawbacks. Specifically, based on the observation that
a response is always preceded by a request, we let the
request trigger wakeup of the buffer that is to be used
by its response in the (near) future, instead of using
on-demand wakeups. Hiding the wakeup delay completely,
Boomerang allows us to employ aggressive sleep policies
and use low-cost power gating circuits on response
buffers.",
acknowledgement = ack-nhfb,
affiliation = "Fang, Z (Reprint Author), Intel Corp, Santa Clara, CA
95051 USA. Fang, Zhen; Hallnor, Erik G.; Li, Bin;
Leddige, Michael; Dai, Donglai; Makineni, Srihari;
Iyer, Ravi, Intel Corp, Santa Clara, CA 95051 USA. Lee,
Seung Eun, Seoul Natl Univ Sci \& Technol, Seoul, South
Korea.",
author-email = "zhen.fang@intel.com",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Boomerang; buffer circuits; charge-discharge current;
Delay; Interconnection networks; Leakage currents;
leakage power; low-cost power gating circuits;
low-power design; Mobile communication;
network-on-chip; nimbler sleep transistors; NoC channel
buffers; packet-switching networks; power aware
computing; power consumption reduction mechanism;
power-gated state; power-saving method; response
packets; Routing; Switches; System-on-a-chip;
Transistors; wakeup delay",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Fang:2010:BRP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
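
Boomerang's key observation, as stated in the Fang et al. abstract, is that an outgoing request can wake up the buffer its response will later need, hiding the wakeup latency instead of paying it on demand. The event-style sketch below paraphrases that observation with invented timing parameters; it is not the authors' circuit-level design.

# Timing paraphrase of request-triggered (Boomerang-style) buffer wakeup,
# with invented latencies.

WAKEUP_DELAY  = 4    # cycles to leave the power-gated state (assumed)
NETWORK_DELAY = 10   # request + service + response latency (assumed)

def on_demand_response_time(t_request):
    # Response arrives, then must wait for the buffer to wake up.
    t_response_arrives = t_request + NETWORK_DELAY
    return t_response_arrives + WAKEUP_DELAY

def boomerang_response_time(t_request):
    # The outgoing request schedules the wakeup of the response buffer,
    # so the buffer is already active when the response arrives.
    t_buffer_ready = t_request + WAKEUP_DELAY
    t_response_arrives = t_request + NETWORK_DELAY
    return max(t_buffer_ready, t_response_arrives)

if __name__ == "__main__":
    print("on-demand :", on_demand_response_time(0))   # 14 cycles
    print("boomerang :", boomerang_response_time(0))   # 10 cycles, delay hidden
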
@Article{Lyons:2010:ASF,
author = "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
and David Brooks",
title = "The Accelerator Store framework for high-performance,
low-power accelerator-based systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "53--56",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware acceleration can increase performance and
reduce energy consumption. To maximize these benefits,
accelerator-based systems that emphasize computation on
accelerators (rather than on general purpose cores)
should be used. We introduce the ``accelerator store,''
a structure for sharing memory between accelerators in
these accelerator-based systems. The accelerator store
simplifies accelerator I/O and reduces area by mapping
memory to accelerators when needed at runtime.
Preliminary results demonstrate a 30\% system area
reduction with no energy overhead and less than 1\%
performance overhead in contrast to conventional DMA
schemes.",
acknowledgement = ack-nhfb,
affiliation = "Lyons, MJ (Reprint Author), Harvard Univ, Sch Engn \&
Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael J.;
Brooks, David, Harvard Univ, Sch Engn \& Appl Sci,
Cambridge, MA 02138 USA.",
author-email = "mjlyons@eecs.harvard.edu mhempstead@coe.drexel.edu
guyeon@eecs.harvard.edu dbrooks@eecs.harvard.edu",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [IIS-0926148];
Gigascale Systems Research Center",
funding-text = "This material is based upon work supported by the
National Science Foundation under Grant No.
IIS-0926148. The authors acknowledge the support of the
Gigascale Systems Research Center, one of six research
centers funded under the Focus Center Research Program
(FCRP), a Semiconductor Research Corporation entity.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; accelerator store framework; energy
consumption; General; hardware acceleration;
Heterogeneous (hybrid) systems; high-performance
low-power accelerator-based system; low-power
electronics; memory architecture; Memory management;
memory mapping; memory sharing; Program processors;
Random access memory; Real time systems; Real-time and
embedded systems; shared memory systems; storage
management; Throughput; Transform coding",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "13",
unique-id = "Lyons:2010:ASF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
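
The accelerator store's core service, as the Lyons et al. abstract puts it, is mapping shared memory to accelerators when needed at runtime instead of giving each accelerator private buffers. The allocator sketch below is a software analogy of that idea with assumed sizes and an assumed handle scheme; it is not the hardware structure proposed in the letter.

# Software analogy of a shared "accelerator store" handing out memory
# regions to accelerators at runtime (sizes and handle scheme assumed).

class AcceleratorStore:
    def __init__(self, total_bytes):
        self.free = [(0, total_bytes)]      # list of (offset, size) free regions
        self.handles = {}                   # handle -> (offset, size)
        self.next_handle = 0

    def map(self, accel_name, size):
        """Give an accelerator a region of the shared store, if one fits."""
        for i, (off, sz) in enumerate(self.free):
            if sz >= size:
                self.free[i] = (off + size, sz - size)
                handle = (accel_name, self.next_handle)
                self.next_handle += 1
                self.handles[handle] = (off, size)
                return handle
        raise MemoryError("accelerator store exhausted")

    def unmap(self, handle):
        off, size = self.handles.pop(handle)
        self.free.append((off, size))       # returned region becomes reusable

if __name__ == "__main__":
    store = AcceleratorStore(total_bytes=64 * 1024)
    h_fft = store.map("fft", 16 * 1024)
    h_aes = store.map("aes", 8 * 1024)
    store.unmap(h_fft)                      # the FFT buffer can now be reused
    print(store.map("jpeg", 16 * 1024))
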
@Article{Manevich:2010:CAR,
author = "Ran Manevich and Israel Cidon and Avinoam Kolodny and
Isask'har Walter",
title = "Centralized Adaptive Routing for {NoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "57--60",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As the number of applications and programmable units
in CMPs and MPSoCs increases, the Network-on-Chip (NoC)
encounters diverse and time dependent traffic loads.
This trend motivates the introduction of NoC
load-balanced, adaptive routing mechanisms that achieve
higher throughput than traditional oblivious routing
schemes, which are perceived as better suited for
hardware implementations. However, an
efficient adaptive routing scheme should base its
decisions on the global state of the system rather than
on local or regional congestion signals as is common in
current adaptive routing schemes. In this paper we
introduce a novel paradigm of NoC centralized adaptive
routing, and a specific design for mesh topology. Our
scheme continuously monitors the global traffic load in
the network and modifies the routing of packets to
improve load balancing accordingly. In our specific
mesh-based design, XY or YX routes are adaptively
selected for each source-destination pair. We show that
while our implementation is scalable and lightweight in
hardware costs, it outperforms distributed adaptive
routing schemes in terms of load balancing and
throughput.",
acknowledgement = ack-nhfb,
affiliation = "Manevich, R (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Manevich, Ran; Cidon, Israel; Kolodny, Avinoam; Walter,
Isask'har, Technion Israel Inst Technol, Dept Elect
Engn, IL-32000 Haifa, Israel.",
author-email = "ranman@tx.technion.ac.il cidon@ee.technion.ac.il
kolodny@ee.technion.ac.il zigi@tx.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "adaptive routing; Adaptive systems; centralized
adaptive routing; Computer architecture; distributed
adaptive routing; global state; load balanced adaptive
routing; load balancing; Load control; Load management;
mesh based design; mesh topology; network on chip;
Network on Chip; network routing; Network-on-Chip;
network-on-chip; NoC; packet routing; programmable
unit; regional congestion signal; routing algorithms;
Routing protocols; Telecommunication traffic;
Throughput; time dependent traffic load",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Manevich:2010:CAR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
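
In the mesh design described in the Manevich et al. abstract, the central unit's job amounts to choosing XY or YX routing per source-destination pair from a global view of link load. The sketch below restates that choice for one pair on a small mesh; the load bookkeeping and the rule of picking the route with the smaller maximum link load are illustrative assumptions, not the letter's exact mechanism.

# Illustrative central XY/YX selection on a mesh, assuming a simple
# "smaller maximum link load" rule.

def xy_links(src, dst):
    # Route horizontally first, then vertically.
    (sx, sy), (dx, dy) = src, dst
    horiz = [((x, sy), (x + (1 if dx > sx else -1), sy))
             for x in range(sx, dx, 1 if dx > sx else -1)]
    vert = [((dx, y), (dx, y + (1 if dy > sy else -1)))
            for y in range(sy, dy, 1 if dy > sy else -1)]
    return horiz + vert

def yx_links(src, dst):
    # Route vertically first, then horizontally.
    (sx, sy), (dx, dy) = src, dst
    vert = [((sx, y), (sx, y + (1 if dy > sy else -1)))
            for y in range(sy, dy, 1 if dy > sy else -1)]
    horiz = [((x, dy), (x + (1 if dx > sx else -1), dy))
             for x in range(sx, dx, 1 if dx > sx else -1)]
    return vert + horiz

def choose_route(src, dst, link_load):
    """Pick XY or YX for this source-destination pair based on the
    globally monitored per-link load."""
    def worst(links):
        return max((link_load.get(l, 0) for l in links), default=0)
    return "XY" if worst(xy_links(src, dst)) <= worst(yx_links(src, dst)) else "YX"

if __name__ == "__main__":
    load = {((0, 0), (1, 0)): 9}                # a hot horizontal link on row 0
    print(choose_route((0, 0), (2, 2), load))   # -> "YX", avoiding the hot link
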
@Article{Zhang:2010:FCA,
author = "Meng Zhang and Alvin R. Lebeck and Daniel J. Sorin",
title = "Fractal Consistency: Architecting the Memory System to
Facilitate Verification",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "61--64",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "One of the most challenging problems in developing a
multicore processor is verifying that the design is
correct, and one of the most difficult aspects of
pre-silicon verification is verifying that the memory
system obeys the architecture's specified memory
consistency model. To simplify the process of
pre-silicon design verification, we propose a system
model called the Fractally Consistent Model (FCM). We
prove that systems that adhere to the FCM can be
verified to obey the memory consistency model in three
simple, scalable steps. The procedure for verifying FCM
systems contrasts sharply with the difficult,
non-scalable procedure required to verify non-FCM
systems. We show that FCM systems do not necessarily
sacrifice performance, compared to non-FCM systems,
despite being simpler to verify.",
acknowledgement = ack-nhfb,
affiliation = "Zhang, M (Reprint Author), Duke Univ, Dept Elect \&
Comp Engn, Durham, NC 27706 USA. Zhang, Meng; Sorin,
Daniel J., Duke Univ, Dept Elect \& Comp Engn, Durham,
NC 27706 USA. Lebeck, Alvin R., Duke Univ, Dept Comp
Sci, Durham, NC 27706 USA.",
da = "2019-06-20",
doc-delivery-number = "731BX",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-0702434,
CCF-0811290]",
funding-text = "This material is based upon work supported by the
National Science Foundation under grants CCF-0702434
and CCF-0811290.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arithmetic and Logic Structures; Coherence;
Computational modeling; Computer architecture; Computer
Reliability; Fault-Tolerance; FCM systems; Formal
verification; fractal consistent model; Fractals;
Hardware; Memory; memory architecture; Memory
Consistency; memory consistency model; Memory
hierarchy; memory system architecture;
Micro-architecture implementation considerations;
microprocessor chips; Multicore; multicore processor;
multiprocessing systems; Performance Analysis and
Design Aids; presilicon verification; Processor
Architectures; Protocols; Testing; Validation;
Verification",
number-of-cited-references = "10",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Zhang:2010:FCA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2010:AIT,
author = "Anonymous",
title = "Advertisement --- {{\booktitle{IEEE Transactions on
Computers}}} Celebrates 60 Years",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "65--65",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSb,
author = "Anonymous",
title = "2011 {IEEE Computer Society} Simulator Design
Competition",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "66--66",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ASS,
author = "Anonymous",
title = "Advertisement --- Special Student Offer",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "67--67",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ADY,
author = "Anonymous",
title = "Advertisement --- Distinguish Yourself With the
{CSDP}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "68--68",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:CPS,
author = "Anonymous",
title = "{Conference Proceedings Services (CPS)}
[advertisement]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "69--69",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSc,
author = "Anonymous",
title = "{IEEE Computer Society} Jobs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "70--70",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ASC,
author = "Anonymous",
title = "Advertisement --- Stay Connected to the {IEEE Computer
Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "71--71",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ACS,
author = "Anonymous",
title = "Advertisement --- {Computer Society Digital Library}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "72--72",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:EBCb,
author = "Anonymous",
title = "Editorial Board [Cover2]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:FCb,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:IAb,
author = "Anonymous",
title = "Information for authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2010:ICSd,
author = "Anonymous",
title = "{IEEE Computer Society} [Cover4]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "9",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2010.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2011:ELE,
author = "K. Skadron",
title = "Editorial: Letter from the {Editor-in-Chief}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "1--3",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2011:U,
author = "Kevin Skadron",
title = "Untitled",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "1--3",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2011:U",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Vandierendonck:2011:FMM,
author = "Hans Vandierendonck and Andre Seznec",
title = "Fairness Metrics for Multi-Threaded Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "4--7",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Multi-threaded processors execute multiple threads
concurrently in order to increase overall throughput.
It is well documented that multi-threading affects
per-thread performance but, more importantly, some
threads are affected more than others. This is
especially troublesome for multi-programmed workloads.
Fairness metrics measure whether all threads are
affected equally. However, defining equal treatment is
not straightforward. Several fairness metrics for
multi-threaded processors have been utilized in the
literature, although there does not seem to be a
consensus on which metric does the best job of measuring
fairness. This paper reviews the prevalent fairness
metrics and analyzes their main properties. Each metric
strikes a different trade-off between fairness in the
strict sense and throughput. We categorize the metrics
with respect to this property. Based on experimental
data for SMT processors, we suggest using the minimum
fairness metric in order to balance fairness and
throughput.",
acknowledgement = ack-nhfb,
affiliation = "Vandierendonck, H (Reprint Author), Univ Ghent, Dept
Elect \& Informat Syst, Ghent, Belgium. Vandierendonck,
Hans, Univ Ghent, Dept Elect \& Informat Syst, Ghent,
Belgium. Seznec, Andre, INRIA Rennes, Rennes, France.",
author-email = "hans.vandierendonck@elis.ugent.be
Andre.Seznec@inria.fr",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Correlation; fairness; fairness metrics; Harmonic
analysis; Instruction sets; measurement; Measurement;
multi-programming; Multi-threaded processors;
multi-threading; multiprocessing systems;
multiprogrammed workloads; multithreaded processors;
Parallel Architectures; Performance of Systems;
quality-of-service; resource allocation; SMT
processors; software metrics; System-on-a-chip;
Throughput",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "13",
unique-id = "Vandierendonck:2011:FMM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
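
The trade-off the Vandierendonck--Seznec abstract describes between strict fairness and throughput can be made tangible by computing a few commonly used quantities side by side. The sketch below works on per-thread normalized progress (multi-threaded IPC over single-threaded IPC) and reports the sum, the harmonic mean, and the minimum across threads; the IPC numbers are invented, and this is a generic illustration of such metrics, not a reproduction of the letter's definitions or its recommendation.

# Generic illustration of fairness-style metrics over normalized progress
# (invented IPC numbers; not the paper's exact definitions).

def normalized_progress(ipc_smt, ipc_alone):
    return [m / s for m, s in zip(ipc_smt, ipc_alone)]

def metrics(np_values):
    n = len(np_values)
    return {
        "sum (throughput-oriented)": sum(np_values),
        "harmonic mean": n / sum(1.0 / v for v in np_values),
        "minimum (worst-treated thread)": min(np_values),
    }

if __name__ == "__main__":
    ipc_alone = [2.0, 1.5, 1.0]          # hypothetical single-threaded IPCs
    ipc_smt   = [1.2, 0.9, 0.3]          # hypothetical IPCs when co-scheduled
    np_values = normalized_progress(ipc_smt, ipc_alone)
    for name, value in metrics(np_values).items():
        print(name + ":", round(value, 3))
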
@Article{Tang:2011:PEM,
author = "Jie Tang and Shaoshan Liu and Zhimin Gu and Chen Liu
and Jean-Luc Gaudiot",
title = "Prefetching in Embedded Mobile Systems Can Be
Energy-Efficient",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "8--11",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Data prefetching has been a successful technique in
high-performance computing platforms. However, the
conventional wisdom is that it significantly increases
energy consumption and is thus not suitable for embedded
mobile systems. On the other hand, as modern mobile
applications pose an increasing demand for high
performance, it becomes essential to implement
high-performance techniques, such as prefetching, in
these systems. In this paper, we study the impact of
prefetching on the performance and energy consumption
of embedded mobile systems. Contrary to the
conventional wisdom, our findings demonstrate that as
technology advances, prefetching can be
energy-efficient while improving performance.
Furthermore, we have developed a simple but effective
analytical model to help system designers to identify
the conditions for energy efficiency.",
acknowledgement = ack-nhfb,
affiliation = "Tang, J (Reprint Author), Beijing Inst Technol,
Beijing 100081, Peoples R China. Tang, Jie; Gu, Zhimin,
Beijing Inst Technol, Beijing 100081, Peoples R China.
Liu, Shaoshan, Microsoft Corp, Redmond, WA 98052 USA.
Liu, Chen, Florida Int Univ, Miami, FL 33199 USA.
Gaudiot, Jean-Luc, Univ Calif Irvine, Irvine, CA USA.",
author-email = "tangjie.bit@gmail.com shaoliu@microsoft.com
zmgu@x263.net chen.liu@fiu.edu gaudiot@uci.edu",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "data prefetching; embedded mobile systems; embedded
systems; energy consumption; energy efficiency
condition; energy-efficient prefetching;
high-performance computing platform; Low power
electronics; Low-power design; Memory management;
Memory Structures; mobile computing; Mobile computing;
Mobile Computing; storage management",
number-of-cited-references = "11",
ORCID-numbers = "Liu, Chen/0000-0003-1558-6836",
research-areas = "Computer Science",
times-cited = "19",
unique-id = "Tang:2011:PEM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
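The letter above derives a simple analytical model for when prefetching is energy-efficient. The sketch below is only a hedged stand-in for such a break-even condition, not the authors' model: it assumes prefetching pays off when the static energy saved by finishing earlier exceeds the extra dynamic energy of prefetch traffic. The names (t_base, t_pref, p_static, e_extra_mem) and the numbers are illustrative.

# Minimal sketch (not the authors' model): a back-of-the-envelope break-even
# test for whether prefetching saves energy.  Assumed inputs:
#   t_base, t_pref   -- execution time without / with prefetching (s)
#   p_static         -- static (leakage) power of the platform (W)
#   e_extra_mem      -- extra dynamic energy of prefetch-induced memory traffic (J)
def prefetching_is_energy_efficient(t_base, t_pref, p_static, e_extra_mem):
    static_energy_saved = p_static * (t_base - t_pref)
    return static_energy_saved > e_extra_mem

# Example: a 0.2 s speedup on a 0.5 W platform saves 0.1 J of static energy,
# so prefetching pays off as long as the extra traffic costs less than that.
print(prefetching_is_energy_efficient(2.0, 1.8, 0.5, 0.06))   # True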
@Article{Khan:2011:DDC,
author = "Omer Khan and Mieszko Lis and Yildiz Sinangil and
Srinivas Devadas",
title = "{DCC}: a Dependable Cache Coherence Multicore
Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "12--15",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Cache coherence lies at the core of
functionally-correct operation of shared memory
multicores. Traditional directory-based hardware
coherence protocols scale to large core counts, but
they incorporate complex logic and directories to track
coherence states. Technology scaling has reached
miniaturization levels where manufacturing
imperfections, device unreliability and occurrence of
hard errors pose a serious dependability challenge.
Broken or degraded functionality of the coherence
protocol can lead to a non-operational processor or
user-visible performance loss. In this paper, we
propose a dependable cache coherence architecture (DCC)
that combines the traditional directory protocol with a
novel execution-migration-based architecture to ensure
dependability that is transparent to the programmer.
Our architecturally redundant execution migration
architecture only permits one copy of data to be cached
anywhere in the processor: when a thread accesses an
address not locally cached on the core it is executing
on, it migrates to the appropriate core and continues
execution there. Both coherence mechanisms can co-exist
in the DCC architecture and we present architectural
extensions to seamlessly transition between the
directory and execution migration protocols.",
acknowledgement = ack-nhfb,
affiliation = "Khan, O (Reprint Author), MIT, 77 Massachusetts Ave,
Cambridge, MA 02139 USA. Khan, Omer; Lis, Mieszko;
Sinangil, Yildiz; Devadas, Srinivas, MIT, Cambridge, MA
02139 USA. Khan, Omer, Univ Massachusetts, Lowell, MA
USA.",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecturally redundant execution migration
architecture; B.3.4 Reliability, Testing, and
Fault-Tolerance; B.8 Performance and Reliability;
broken functionality; C.4.b Fault tolerance; cache
coherence; cache storage; Coherence; coherence
mechanisms; coherence states; DCC architecture;
degraded functionality; dependability challenge;
Dependable architecture; dependable cache coherence
architecture; dependable cache coherence multicore
architecture; device unreliability; directory protocol;
directory-based hardware coherence protocols;
execution-migration-based architecture;
functionally-correct operation; Hardware; incorporate
complex logic; Instruction sets; large core counts;
manufacturing imperfections; memory architecture;
memory protocols; microprocessor chips; miniaturization
levels; Multicore processing; multicores;
nonoperational processor; Protocols; shared memory
multicores; shared memory systems; System-on-a-chip;
technology scaling; user visible performance loss",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Khan:2011:DDC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
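The execution-migration half of DCC keeps a single cached copy per address and moves the computation instead of the data. The toy sketch below illustrates only that decision; the static home mapping, core count, and counters are assumptions for illustration, not the paper's hardware.

# Minimal sketch (an illustration, not the DCC hardware): a thread touching a
# line that is not homed on its current core migrates to the line's home core
# instead of fetching the data, so only one cached copy ever exists.

NUM_CORES = 16
LINE_BYTES = 64

def home_core(addr):
    # Assumed static address-interleaved home mapping.
    return (addr // LINE_BYTES) % NUM_CORES

def access(thread, addr):
    h = home_core(addr)
    if thread["core"] != h:
        thread["core"] = h          # migrate the execution context to the home core
        thread["migrations"] += 1
    return thread["core"]           # the access now hits the only cached copy

t = {"core": 0, "migrations": 0}
for a in (0x0000, 0x0040, 0x0040, 0x1000):
    access(t, a)
print(t)   # {'core': 0, 'migrations': 2}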
@Article{Rosenfeld:2011:DCA,
author = "Paul Rosenfeld and Elliott Cooper-Balis and Bruce
Jacob",
title = "{DRAMSim2}: a Cycle Accurate Memory System Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "16--19",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper we present DRAMSim2, a cycle accurate
memory system simulator. The goal of DRAMSim2 is to be
an accurate and publicly available DDR2/3 memory system
model which can be used in both full system and
trace-based simulations. We describe the process of
validating DRAMSim2 timing against manufacturer Verilog
models in an effort to prove the accuracy of simulation
results. We outline the combination of DRAMSim2 with a
cycle-accurate x86 simulator that can be used to
perform full system simulations. Finally, we discuss
DRAMVis, a visualization tool that can be used to graph
and compare the results of DRAMSim2 simulations.",
acknowledgement = ack-nhfb,
affiliation = "Rosenfeld, P (Reprint Author), Univ Maryland, Dept
Elect \& Comp Engn, College Pk, MD 20742 USA.
Rosenfeld, Paul; Cooper-Balis, Elliott; Jacob, Bruce,
Univ Maryland, Dept Elect \& Comp Engn, College Pk, MD
20742 USA.",
author-email = "prosenf1@umd.edu ecc17@umd.edu blj@umd.edu",
da = "2019-06-20",
doc-delivery-number = "773ZN",
eissn = "1556-6064",
esi-highly-cited-paper = "Y",
esi-hot-paper = "N",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; cycle accurate memory system
simulator; DDR2/3 memory system model; DRAM; DRAM
chips; DRAMSim2 simulation; DRAMSim2 timing; Driver
circuits; Hardware design languages; Load modeling;
memory architecture; memory cards; Object oriented
modeling; Primary memory; Random access memory;
Simulation; Timing; trace-based simulation; Verilog
model; visualization tool",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "270",
unique-id = "Rosenfeld:2011:DCA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
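Simulators in the DRAMSim2 mold are driven by a per-cycle update loop with completion callbacks. The sketch below shows only the generic shape of such a trace driver against a toy fixed-latency model; the class and method names (ToyMemoryModel, add_transaction, update) are hypothetical and are not DRAMSim2's actual C++ API.

# Minimal sketch (hypothetical interface, not DRAMSim2's API): enqueue a
# transaction when its issue cycle arrives, tick the model once per cycle, and
# collect completion callbacks to observe request latency.

class ToyMemoryModel:
    FIXED_LATENCY = 40                      # stand-in for the real DDR timing machinery
    def __init__(self, on_complete):
        self.cycle, self.in_flight, self.on_complete = 0, [], on_complete
    def add_transaction(self, is_write, addr):
        self.in_flight.append((self.cycle + self.FIXED_LATENCY, is_write, addr))
    def update(self):                       # advance the model by one memory cycle
        self.cycle += 1
        done = [t for t in self.in_flight if t[0] <= self.cycle]
        self.in_flight = [t for t in self.in_flight if t[0] > self.cycle]
        for finish, is_write, addr in done:
            self.on_complete(is_write, addr, finish)

completions = []
mem = ToyMemoryModel(lambda w, a, c: completions.append(c))
trace = [(0, False, 0x100), (5, True, 0x200)]      # (issue cycle, is_write, address)
for cycle in range(100):
    while trace and trace[0][0] == cycle:
        _, is_write, addr = trace.pop(0)
        mem.add_transaction(is_write, addr)
    mem.update()
print(completions)   # [40, 45] -- completion cycles of the two requests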
@Article{Gou:2011:ESH,
author = "Chunyang Gou and Georgi N. Gaydadjiev",
title = "Exploiting {SPMD} Horizontal Locality",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "20--23",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this paper, we analyze a particular spatial
locality case (called horizontal locality) inherent to
manycore accelerator architectures employing barrel
execution of SPMD kernels, such as GPUs. We then
propose an adaptive memory access granularity framework
to exploit and enforce the horizontal locality in order
to reduce the interference among accelerator cores'
memory accesses and hence improve DRAM efficiency. With
the proposed technique, DRAM efficiency grows by 1.42X
on average, resulting in 12.3\% overall performance
gain, for a set of representative memory intensive
GPGPU applications.",
acknowledgement = ack-nhfb,
affiliation = "Gou, C (Reprint Author), Delft Univ Technol, NL-2600
AA Delft, Netherlands. Gou, Chunyang; Gaydadjiev,
Georgi N., Delft Univ Technol, NL-2600 AA Delft,
Netherlands.",
author-email = "c.gou@tudelft.nl g.n.gaydadjiev@tudelft.nl",
da = "2019-06-20",
doc-delivery-number = "773ZN",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerator core memory access; adaptive memory access
granularity; Bandwidth; barrel execution; DRAM chips;
DRAM efficiency; GPU; Graphics processing unit;
Instruction sets; interference; Kernel; manycore
accelerator architecture; Memory hierarchy;
microprocessor chips; Multi-core/single-chip
multiprocessors; parallel architectures; Pipelines;
Proposals; Random access memory; SIMD processors;
single program multiple data; spatial locality; SPMD
horizontal locality; SPMD kernel",
number-of-cited-references = "13",
ORCID-numbers = "Gaydadjiev, Georgi/0000-0002-3678-7007",
research-areas = "Computer Science",
researcherid-numbers = "Gaydadjiev, Georgi/F-1488-2010",
times-cited = "1",
unique-id = "Gou:2011:ESH",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2011:GGC,
author = "Xiaoqun Wang and Zhenzhou Ji and Chen Fu and Mingzeng
Hu",
title = "{GCMS}: a Global Contention Management Scheme in
Hardware Transactional Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "24--27",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware Transactional Memory (HTM) is a promising
Transactional Memory (TM) implementation because of its
strong atomicity and high performance. Unfortunately,
most contention management approaches in HTMs are
dedicated to specific transaction conflict scenarios
and it is hard to choose a universal strategy for
different workloads. In addition, HTM performance
degrades sharply when there are severe transaction
conflicts. In this paper, we present a Global
Contention Management Scheme (GCMS) to resolve severe
transaction conflicts in HTMs. Our scheme depends on a
Deadlock and Livelock Detection Mechanism (DLDM) and a
Global Contention Manager (GCM) to resolve severe
transaction conflicts. This scheme is orthogonal to the
rest of the contention management policies. We have
incorporated GCMS into different HTMs and compared the
performance of the enhanced systems with that of the
original HTMs using the STAMP benchmark suite. The
results demonstrate that the performance of the
enhanced HTMs is improved.",
acknowledgement = ack-nhfb,
affiliation = "Wang, XQ (Reprint Author), Harbin Inst Technol, Sch
Comp Sci, Harbin 150006, Peoples R China. Wang,
Xiaoqun; Ji, Zhenzhou; Fu, Chen; Hu, Mingzeng, Harbin
Inst Technol, Sch Comp Sci, Harbin 150006, Peoples R
China.",
author-email = "wxiaoqun@gmail.com",
da = "2019-06-20",
doc-delivery-number = "773ZN",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bioinformatics; Concurrent Programming; Contention
Management; deadlock-and-livelock detection mechanism;
GCMS scheme; Genomics; global contention management
scheme; global contention manager; Hardware; Hardware
Transactional Memory; hardware transactional memory;
Multi-core/single-chip multiprocessors; Multicore
Processors; Parallel Programming; Program processors;
Radiation detectors; storage management; System
recovery; transaction conflict; transaction
processing",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Wang:2011:GGC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
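GCMS resolves severe conflicts by escalating from the default policy to a global manager once livelock is suspected. The sketch below is an illustration of that escalation idea only, not the paper's DLDM/GCM hardware; the abort-count threshold and the oldest-transaction-wins rule are assumptions.

# Minimal sketch (illustrative only): count repeated aborts per transaction;
# once a transaction's abort count passes a threshold, a "global manager"
# serializes the conflict by letting the older transaction win.

LIVELOCK_THRESHOLD = 4    # assumed escalation threshold

def resolve_conflict(tx_a, tx_b):
    """Each tx is a dict with 'start' (timestamp) and 'aborts' (abort count)."""
    if max(tx_a["aborts"], tx_b["aborts"]) >= LIVELOCK_THRESHOLD:
        # Global contention manager: strictly favor the older transaction.
        winner, loser = (tx_a, tx_b) if tx_a["start"] <= tx_b["start"] else (tx_b, tx_a)
    else:
        # Local/default policy placeholder: the requester wins.
        winner, loser = tx_a, tx_b
    loser["aborts"] += 1
    return winner, loser

a = {"start": 10, "aborts": 5}
b = {"start": 3,  "aborts": 0}
w, l = resolve_conflict(a, b)
print(w is b)   # True -- past the threshold, the older transaction (b) wins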
@Article{Anonymous:2011:RL,
author = "Anonymous",
title = "2010 Reviewers List",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "28--28",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "IEEE publishing",
}
@Article{Anonymous:2011:AI,
author = "Anonymous",
title = "2010 Annual Index",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "??--??",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:Ca,
author = "Anonymous",
title = "Cover 2",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:Cb,
author = "Anonymous",
title = "Cover 3",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:Cc,
author = "Anonymous",
title = "Cover 4",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:FCa,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Mars:2011:HHW,
author = "Jason Mars and Lingjia Tang and Robert Hundt",
title = "Heterogeneity in {``Homogeneous''} Warehouse-Scale
Computers: a Performance Opportunity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "29--32",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The class of modern datacenters recently coined as
``warehouse scale computers'' (WSCs) has traditionally
been regarded as a homogeneous computing platform.
However, due to frequent machine replacements and
upgrades, modern WSCs are in fact composed of diverse
commodity microarchitectures and machine
configurations. Yet, current WSCs are designed with an
assumption of homogeneity, leaving a potentially
significant performance opportunity unexplored. In this
paper, we investigate the key factors impacting the
available heterogeneity in modern WSCs, and the benefit
of exploiting this heterogeneity to maximize overall
performance. We also introduce a new metric,
opportunity factor, which can be used to quantify an
application's sensitivity to the heterogeneity in a
given WSC. For applications that are sensitive to
heterogeneity, we observe a performance improvement of
up to 70\% when employing our approach. In a WSC
composed of state-of-the-art machines, we can improve
the overall performance of the entire datacenter by
16\% over the status quo.",
acknowledgement = ack-nhfb,
affiliation = "Mars, J (Reprint Author), Univ Virginia,
Charlottesville, VA 22903 USA. Mars, Jason; Tang,
Lingjia, Univ Virginia, Charlottesville, VA 22903
USA.",
author-email = "jom5x@cs.virginia.edu lt8f@cs.virginia.edu
rhundt@google.com",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; computer
centres; datacenters; Design studies; Distributed
architectures; diverse commodity microarchitectures;
Heterogeneous (hybrid) systems; homogeneous
warehouse-scale computers; integration and modeling;
machine configurations; mainframes; Microarchitecture;
Optimization; Scheduling and task partitioning; Super
(very large) computers; System architectures",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "22",
unique-id = "Mars:2011:HHW",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
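The letter introduces an "opportunity factor" to quantify how sensitive an application is to WSC heterogeneity. The sketch below is one plausible approximation (the headroom of the best platform over a mix-weighted expectation), not necessarily the letter's exact formula; the platform names and performance numbers are illustrative.

# Minimal sketch (an approximation, not necessarily the letter's formula):
# treat the opportunity factor as the gap between an application's best
# per-platform performance and the performance it would see under
# heterogeneity-oblivious placement across the WSC's machine mix.

def opportunity_factor(perf_by_platform, share_by_platform):
    """perf_by_platform: {platform: normalized performance}
       share_by_platform: {platform: fraction of machines}, fractions sum to 1."""
    expected = sum(perf_by_platform[p] * share_by_platform[p] for p in perf_by_platform)
    best = max(perf_by_platform.values())
    return (best - expected) / expected

perf  = {"clovertown": 1.00, "westmere": 1.70, "istanbul": 1.25}   # illustrative values
share = {"clovertown": 0.50, "westmere": 0.20, "istanbul": 0.30}
print(round(opportunity_factor(perf, share), 3))   # 0.399 -- heterogeneity-sensitive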
@Article{Michelogiannakis:2011:PCE,
author = "George Michelogiannakis and Nan Jiang and Daniel U.
Becker and William J. Dally",
title = "Packet Chaining: Efficient Single-Cycle Allocation for
On-Chip Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "33--36",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper introduces packet chaining, a simple and
effective method to increase allocator matching
efficiency and hence network performance, particularly
suited to networks with short packets and short cycle
times. Packet chaining operates by chaining packets
destined to the same output together, to reuse the
switch connection of a departing packet. This allows an
allocator to build up an efficient matching over a
number of cycles, like incremental allocation, but not
limited by packet length. For a 64-node 2D mesh at
maximum injection rate and with single-flit packets,
packet chaining increases network throughput by 15\%
compared to a conventional single-iteration separable
iSLIP allocator, outperforms a wavefront allocator, and
gives comparable throughput with an augmenting paths
allocator. Packet chaining achieves this performance
with a cycle time comparable to a single-iteration
separable allocator. Packet chaining also reduces
average network latency by 22.5\% compared to iSLIP.
Finally, packet chaining increases IPC up to 46\% (16\%
average) for application benchmarks because short
packets are critical in a typical cache-coherent CMP.
These are considerable improvements given the maturity
of network-on-chip routers and allocators.",
acknowledgement = ack-nhfb,
affiliation = "Michelogiannakis, G (Reprint Author), Stanford Univ,
Stanford, CA 94305 USA. Michelogiannakis, George;
Jiang, Nan; Becker, Daniel U.; Dally, William J.,
Stanford Univ, Stanford, CA 94305 USA.",
author-email = "mihelog@stanford.edu njiang37@stanford.edu
dub@stanford.edu dally@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-0702341];
National Security Agency [H98230-08-C-0272-P007];
Robert Bosch Fellowship; Prof. Michael Farmwald
Fellowship; Prof. Michael J. Flynn Stanford Graduate
Fellowship",
funding-text = "This work was supported in part by the National
Science Foundation under Grant CCF-0702341, in part by
the National Security Agency under Contract
H98230-08-C-0272-P007 and in part by the Robert Bosch,
Prof. Michael Farmwald and Prof. Michael J. Flynn
Stanford Graduate Fellowships.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "allocator matching efficiency; Benchmark testing;
Interconnection architectures; network performance;
network-on-chip; network-on-chip routers; On-chip
interconnection networks; on-chip networks; packet
chaining; Resource management; single-iteration
separable iSLIP allocator; System-on-a-chip;
Throughput",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Michelogiannakis:2011:PCE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
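Packet chaining reuses a departing packet's switch connection for a waiting packet headed to the same output, so the allocator does not have to re-arbitrate. The toy single-output model below illustrates only that chaining decision; it ignores flit-level timing, virtual channels, and the real allocator pipeline.

# Minimal sketch (a toy single-output model, not the paper's allocator RTL):
# keep the current input-to-output connection whenever the connected input has
# another packet for this output, and re-allocate only when it runs dry.

from collections import deque

def serve_output(queues_by_input):
    """queues_by_input: {input_port: deque of packets destined to this output}.
       Returns the per-cycle grant order for this single output port."""
    grants, connected = [], None
    while any(queues_by_input.values()):
        # Chain: reuse the connection if that input has another packet waiting.
        if connected is None or not queues_by_input[connected]:
            connected = next(i for i, q in queues_by_input.items() if q)  # re-allocate
        grants.append((connected, queues_by_input[connected].popleft()))
    return grants

qs = {0: deque(["A", "B"]), 1: deque(["C"])}
print(serve_output(qs))   # [(0, 'A'), (0, 'B'), (1, 'C')] -- B is chained behind A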
@Article{Ho:2011:EIB,
author = "Chen-Han Ho and Garret Staus and Aaron Ulmer and
Karthikeyan Sankaralingam",
title = "Exploring the Interaction Between Device Lifetime
Reliability and Security Vulnerabilities",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "37--40",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As technology scales, device reliability is becoming a
fundamental problem. Even though manufacturing test can
guarantee product quality, due to various types of
wear-out and failure modes, permanent faults appearing
in the field are becoming an increasingly important and
real problem. Such wear-out creates permanent faults in
devices during their lifetime, after release to the
user. In this paper, we perform a formal
investigation of the impact of permanent faults on
security, examine empirical evidence, and demonstrate a
real attack. Our results show that permanent stuck-at
faults may leave security holes in microprocessors. We
show that an adversary with knowledge of a fault can
launch attacks which can obtain critical secrets such
as a private key in 30 seconds.",
acknowledgement = ack-nhfb,
affiliation = "Ho, CH (Reprint Author), Univ Wisconsin, Madison, WI
53706 USA. Ho, Chen-Han; Staus, Garret; Ulmer, Aaron;
Sankaralingam, Karthikeyan, Univ Wisconsin, Madison, WI
53706 USA.",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arithmetic and Logic Structures; Circuit faults;
Computer bugs; Control Structures and Microprogramming;
Cryptography; device lifetime reliability; failure
mode; fault tolerant computing; Hardware reliability;
Logic programming; microprocessor chips;
microprocessors; Permanent Fault; permanent fault;
private key; product quality; Program processors;
public key cryptography; Reliability; Reliability
engineering; Security; security vulnerability; wear-out
type; wearout mode",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Ho:2011:EIB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Hernandez:2011:FTV,
author = "Carles Hernandez and Antoni Roca and Jose Flich and
Federico Silla and Jose Duato",
title = "Fault-Tolerant Vertical Link Design for Effective {3D}
Stacking",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "41--44",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Recently, 3D stacking has been proposed to alleviate
the memory bandwidth limitation arising in chip
multiprocessors (CMPs). As the number of integrated
cores in the chip increases, access to external
memory becomes the bottleneck, demanding larger
amounts of memory inside the chip. The most accepted
way to implement vertical links between stacked
dies is to use Through Silicon Vias (TSVs). However,
TSVs are exposed to misalignment and random defects
that compromise the yield of the manufactured 3D chip. A
common solution to this problem is
over-provisioning, which impacts area and cost. In
this paper, we propose a fault-tolerant vertical link
design. With its adoption, fault-tolerant vertical
links can be implemented in a 3D chip design at low
cost without the need to add redundant TSVs (no
over-provisioning). Preliminary results are very promising:
the fault-tolerant vertical link design increases
switch area by only 6.69\% while the achieved
interconnect yield approaches 100\%.",
acknowledgement = ack-nhfb,
affiliation = "Hernandez, C (Reprint Author), Univ Politecn Valencia,
C Cami de Vera S-N, Valencia 46022, Spain. Hernandez,
Carles; Roca, Antoni; Flich, Jose; Silla, Federico;
Duato, Jose, Univ Politecn Valencia, Valencia 46022,
Spain.",
author-email = "carherlu@gap.upv.es",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish MEC; MICINN; European Commission
[CSD2006-00046, TIN2009-14475-C04]; NaNoC [248972]",
funding-text = "This work was supported by the Spanish MEC and MICINN,
as well as European Commission FEDER funds, under
Grants CSD2006-00046 and TIN2009-14475-C04. It was also
partly supported by the project NaNoC (project label
248972) which is funded by the European Commission
within the Research Programme FP7.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D chip; 3D Stacking; 3D stacking; chip
multiprocessors; CMP; effective 3D stacking; external
memory; Fault Tolerance; fault tolerance; Fault
tolerant systems; fault-tolerant
vertical link design; memory bandwidth limitation;
Memory management; microprocessor chips;
network-on-chip; NoC; Stacking; storage management
chips; Three dimensional displays; three-dimensional
integrated circuits; through silicon vias; TSV",
number-of-cited-references = "20",
oa = "Green Published",
ORCID-numbers = "Silla, Federico/0000-0002-6435-1200 Hernandez,
Carles/0000-0001-5393-3195",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Hernandez:2011:FTV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Choi:2011:EID,
author = "Inseok Choi and Minshu Zhao and Xu Yang and Donald
Yeung",
title = "Experience with Improving Distributed Shared Cache
Performance on {Tilera}'s {Tile} Processor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "45--48",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes our experience with profiling and
optimizing physical locality for the distributed shared
cache (DSC) in Tilera's Tile multicore processor. Our
approach uses the Tile Processor's hardware performance
measurement counters (PMCs) to acquire page-level
access pattern profiles. A key problem we address is
imprecise PMC interrupts. Our profiling tools use
binary analysis to correct for interrupt ``skid'', thus
pinpointing individual memory operations that incur
remote DSC slice references and permitting us to sample
their access patterns. We use our access pattern
profiles to drive page homing optimizations for both
heap and static data objects. Our experiments show we
can improve physical locality for 5 out of 11 SPLASH2
benchmarks running on 32 cores, enabling 32.9\%--77.9\%
of DSC references to target the local DSC slice. To our
knowledge, this is the first work to demonstrate page
homing optimizations on a real system.",
acknowledgement = ack-nhfb,
affiliation = "Choi, I (Reprint Author), Univ Maryland, Dept Elect \&
Comp Engn, College Pk, MD 20742 USA. Choi, Inseok;
Zhao, Minshu; Yang, Xu; Yeung, Donald, Univ Maryland,
Dept Elect \& Comp Engn, College Pk, MD 20742 USA.",
author-email = "inseok@umd.edu mszhao@umd.edu yangxu@umd.edu
yeung@umd.edu",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; binary analysis; cache storage;
Computer architecture; Data streams; Design
methodology; Design studies; distributed shared cache
performance; hardware performance measurement counters;
microprocessor chips; Multi-core/single-chip
multiprocessors; Multicore processing; Multiple Data
Stream Architectures (Multiprocessors); multiprocessing
systems; Multiprocessing systems; page homing
optimization; page-level access pattern profile; PMC
interrupt; profiling tool; Tilera tile multicore
processor",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Choi:2011:EID",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Prieto:2011:MCM,
author = "Pablo Prieto and Valentin Puente and Jose-Angel
Gregorio",
title = "Multilevel Cache Modeling for Chip-Multiprocessor
Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "This paper presents a simple analytical model for
predicting on-chip cache hierarchy effectiveness in
chip multiprocessors (CMP) for a state-of-the-art
architecture. Given the complexity of this type of
systems, we use rough approximations, such as the
empirical observation that the re-reference timing
pattern follows a power law and the assumption of a
simplistic delay model for the cache, in order to
provide a useful model for the memory hierarchy
responsiveness. This model enables the analytical
determination of average access time, which makes
design space pruning useful before sweeping the vast
design space of this class of systems. The model is
also useful for predicting cache hierarchy behavior in
future systems. The fidelity of the model has been
validated using a state-of-the-art, full-system
simulation environment, on a system with up to sixteen
out-of-order processors with cache-coherent caches and
using a broad spectrum of applications, including
complex multithread workloads. This simple model can
predict a near-to-optimal, on-chip cache distribution
while also estimating how future systems running future
applications might behave.",
acknowledgement = ack-nhfb,
affiliation = "Prieto, P (Reprint Author), Univ Cantabria, Cantabria,
Spain. Prieto, Pablo; Puente, Valentin; Gregorio,
Jose-Angel, Univ Cantabria, Cantabria, Spain.",
author-email = "prietop@unican.es vpuente@unican.es
monaster@unican.es",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Science and Innovation
[TIN2010-18159]; HiPEAC2 European Network of
Excellence",
funding-text = "This work has been supported by the Spanish Ministry
of Science and Innovation, under contracts
TIN2010-18159, and by the HiPEAC2 European Network of
Excellence. The authors would like to thank the
reviewers for their valuable comments.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "approximation theory; cache hierarchy behavior
prediction; cache storage; Cache storage;
cache-coherent caches; chip-multiprocessor systems;
complex multithread workloads; Complexity theory;
Computational modeling; design space; integrated
circuit design; Memory hierarchy; memory hierarchy
responsiveness; microprocessor chips;
Multi-core/single-chip multiprocessors; multilevel
cache modeling; multiprocessing systems;
Multiprocessing systems; near-to-optimal on-chip cache
distribution; on-chip cache hierarchy effectiveness
prediction; power law; re-reference timing pattern;
rough approximations; simplistic delay model
assumption; Software tools; Thermal analysis; Thermal
sensors",
number-of-cited-references = "13",
ORCID-numbers = "Prieto, Pablo/0000-0002-5818-1188 Puente,
Valentin/0000-0002-6904-3282 Gregorio, Jose
Angel/0000-0003-2214-303X",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Prieto:2011:MCM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
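The cache model above rests on a power-law re-reference pattern and a simplistic delay model. The sketch below shows what such a model can look like under assumed parameters; m0, c0, alpha, and the latency numbers are placeholders, not the letter's fitted values.

# Minimal sketch (an approximation in the spirit of the letter, not its exact
# model): a power-law miss curve per cache level feeding a simple additive
# average-memory-access-time (AMAT) model for a two-level on-chip hierarchy.

def miss_ratio(capacity_kb, m0=0.10, c0_kb=32, alpha=0.5):
    # Assumed power-law fit: miss ratio m0 at reference capacity c0, decaying as C^-alpha.
    return m0 * (capacity_kb / c0_kb) ** (-alpha)

def amat(l1_kb, l2_kb, t_l1=2, t_l2=12, t_mem=200):
    m1, m2 = miss_ratio(l1_kb), miss_ratio(l2_kb)
    return t_l1 + m1 * (t_l2 + m2 * t_mem)

# Sweep L2 capacity to see where extra cache stops paying for itself.
for l2 in (256, 1024, 4096):
    print(l2, "KB L2 ->", round(amat(32, l2), 2), "cycles")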
@Article{Siozios:2011:SRT,
author = "Kostas Siozios and Dimitrios Rodopoulos and Dimitrios
Soudris",
title = "On Supporting Rapid Thermal Analysis",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "53--56",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Detailed thermal analysis is usually performed
exclusively at design time since it is a
computationally intensive task. In this paper, we
introduce a novel methodology for fast, yet accurate,
thermal analysis. The introduced methodology is
software supported by a new open source tool that
enables hierarchical thermal analysis with adaptive
levels of granularity. Experimental results demonstrate the
efficiency of our approach, since it reduces the
execution overhead by up to 70\% on average, with a
penalty in accuracy ranging between 2\% and 8\%.",
acknowledgement = ack-nhfb,
affiliation = "Siozios, K (Reprint Author), Natl Tech Univ Athens,
Sch ECE, GR-10682 Athens, Greece. Siozios, Kostas;
Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
Univ Athens, Sch ECE, GR-10682 Athens, Greece.",
da = "2019-06-20",
doc-delivery-number = "855NW",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Complexity theory; Computational modeling; Computer
Systems Organization; Design Methodologies; General;
Hardware; hierarchical thermal analysis; Modeling
techniques; Monitoring; open source tool; Performance
of Systems; Power Management; public domain software;
rapid thermal analysis; Reconfigurable Hardware;
Reconfigurable hardware; Reliability; software
engineering; software supported; Software tools;
thermal analysis; Thermal analysis; Thermal Monitoring;
Thermal sensors",
number-of-cited-references = "8",
ORCID-numbers = "Siozios, Kostas/0000-0002-0285-2202 Soudris,
Dimitrios/0000-0002-6930-6847",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/I-5252-2014 Siozios,
Kostas/F-9726-2011 Soudris, Dimitrios/O-8843-2019",
times-cited = "3",
unique-id = "Siozios:2011:SRT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2011:Cd,
author = "Anonymous",
title = "Cover 3",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:FCb,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:ICS,
author = "Anonymous",
title = "{IEEE Computer Society} [society information]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2011:PI,
author = "Anonymous",
title = "Publication information",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "10",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Sethumadhavan:2012:CHD,
author = "Simha Sethumadhavan and Ryan Roberts and Yannis
Tsividis",
title = "A Case for Hybrid Discrete-Continuous Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Current technology trends indicate that power- and
energy-efficiency will limit chip throughput in the
future. Current solutions to these problems, either in
the way of programmable or fixed-function digital
accelerators will soon reach their limits as
microarchitectural overheads are successively trimmed.
A significant departure from current computing methods
is required to carry forward computing advances beyond
digital accelerators. In this paper we describe how the
energy-efficiency of a large class of problems can be
improved by employing a hybrid of the discrete and
continuous models of computation instead of the
ubiquitous, traditional discrete model of computation.
We present preliminary analysis of domains and
benchmarks that can be accelerated with the new model.
Analysis shows that machine learning, physics and up to
one-third of the SPEC, RMS and Berkeley suites of
applications can be accelerated with the new hybrid
model.",
acknowledgement = ack-nhfb,
affiliation = "Sethumadhavan, S (Reprint Author), Columbia Univ, New
York, NY 10027 USA. Sethumadhavan, Simha; Roberts,
Ryan; Tsividis, Yannis, Columbia Univ, New York, NY
10027 USA.",
author-email = "simha@cs.columbia.edu",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "DARPA; AFRL [FA8750-10-2-0253,
FA9950-09-1-0389]; NSF",
funding-text = "Sethumadhavan's research is funded by grants from
DARPA, AFRL (FA8750-10-2-0253, FA9950-09-1-0389), the
NSF CAREER program, gifts from Microsoft Research and
Columbia University, and software donations from
Synopsys and Wind River. Roberts conducted this
research as a GRA in Sethumadhavan's Lab.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Approximation algorithms; Benchmark testing; Berkeley
suite; Computational modeling; Computer architecture;
computer architecture; computing methods; continuous models;
cryptography; Design studies; Differential equations;
discrete model; discrete models; domains analysis;
energy conservation; energy-efficiency; fixed-function
digital accelerators; forward computing advances;
hybrid discrete-continuous architectures; Hybrid
systems; machine learning; Mathematical model;
microarchitectural overheads; microprocessor chips;
power-efficiency; Processor architectures; RMS; SPEC;
Very large scale integration",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Sethumadhavan:2012:CHD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kong:2012:ASF,
author = "Ji Kong and Peilin Liu and Yu Zhang",
title = "Atomic Streaming: a Framework of On-Chip Data Supply
System for Task-Parallel {MPSoCs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "State of the art fabrication technology for
integrating numerous hardware resources such as
Processors/DSPs and memory arrays into a single chip
enables the emergence of Multiprocessor System-on-Chip
(MPSoC). Stream programming paradigm based on MPSoC is
highly efficient for single functionality scenario due
to its dedicated and predictable data supply system.
However, when memory traffic is heavily shared among
parallel tasks in applications with multiple
interrelated functionalities, performance suffers
through task interferences and shared memory
congestions which lead to poor parallel speedups and
memory bandwidth utilizations. This paper proposes a
framework of stream processing based on-chip data
supply system for task-parallel MPSoCs. In this
framework, stream address generations and data
computations are decoupled and parallelized to allow
full utilization of on-chip resources. Task
granularities are dynamically tuned to jointly optimize
the overall application performance. Experiments show
that the proposed framework as well as the tuning scheme
are effective for joint optimization in task-parallel
MPSoCs.",
acknowledgement = ack-nhfb,
affiliation = "Kong, J (Reprint Author), Shanghai Jiao Tong Univ, Sch
Elect Informat \& Elect Engn, Shanghai 200030, Peoples
R China. Kong, Ji; Liu, Peilin, Shanghai Jiao Tong
Univ, Sch Elect Informat \& Elect Engn, Shanghai
200030, Peoples R China.",
author-email = "johnhophen@sjtu.edu.cn liupeilin@sjtu.edu.cn
zhyu@cn.ibm.com",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "IBM Research-China under the IBM",
funding-text = "This work has been partially supported by IBM
Research-China under the IBM Ph.D. Fellowship program
for the 2010-2011 academic year.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application studies resulting in better
multiple-processor systems; atomic streaming;
Bandwidth; data computations; Memory hierarchy;
Multi-core/single-chip multiprocessors; Multicore
processing; Multiple Data Stream Architectures
(Multiprocessors); Multiprocessing systems;
multiprocessor system-on-chip; on-chip data supply
system; Prefetching; shared memory congestions; shared
memory systems; stream address generations; stream
programming paradigm; Streaming media;
System-on-a-chip; system-on-chip; task interferences;
task-parallel MPSoC; Throughput",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kong:2012:ASF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Deb:2012:HSC,
author = "Abhishek Deb and Josep Maria Codina and Antonio
Gonzalez",
title = "A {HW\slash SW} Co-designed Programmable Functional
Unit",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "In this paper, we propose a novel programmable
functional unit (PFU) to accelerate general purpose
application execution on a modern out-of-order x86
processor. Code is transformed and instructions are
generated that run on the PFU using a co-designed
virtual machine (Cd-VM). Results presented in this
paper show that this HW/SW co-designed approach
produces average performance speedups of 29\% in
SPECFP and 19\% in SPECINT, and up to 55\%, over a modern
out-of-order processor.",
acknowledgement = ack-nhfb,
affiliation = "Deb, A (Reprint Author), Univ Politecn Cataluna, C
Jordi Girona 1-3, Barcelona, Spain. Deb, Abhishek;
Gonzalez, Antonio, Univ Politecn Cataluna, Barcelona,
Spain. Maria Codina, Josep; Gonzalez, Antonio, Intel
Res Labs Barcelona, Barcelona, Spain.",
author-email = "abhishek@ac.upc.edu josep.m.codina@intel.com
antonio@intel.com",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; hardware-software codesign;
Hardware/software interfaces; hw/sw co-designed;
Interface states; Load modeling; Micro-architecture
implementation considerations; Microarchitecture;
Processor Architectures; programmable functional unit;
Programmable functional units; Registers; virtual
machine",
number-of-cited-references = "13",
ORCID-numbers = "Gonzalez, Antonio/0000-0002-0009-0996",
research-areas = "Computer Science",
researcherid-numbers = "Gonzalez, Antonio/I-2961-2014",
times-cited = "0",
unique-id = "Deb:2012:HSC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Piscitelli:2012:HLP,
author = "Roberta Piscitelli and Andy D. Pimentel",
title = "A High-Level Power Model for {MPSoC} on {FPGA}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper presents a framework for high-level power
estimation of multiprocessor systems-on-chip (MPSoC)
architectures on FPGA. The technique is based on
abstract execution profiles, called event signatures.
As a result, it is capable of achieving good evaluation
performance, thereby making the technique highly useful
in the context of early system-level design space
exploration. We have integrated the power estimation
technique in a system-level MPSoC synthesis framework.
Using this framework, we have designed a range of
different candidate MPSoC architectures and compared
our power estimation results to those from real
measurements on a Virtex-6 FPGA board.",
acknowledgement = ack-nhfb,
affiliation = "Piscitelli, R (Reprint Author), Univ Amsterdam, Inst
Informat, NL-1012 WX Amsterdam, Netherlands.
Piscitelli, Roberta; Pimentel, Andy D., Univ Amsterdam,
Inst Informat, NL-1012 WX Amsterdam, Netherlands.",
author-email = "r.piscitelli@uva.nl a.d.pimentel@uva.nl",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "MADNESS STREP",
funding-text = "This work has been partially supported by the MADNESS
STREP-FP7 European Project.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstract execution profiles; Computational modeling;
Computer architecture; Estimation; event signatures;
Field programmable gate arrays; field programmable gate
arrays; Formal models;
High-level power estimation; high-level power
estimation framework; high-level power model;
integrated circuit design; Mathematical model;
Microprocessors; MPSoC on FPGA; multiprocessing
systems; multiprocessor systems-on-chip architectures;
Performance Analysis and Design Aids; performance
evaluation; power aware computing; Power demand; power
estimation technique; Simulation; system-level design
space exploration; system-level MPSoC design space
exploration; system-level MPSoC synthesis framework;
system-on-chip; Virtex-6 FPGA board",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Piscitelli:2012:HLP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
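Event-signature power estimation multiplies abstract per-component event counts from a high-level simulation by per-event energy costs calibrated on the target platform. The sketch below is illustrative only; the event types and nanojoule costs in ENERGY_NJ are assumed placeholders, not measurements from the Virtex-6 board used in the letter.

# Minimal sketch (illustrative, not the paper's calibrated model): sum
# per-event energies weighted by the event signature, then divide by the
# execution time to obtain an average power estimate.

# Assumed per-event energy costs in nanojoules (placeholders, not measured values).
ENERGY_NJ = {"proc_instr": 0.8, "mem_access": 1.6, "noc_flit": 0.4}

def estimate_power(event_signature, exec_time_s):
    """event_signature: {event_type: count} gathered from a high-level simulation."""
    total_nj = sum(ENERGY_NJ[e] * n for e, n in event_signature.items())
    return total_nj * 1e-9 / exec_time_s          # watts

signature = {"proc_instr": 5_000_000, "mem_access": 800_000, "noc_flit": 300_000}
print(round(estimate_power(signature, exec_time_s=0.01), 3), "W")   # 0.54 W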
@Article{Finlayson:2012:OSP,
author = "Ian Finlayson and Gang-Ryung Uh and David Whalley and
Gary Tyson",
title = "An Overview of Static Pipelining",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A new generation of mobile applications requires
reduced energy consumption without sacrificing
execution performance. In this paper, we propose to
respond to these conflicting demands with an innovative
statically pipelined processor supported by an
optimizing compiler. The central idea of the approach
is that the control during each cycle for each portion
of the processor is explicitly represented in each
instruction. Thus the pipelining is in effect
statically determined by the compiler. The benefits of
this approach include simpler hardware and the ability
of the compiler to perform optimizations that are
not possible on traditional architectures. The initial
results indicate that static pipelining can
significantly reduce power consumption without
adversely affecting performance.",
acknowledgement = ack-nhfb,
affiliation = "Finlayson, I (Reprint Author), Florida State Univ,
Dept Comp Sci, Tallahassee, FL 32306 USA. Finlayson,
Ian; Whalley, David; Tyson, Gary, Florida State Univ,
Dept Comp Sci, Tallahassee, FL 32306 USA. Uh,
Gang-Ryung, Boise State Univ, Dept Comp Sci, Boise, ID
83725 USA.",
author-email = "finlayso@cs.fsu.edu uh@cs.boisestate.edu
whalley@cs.fsu.edu tyson@cs.fsu.edu",
da = "2019-06-20",
doc-delivery-number = "953VM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CNS-0964413, CNS-0915926]",
funding-text = "We thank the anonymous reviewers for their
constructive comments and suggestions. This research
was supported in part by NSF grants CNS-0964413 and
CNS-0915926.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; Energy
consumption; energy consumption reduction; execution
performance; General; mobile applications; optimising
compilers; Optimization; optimizing compiler; Pipeline
processing; pipeline processing; Pipeline processors;
power aware computing; Radio frequency; Registers;
statically pipelined processor",
number-of-cited-references = "14",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Finlayson:2012:OSP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wu:2012:CID,
author = "Lisa Wu and Martha A. Kim and Stephen A. Edwards",
title = "Cache Impacts of Datatype Acceleration",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware acceleration is a widely accepted solution
for performance and energy efficient computation
because it removes unnecessary hardware for general
computation while delivering exceptional performance
via specialized control paths and execution units. The
spectrum of accelerators available today ranges from
coarse-grain off-load engines such as GPUs to
fine-grain instruction set extensions such as SSE. This
research explores the benefits and challenges of
managing memory at the data-structure level and
exposing those operations directly to the ISA. We call
these instructions Abstract Datatype Instructions
(ADIs). This paper quantifies the performance and
energy impact of ADIs on the instruction and data cache
hierarchies. For instruction fetch, our measurements
indicate that ADIs can result in 21--48\% and 16--27\%
reductions in instruction fetch time and energy
respectively. For data delivery, we observe a 22--40\%
reduction in total data read/write time and 9--30\% in
total data read/write energy.",
acknowledgement = ack-nhfb,
affiliation = "Wu, L (Reprint Author), Columbia Univ, Dept Comp Sci,
New York, NY 10027 USA. Wu, Lisa; Kim, Martha A.;
Edwards, Stephen A., Columbia Univ, Dept Comp Sci, New
York, NY 10027 USA.",
author-email = "lisa@cs.columbia.edu martha@cs.columbia.edu
sedwards@cs.columbia.edu",
da = "2019-06-20",
doc-delivery-number = "953VM",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstract data types; abstract datatype instruction;
Accelerators; ADI; cache hierarchy; Cache Hierarchy;
cache hierarchy; Cache memories; cache storage; coarse
grain off-load engine; data read-write energy; data
structure level; Data Structures; energy conservation;
energy efficient computation; energy impact; execution
unit; fine grain instruction set extension; hardware
acceleration; Hardware acceleration; hardware
acceleration; Hardware/software interfaces; Instruction
fetch; instruction fetch energy; instruction fetch
time; Instruction Set Extensions; instruction sets;
ISA; Memory hierarchy; memory management; Memory
Structures; Multicore processing; power aware
computing; Program processors; Support vector machines;
Vectors",
number-of-cited-references = "15",
ORCID-numbers = "Edwards, Stephen/0000-0003-2609-4861",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Wu:2012:CID",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2012:RL,
author = "Anonymous",
title = "2011 Reviewers List",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "25--26",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Lists the reviewers who contributed to IEEE Computer
Architecture Letters in 2011.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "IEEE publishing",
}
@Article{Anonymous:2012:TNQ,
author = "Anonymous",
title = "There now is a quick and easy way to find out about
our collection of {{\booktitle{Transactions}}}
[Advertisement]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "26--26",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement: Visit http://www.computer.org/whats-new
today!",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:ACP,
author = "Anonymous",
title = "Advertisement --- {Conference Publishing Services
(CPS)}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "28--28",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "IEEE Conference Publishing Services (CPS)
advertisement.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:AI,
author = "Anonymous",
title = "2011 Annual Index",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "??--??",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This index covers all technical items --- papers,
correspondence, reviews, etc. --- that appeared in this
periodical during the year, and items from previous
years that were commented upon or corrected in this
year. Departments and other items may also be covered
if they have been judged to have archival value. The
Author Index contains the primary entry for each item,
listed under the first author's name. The primary entry
includes the co-authors' names, the title of the paper
or other item, and its location, specified by the
publication abbreviation, year, month, and inclusive
pagination. The Subject Index contains entries
describing the item under all appropriate subject
headings, plus the first author's name, the publication
abbreviation, month, and year, and inclusive pages.
Note that the item title is found only under the primary
entry in the Author Index.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Indexes",
}
@Article{Anonymous:2012:Ca,
author = "Anonymous",
title = "{[Cover2]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c2--c2",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:Cb,
author = "Anonymous",
title = "{[Cover3]}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c3--c3",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:FCT,
author = "Anonymous",
title = "[{Front} cover and table of contents]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c1--c1",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Presents the table of contents for this issue of the
periodical.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:ICS,
author = "Anonymous",
title = "{IEEE Computer Society} [Back cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "1",
pages = "c4--c4",
month = jan # "\slash " # jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Provides a listing of current committee members and
society officers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Davis:2012:IVL,
author = "John D. Davis and Suzanne Rivoire and Moises
Goldszmidt and Ehsan K. Ardestani",
title = "Including Variability in Large-Scale Cluster Power
Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "29--32",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Studying the energy efficiency of large-scale computer
systems requires models of the relationship between
resource utilization and power consumption. Prior work
on power modeling assumes that models built for a
single node will scale to larger groups of machines.
However, we find that inter-node variability in
homogeneous clusters leads to substantially different
models for different nodes. Moreover, ignoring this
variability will result in significant prediction
errors when scaled to the cluster level. We report on
inter-node variation for four homogeneous five-node
clusters using embedded, laptop, desktop, and server
processors. The variation is manifested quantitatively
in the prediction error and qualitatively in the
resource utilization variables (features) that are
deemed relevant for the models. These results
demonstrate the need to sample multiple machines in
order to produce accurate cluster models.",
acknowledgement = ack-nhfb,
affiliation = "Rivoire, Suzanne, Sonoma State Univ, Rohnert Pk, CA
94928 USA. Ardestani, Ehsan K., Univ CA, Santa Cruz, CA
USA.",
author-email = "john.d@microsoft.com suzanne.rivoire@sonoma.edu
moises@microsoft.com eka@soe.ucsc.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computational modeling; Data models; evaluation;
Measurement; modeling; Power demand; Power Management;
Power measurement; Predictive models; Radiation
detectors; Servers; simulation of multiple-processor
systems",
number-of-cited-references = "26",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Davis:2012:IVL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
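
As a rough illustration of the per-node modeling the abstract calls for,
the Python sketch below fits a separate least-squares power model for each
of two synthetic nodes; the feature set, coefficients, and data are
invented for illustration and are not taken from the paper.

# Hedged sketch: fit a separate linear utilization-to-power model per node,
# the kind of per-node modeling the abstract argues is necessary.
# Feature names and the synthetic measurements below are illustrative only.
import numpy as np

rng = np.random.default_rng(0)

def fit_power_model(util, watts):
    """Least-squares fit: watts ~ intercept + w . util."""
    X = np.column_stack([np.ones(len(watts)), util])
    coeffs, _, _, _ = np.linalg.lstsq(X, watts, rcond=None)
    return coeffs                      # [intercept, w_cpu, w_disk]

# Two nominally identical nodes whose true coefficients differ slightly,
# standing in for the inter-node variability reported above.
for node, true_w in {"node0": (55.0, 40.0, 12.0),
                     "node1": (52.0, 46.0, 9.0)}.items():
    util = rng.uniform(0.0, 1.0, size=(200, 2))       # CPU, disk utilization
    watts = true_w[0] + util.dot(np.asarray(true_w[1:]))
    watts = watts + rng.normal(0.0, 1.0, size=200)    # measurement noise
    print(node, np.round(fit_power_model(util, watts), 1))
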
@Article{Lakshminarayana:2012:DSP,
author = "Nagesh B. Lakshminarayana and Jaekyu Lee and Hyesoon
Kim and Jinwoo Shin",
title = "{DRAM} Scheduling Policy for {GPGPU} Architectures
Based on a Potential Function",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "33--36",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.32",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "GPGPU architectures (and applications) differ in several
ways from traditional CPU architectures (and
applications): highly multithreaded execution and
SIMD-execution behavior are two important
characteristics of GPGPU computing. In this
paper, we propose a potential function that models the
DRAM behavior in GPGPU architectures and a DRAM
scheduling policy, alpha-SJF, to minimize the
potential function. The scheduling policy essentially
chooses between SJF and FR-FCFS at run-time based on
the number of requests from each thread and whether the
thread has a row buffer hit.",
acknowledgement = ack-nhfb,
affiliation = "Lakshminarayana, NB (Reprint Author), Georgia Inst
Technol, Sch Comp Sci, Atlanta, GA 30332 USA.
Lakshminarayana, Nagesh B.; Lee, Jaekyu; Kim, Hyesoon;
Shin, Jinwoo, Georgia Inst Technol, Sch Comp Sci,
Atlanta, GA 30332 USA.",
author-email = "nageshbl@cc.gatech.edu jaekyu.lee@cc.gatech.edu
hyesoon.kim@cc.gatech.edu jshin72@cc.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Computer architecture; DRAM chips;
DRAM scheduling; DRAM scheduling policy; dynamic random
access memory; Equations; general-purpose graphics
processing unit; GPGPU; GPGPU architecture; graphics
processing units; Instruction sets; Mathematical model;
multi-threading; multithreaded architecture; Potential
function; potential function; Potential function;
Processor scheduling; Random access memory; row buffer
hit; scheduling; SIMD-execution behavior",
number-of-cited-references = "5",
research-areas = "Computer Science",
researcherid-numbers = "Shin, Jinwoo/M-5389-2013",
times-cited = "7",
unique-id = "Lakshminarayana:2012:DSP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
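
The paper's potential function is not reproduced here; the sketch below
only illustrates the general idea of choosing between row-hit
(FR-FCFS-like) and shortest-job (SJF-like) service at run time. The
decision rule, data structures, and alpha threshold are hypothetical.

# Hedged sketch of choosing between FR-FCFS-like and SJF-like service at run
# time; the rule and the alpha threshold below are illustrative and do not
# reproduce the paper's potential function.
from collections import namedtuple

Request = namedtuple("Request", "thread row")

def pick_request(queue, open_row, outstanding, alpha=2):
    """queue: pending Requests; open_row: currently open DRAM row;
    outstanding: thread -> number of pending requests (the 'job length')."""
    if not queue:
        return None
    sjf = min(queue, key=lambda r: outstanding[r.thread])      # shortest job
    hits = [r for r in queue if r.row == open_row]             # row-buffer hits
    if hits:
        best_hit = min(hits, key=lambda r: outstanding[r.thread])
        # Serve the row hit (FR-FCFS behaviour) unless the shortest job is
        # much shorter than the hitting thread's job, then favour SJF.
        if outstanding[best_hit.thread] <= alpha * outstanding[sjf.thread]:
            return best_hit
    return sjf

queue = [Request("t0", 7), Request("t1", 3), Request("t1", 7)]
print(pick_request(queue, open_row=7, outstanding={"t0": 9, "t1": 2}))
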
@Article{Wang:2012:ISA,
author = "Yaohua Wang and Shuming Chen and Kai Zhang and
Jianghua Wan and Xiaowen Chen and Hu Chen and Haibo
Wang",
title = "Instruction Shuffle: Achieving {MIMD}-like Performance
on {SIMD} Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "37--40",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.34",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "SIMD architectures are less efficient for applications
with diverse control-flow behavior, which can be
mainly attributed to the requirement of identical
control-flow. In this paper, we propose a novel
instruction shuffle scheme that features an efficient
control-flow handling mechanism. The cornerstones are
composed of a shuffle source instruction buffer array
and an instruction shuffle unit. The shuffle unit can
concurrently deliver instructions of multiple distinct
control-flows from the instruction buffer array to
eligible SIMD lanes. Our instruction shuffle scheme
combines the best attributes of both the SIMD and MIMD
execution paradigms. Experimental results show that an
average performance improvement of 86\% can be
achieved, at a cost of only 5.8\% area overhead.",
acknowledgement = ack-nhfb,
affiliation = "Wang, YH (Reprint Author), Natl Univ Def Technol, Sch
Comp Sci, Changsha, Hunan, Peoples R China. Wang,
Yaohua; Chen, Shuming; Zhang, Kai; Wan, Jianghua; Chen,
Xiaowen; Chen, Hu; Wang, Haibo, Natl Univ Def Technol,
Sch Comp Sci, Changsha, Hunan, Peoples R China.",
author-email = "nudtyh@gmail.com",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Natural Science Foundation of
China [61070036, 61133007]; National 863 Program of
China [2009AA011704]",
funding-text = "The work is partially supported by the National
Natural Science Foundation of China (No. 61070036), the
National Natural Science Foundation of China (No.
61133007), the National 863 Program of China (No.
2009AA011704).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Arrays; data dependent control-flow; diverse
control-flow behavior; identical control-flow behavior;
instruction buffer array; Instruction sets; instruction
shuffle; instruction shuffle unit; Kernel; MIMD
execution paradigm; MIMD-like performance; multiple
instruction multiple data; parallel processing; Process
control; Resource management; Scalability; shuffle
source instruction buffer array; SIMD; SIMD
architecture; SIMD execution paradigm; single
instruction multiple data; Vectors",
number-of-cited-references = "9",
research-areas = "Computer Science",
researcherid-numbers = "Chen, Shuming/Q-1147-2018",
times-cited = "6",
unique-id = "Wang:2012:ISA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Panda:2012:BFB,
author = "Reena Panda and Paul V. Gratz and Daniel A.
Jim{\'e}nez",
title = "{B-Fetch}: Branch Prediction Directed Prefetching for
In-Order Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "41--44",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.33",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Computer architecture is beset by two opposing trends.
Technology scaling and deep pipelining have led to high
memory access latencies; meanwhile, power and energy
considerations have revived interest in traditional
in-order processors. In-order processors, unlike their
superscalar counterparts, do not allow execution to
continue around data cache misses. In-order processors,
therefore, suffer a greater performance penalty in the
light of the current high memory access latencies.
Memory prefetching is an established technique to
reduce the incidence of cache misses and improve
performance. In this paper, we introduce B-Fetch, a new
technique for data prefetching which combines branch
prediction based lookahead deep path speculation with
effective address speculation, to efficiently improve
performance in in-order processors. Our results show
that B-Fetch improves performance 38.8\% on SPEC
CPU2006 benchmarks, beating a current, state-of-the-art
prefetcher design at similar to 1/3 the hardware
overhead.",
acknowledgement = ack-nhfb,
affiliation = "Panda, R (Reprint Author), Texas A\&M Univ, Dept Elect
\& Comp Engn, CESG, College Stn, TX 77843 USA. Panda,
Reena; Gratz, Paul V., Texas A\&M Univ, Dept Elect \&
Comp Engn, CESG, College Stn, TX 77843 USA. Jimenez,
Daniel A., Univ Texas San Antonio, Dept Comp Sci, San
Antonio, TX USA.",
author-email = "reena.panda@tamu.edu pgratz@tamu.edu dj@cs.utsa.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address speculation; B-fetch; Benchmark testing;
Branch Prediction; branch prediction based lookahead
deep path speculation; branch prediction directed
prefetching; Cache memory; computer architecture;
Computer architecture; data cache; Data Cache
Prefetching; deep pipelining; energy consideration;
Hardware; in-order processor; In-order Processors;
memory access latency; memory prefetching; Memory
Systems; Pipelines; power aware computing; power
consideration; Prefetching; Process control; Registers;
storage management; superscalar processor; technology
scaling; Value Prediction",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Panda:2012:BFB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
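
A rough sketch of the general idea of branch-prediction-directed
prefetching described above: walk the predicted path a few basic blocks
ahead and prefetch loads whose effective addresses can be speculated from
current register values. The block and register structures below are
hypothetical and are not the paper's design.

# Hedged sketch (not the paper's microarchitecture): follow the predicted
# path for a few basic blocks and prefetch the loads whose base registers
# can be speculated from the current register file.

def bfetch(start_block, blocks, predict_taken, regs, depth=3):
    """blocks: name -> dict(loads=[(base_reg, offset)],
                             taken=name, not_taken=name)."""
    prefetches, block = [], start_block
    for _ in range(depth):
        info = blocks[block]
        for base, offset in info["loads"]:
            if base in regs:                    # effective-address speculation
                prefetches.append(regs[base] + offset)
        block = info["taken"] if predict_taken(block) else info["not_taken"]
    return prefetches

blocks = {"A": {"loads": [("r1", 16)], "taken": "B", "not_taken": "C"},
          "B": {"loads": [("r2", 0)],  "taken": "A", "not_taken": "C"},
          "C": {"loads": [],           "taken": "A", "not_taken": "A"}}
print(bfetch("A", blocks, predict_taken=lambda b: b == "A",
             regs={"r1": 0x1000, "r2": 0x2000}))
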
@Article{Miller:2012:MEP,
author = "Timothy N. Miller and Renji Thomas and Radu
Teodorescu",
title = "Mitigating the Effects of Process Variation in
Ultra-low Voltage Chip Multiprocessors using Dual
Supply Voltages and Half-Speed Units",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "45--48",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.36",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Energy efficiency is a primary concern for
microprocessor designers. One very effective approach
to improving processor energy efficiency is to lower
its supply voltage to very near the transistor
threshold voltage. This reduces power consumption
dramatically, improving energy efficiency by an order
of magnitude. Low voltage operation, however, increases
the effects of parameter variation resulting in
significant frequency heterogeneity between (and
within) otherwise identical cores. This heterogeneity
severely limits the maximum frequency of the entire
CMP. We present a combination of techniques aimed at
reducing the effects of variation on the performance
and energy efficiency of near-threshold, many-core
CMPs. Dual Voltage Rail (DVR) mitigates core-to-core
variation with a dual-rail power delivery system that
allows post-manufacturing assignment of different
supply voltages to individual cores. This speeds up
slow cores by assigning them to a higher voltage and
saves power on fast cores by assigning them to a lower
voltage. Half-Speed Unit (HSU) mitigates within-core
variation by halving the frequency of select functional
blocks with the goal of boosting the frequency of
individual cores, thus raising the frequency ceiling
for the entire CMP. Together, these variation-reduction
techniques result in almost 50\% improvement in CMP
performance for the same power consumption over a mix
of workloads.",
acknowledgement = ack-nhfb,
affiliation = "Miller, TN (Reprint Author), Ohio State Univ, Dept
Comp Sci \& Engn, Columbus, OH 43210 USA. Miller,
Timothy N.; Thomas, Renji; Teodorescu, Radu, Ohio State
Univ, Dept Comp Sci \& Engn, Columbus, OH 43210 USA.",
author-email = "millerti@cse.ohio-state.edu thomasr@cse.ohio-state.edu
teodores@cse.ohio-state.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-1117799]",
funding-text = "This work was supported in part by the National
Science Foundation under grant CCF-1117799 and an
allocation of computing time from the Ohio
Supercomputer Center. The authors would like to thank
the anonymous reviewers for their suggestions and
feedback.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; chip multiprocessors; Clocks; CMP
frequency ceiling; CMP performance; Computer
architecture; core-to-core variation; Delay; dual
supply voltage; dual voltage rail; dual-rail power
delivery system; energy conservation; Energy
efficiency; energy efficiency; Energy efficiency;
frequency heterogeneity; half-speed unit; low voltage
operation; microprocessor chips; microprocessor design;
Multiprocessing systems; near-threshold voltage;
parameter variation; power aware computing; power
consumption; Power demand; process variation; process
variation effect; Rails; supply voltage assignment;
Threshold voltage; transistor threshold voltage;
ultra-low voltage chip multiprocessors; within-core
variation",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Miller:2012:MEP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
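
The dual-rail assignment described above can be pictured with the
following sketch, which moves the slowest cores to the higher supply rail
to lift the chip-wide frequency floor; the speedup factor and rail
capacity are illustrative parameters, not the paper's.

# Hedged sketch of the dual-rail idea: given per-core frequencies at the low
# rail, assign the slowest cores to the high rail so the slowest core (and
# hence the chip frequency) improves. Parameters are made up.
def assign_rails(core_freqs_ghz, high_rail_slots, high_rail_speedup=1.25):
    order = sorted(range(len(core_freqs_ghz)), key=lambda c: core_freqs_ghz[c])
    rails, freqs = {}, list(core_freqs_ghz)
    for rank, core in enumerate(order):
        on_high = rank < high_rail_slots          # slowest cores get VddH
        rails[core] = "VddH" if on_high else "VddL"
        if on_high:
            freqs[core] *= high_rail_speedup
    return rails, min(freqs)                      # chip frequency = slowest core

rails, fmax = assign_rails([0.48, 0.61, 0.52, 0.66], high_rail_slots=2)
print(rails, round(fmax, 2))
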
@Article{Li:2012:LSS,
author = "Yong Li and Rami Melhem and Alex K. Jones",
title = "Leveraging Sharing in Second Level
Translation-Lookaside Buffers for Chip
Multiprocessors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "49--52",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.35",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Traversing page table during virtual to physical
address translation causes significant pipeline stalls
when misses occur in the translation-lookaside buffer
(TLB). To mitigate this penalty, we propose a fast,
scalable, multi-level TLB organization that leverages
page sharing behaviors and performs efficient TLB entry
placement. Our proposed partial sharing TLB (PSTLB)
reduces TLB misses by around 60\%. PSTLB also improves
TLB performance by nearly 40\% compared to traditional
private TLBs and 17\% over the state of the art
scalable TLB proposal.",
acknowledgement = ack-nhfb,
affiliation = "Li, Y (Reprint Author), Univ Pittsburgh, Dept Elect \&
Comp Engn, Pittsburgh, PA 15261 USA. Li, Yong, Univ
Pittsburgh, Dept Elect \& Comp Engn, Pittsburgh, PA
15261 USA.",
author-email = "yol26@pitt.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-0702452]",
funding-text = "This work is supported by NSF award CCF-0702452",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; buffer storage; chip
multiprocessor; CMPs; Fluids; microprocessor chips;
multilevel TLB organization; multiprocessing systems;
Oceans; page sharing behavior; Partial Sharing; partial
sharing TLB; Prefetching; private TLB; program
interpreters; Runtime; second level
translation-lookaside buffers; Tiles; TLB entry
placement; TLBs; Virtual private networks;
virtual-to-physical address translation",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Li:2012:LSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Delimitrou:2012:DDS,
author = "Christina Delimitrou and Sriram Sankar and Kushagra
Vaid and Christos Kozyrakis",
title = "Decoupling Datacenter Storage Studies from Access to
Large-Scale Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "53--56",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2011.37",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Suboptimal storage design has significant cost and
power impact in large-scale datacenters (DCs).
Performance, power and cost-optimized systems require
deep understanding of target workloads, and mechanisms
to effectively model different storage design choices.
Traditional benchmarking is invalid in cloud
data-stores, representative storage profiles are hard
to obtain, and replaying applications in different
storage configurations is impractical in both cost and
time. Despite these issues, current workload generators
are not able to reproduce key aspects of real
application patterns (e.g., spatial/temporal locality,
I/O intensity). In this paper, we propose a modeling
and generation framework for large-scale storage
applications. As part of this framework we use a state
diagram-based storage model, extend it to a
hierarchical representation, and implement a tool that
consistently recreates DC application I/O loads. We
present the principal features of the framework that
allow accurate modeling and generation of storage
workloads, and the validation process performed against
ten original DC application traces. Finally, we explore
two practical applications of this methodology: SSD
caching and defragmentation benefits on enterprise
storage. Since knowledge of the workload's spatial and
temporal locality is necessary to model these use
cases, our framework was instrumental in quantifying
their performance benefits. The proposed methodology
provides detailed understanding of the storage activity
of large-scale applications, and enables a wide
spectrum of storage studies, without the requirement to
access application code and full application
deployment.",
acknowledgement = ack-nhfb,
affiliation = "Delimitrou, C (Reprint Author), Stanford Univ,
Stanford, CA 94305 USA. Delimitrou, Christina;
Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
USA. Sankar, Sriram; Vaid, Kushagra, Microsoft Corp,
Seattle, WA USA.",
author-email = "cdel@stanford.edu srsankar@microsoft.com
kvaid@microsoft.com kozyraki@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "057JO",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cloud data-store; Computational modeling; computer
centres; cost impact; datacenter storage; Electronic
mail; enterprise storage defragmentation; Generators;
large-scale datacenter; Load modeling; Mass storage;
Modeling of computer architecture; Modeling techniques;
power impact; SSD caching; state diagram-based storage
model; Storage area networks; storage design choice;
storage management; storage profile; storage workload;
suboptimal storage design; Super (very large)
computers; Throughput; Very large scale integration;
workload spatial locality; workload temporal locality",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Delimitrou:2012:DDS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
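
A minimal sketch of a state-diagram-driven I/O generator in the spirit of
the framework above; the states, transition probabilities, and block space
below are invented for illustration and do not come from the paper.

# Hedged sketch: a tiny state-machine (Markov) workload generator that emits
# a synthetic I/O trace of (operation, block) pairs.
import random

random.seed(1)

BLOCKS = 1 << 20                      # size of the synthetic block space
STATES = {                            # state -> (operation, pattern, next-state weights)
    "seq_read":   ("read",  "seq",  {"seq_read": 0.8, "rand_write": 0.2}),
    "rand_write": ("write", "rand", {"seq_read": 0.3, "rand_write": 0.7}),
}

def generate(n, state="seq_read"):
    trace, last_block = [], 0
    for _ in range(n):
        op, pattern, nxt = STATES[state]
        block = ((last_block + 1) % BLOCKS if pattern == "seq"
                 else random.randrange(BLOCKS))
        trace.append((op, block))
        last_block = block
        state = random.choices(list(nxt), weights=list(nxt.values()))[0]
    return trace

print(generate(5))
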
@Article{Chen:2012:NPD,
author = "Jie Chen and Guru Venkataramani and Gabriel Parmer",
title = "The Need for Power Debugging in the Multi-Core
Environment",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "57--60",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Debugging an application for power has a wide array of
benefits ranging from minimizing thermal hotspots
to reducing the likelihood of CPU malfunction. In this
work, we justify the need for power debugging, and show
that performance debugging of a parallel application
does not automatically guarantee power balance across
multiple cores. We perform experiments and show our
results using two case study benchmarks, Volrend from
Splash-2 and Bodytrack from Parsec-1.0.",
acknowledgement = ack-nhfb,
affiliation = "Chen, J (Reprint Author), George Washington Univ,
Washington, DC 20052 USA. Chen, Jie; Venkataramani,
Guru; Parmer, Gabriel, George Washington Univ,
Washington, DC 20052 USA.",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CCF-1117243]",
funding-text = "This material is based upon work supported in part by
the National Science Foundation under Grant No.
CCF-1117243.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Bodytrack; Debugging; Instruction
sets; Multi-cores; multicore environment; Multicore
processing; multiprocessing systems; parallel
application; parallel programming; Parsec-1.0;
performance debugging; power aware computing; power
balance; Power Debugging; power debugging; Power
Debugging; Power demand; Power Imbalance; program
debugging; Splash-2; Volrend",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Chen:2012:NPD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Meza:2012:EES,
author = "Justin Meza and Jichuan Chang and HanBin Yoon and Onur
Mutlu and Parthasarathy Ranganathan",
title = "Enabling Efficient and Scalable Hybrid Memories Using
Fine-Granularity {DRAM} Cache Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "61--64",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hybrid main memories composed of DRAM as a cache to
scalable non-volatile memories such as phase-change
memory (PCM) can provide much larger storage capacity
than traditional main memories. A key challenge for
enabling high-performance and scalable hybrid memories,
though, is efficiently managing the metadata (e.g.,
tags) for data cached in DRAM at a fine granularity.
Based on the observation that storing metadata off-chip
in the same row as their data exploits DRAM row buffer
locality, this paper reduces the overhead of
fine-granularity DRAM caches by only caching the
metadata for recently accessed rows on-chip using a
small buffer. Leveraging the flexibility and efficiency
of such a fine-granularity DRAM cache, we also develop
an adaptive policy to choose the best granularity when
migrating data into DRAM. On a hybrid memory with a
512MB DRAM cache, our proposal using an 8KB on-chip
buffer can achieve within 6\% of the performance of,
and 18\% better energy efficiency than, a conventional
8MB SRAM metadata store, even when the energy overhead
due to large SRAM metadata storage is not considered.",
acknowledgement = ack-nhfb,
affiliation = "Meza, J (Reprint Author), Carnegie Mellon Univ,
Pittsburgh, PA 15213 USA. Meza, Justin; Yoon, HanBin;
Mutlu, Onur, Carnegie Mellon Univ, Pittsburgh, PA 15213
USA. Chang, Jichuan; Ranganathan, Parthasarathy,
Hewlett Packard Labs, Palo Alto, CA USA.",
author-email = "meza@cmu.edu jichuan.chang@hp.com hanbinyoon@cmu.edu
onur@cmu.edu partha.ranganathan@hp.com",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF CAREER [CCF-0953246]; NSF EAGER
[CCF-1147397]; Gigascale Systems Research Center",
funding-text = "We thank the members of the SAFARI research group and
the anonymous reviewers for their comments and
suggestions. We gratefully acknowledge the support of
an NSF CAREER Award CCF-0953246, NSF EAGER Grant
CCF-1147397, and the Gigascale Systems Research Center.
Part of this work was done while Justin Meza and HanBin
Yoon were interns at Hewlett-Packard Labs.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Buffer storage; Cache memories; Cache
memory; cache storage; data migration; DRAM chips; DRAM
row buffer locality; dynamic random access memory;
fine-granularity DRAM cache management; hybrid main
memories; hybrid main memory; Indexes; Memory
management; meta data; metadata caching; metadata
management; metadata storage; non-volatile memories;
Phase change materials; phase-change memory; Random
access memory; scalable hybrid memory;
System-on-a-chip; tag storage",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "35",
unique-id = "Meza:2012:EES",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
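
A minimal sketch of the idea of keeping on chip only the DRAM-cache
metadata of recently accessed rows; the buffer size, the LRU policy, and
the row-metadata accessor are illustrative assumptions, not the paper's
exact design.

# Hedged sketch: cache the tags of recently touched DRAM-cache rows in a tiny
# on-chip buffer, fetching them from the row itself on a buffer miss.
from collections import OrderedDict

class RowTagBuffer:
    def __init__(self, entries=64):
        self.entries, self.buf = entries, OrderedDict()   # row -> tag metadata

    def lookup(self, row, fetch_row_metadata):
        if row in self.buf:                 # hit: metadata already on chip
            self.buf.move_to_end(row)
            return self.buf[row], True
        tags = fetch_row_metadata(row)      # miss: read tags stored in the row
        self.buf[row] = tags
        if len(self.buf) > self.entries:
            self.buf.popitem(last=False)    # evict least recently used row
        return tags, False

buf = RowTagBuffer(entries=2)
fetch = lambda row: {"tags": f"tags-of-row-{row}"}
for row in (3, 4, 3, 5, 4):
    print(row, buf.lookup(row, fetch)[1])   # True on buffer hit
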
@Article{Zidenberg:2012:MHS,
author = "Tsahee Zidenberg and Isaac Keslassy and Uri Weiser",
title = "{MultiAmdahl}: How Should {I} Divide My Heterogeneous
Chip?",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "65--68",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Future multiprocessor chips will integrate many
different units, each tailored to a specific
computation. When designing such a system, a chip
architect must decide how to distribute the available
limited system resources, such as area and power, among
all the computational units. In this paper, we
introduce MultiAmdahl, an analytical optimization
technique for resource sharing among heterogeneous
units. MultiAmdahl takes into account the workload, the
performance of each computational unit, and the total
available resource. The results obtained by MultiAmdahl
allow us, for example, to provide a closed-form
solution for an optimal asymmetric-offload chip, and to
analyze the impact of different design constraints on
an optimal chip architecture.",
acknowledgement = ack-nhfb,
affiliation = "Zidenberg, T (Reprint Author), Technion Israel Inst
Technol, EE Dept, Haifa, Israel. Zidenberg, Tsahee;
Keslassy, Isaac; Weiser, Uri, Technion Israel Inst
Technol, EE Dept, Haifa, Israel.",
author-email = "tsahee@tx.technion.ac.il isaac@ee.technion.ac.il
weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "057JO",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "European Research Council [210389]; Intel
Heterogeneous Computing research grant",
funding-text = "This work was partly supported by the European
Research Council Starting Grant No. 210389 and by the
Intel Heterogeneous Computing research grant.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "area resource; asymmetric-offload chip; Central
Processing Unit; Chip Multiprocessors; Computational
modeling; computational unit; Computer architecture;
design constraint; heterogeneous chip; heterogeneous
unit; Mathematical model; microprocessor chips;
Modeling of computer architecture; MultiAmdahl
analytical optimization technique; multiprocessing
systems; multiprocessor chip; optimal chip
architecture; Optimization; power resource; Program
processors; resource allocation; Resource management;
resource sharing",
keywords-plus = "AMDAHLS LAW",
number-of-cited-references = "7",
research-areas = "Computer Science",
times-cited = "12",
unique-id = "Zidenberg:2012:MHS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
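
A schematic MultiAmdahl-style formulation, written out only to make the
abstract's optimization concrete; the notation and the performance
functions are illustrative and do not reproduce the paper's derivation.

% Illustrative sketch (assumed notation), requires amsmath:
\[
  \min_{a_1,\dots,a_n} \; T(a_1,\dots,a_n)
  \;=\; \sum_{i=1}^{n} \frac{f_i}{p_i(a_i)}
  \quad\text{subject to}\quad \sum_{i=1}^{n} a_i \le A ,
\]
where $f_i$ is the fraction of the workload offloaded to unit $i$,
$p_i(a_i)$ its performance when given resource (area or power) $a_i$,
and $A$ the total budget.  The Lagrangian optimum equalizes marginal
benefit across units,
\[
  -\frac{\partial}{\partial a_i}\,\frac{f_i}{p_i(a_i)} \;=\; \lambda
  \qquad \text{for every unit } i ,
\]
i.e.\ each unit yields the same reduction in execution time per unit of
additional resource.
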
@Article{Anonymous:2012:BC,
author = "Anonymous",
title = "[{Back} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.38",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:BIC,
author = "Anonymous",
title = "[{Back} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.37",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2012:FIC,
author = "Anonymous",
title = "[{Front} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "11",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.36",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Skadron:2013:INE,
author = "Kevin Skadron",
title = "Introducing the New {Editor-in-Chief} of the
{{\booktitle{IEEE Computer Architecture Letters}}}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "1--1",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.15",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The out-going Editor-in-Chief introduces Jose F.
Mart{\'\i}nez as the new Editor-in-Chief (EIC) of the
IEEE Computer Architecture Letters (CAL). A brief
professional biography is included. In addition, it is
noted that CAL aims to provide fast turnaround for
early work with outstanding promise. The majority of
decisions are returned within one month, nearly all
within six weeks, and all decisions are rendered within
two months. The overall acceptance rate has
consistently run at about 25\%. Many papers first
published in CAL go on to become full papers in premier
conferences and journals, and CAL's impact factor
continues to increase. CAL has been a valuable addition
to the publishing landscape in computer architecture
and under Prof. Martinez's leadership, we can look
forward to even greater impact in the future. I would
like to take this opportunity to thank all of the CAL
Associate Editors, authors, readers, and reviewers for
their great help and support.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Skadron:2013:INE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2013:AI,
author = "Anonymous",
title = "2012 Annual Index",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This index covers all technical items - papers,
correspondence, reviews, etc. - that appeared in this
periodical during the year, and items from previous
years that were commented upon or corrected in this
year. Departments and other items may also be covered
if they have been judged to have archival value. The
Author Index contains the primary entry for each item,
listed under the first author's name. The primary entry
includes the co-authors' names, the title of the paper
or other item, and its location, specified by the
publication abbreviation, year, month, and inclusive
pagination. The Subject Index contains entries
describing the item under all appropriate subject
headings, plus the first author's name, the publication
abbreviation, month, and year, and inclusive pages.
Note that the item title is found only under the
primary entry in the Author Index.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Indexes",
}
@Article{Eeckhout:2013:MNE,
author = "Lieven Eeckhout",
title = "A Message from the New {Editor-in-Chief} and
Introduction of New {Associate Editors}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "2--2",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
affiliation = "Eeckhout, L (Reprint Author), Univ Ghent, B-9000
Ghent, Belgium.",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
number-of-cited-references = "0",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Eeckhout:2013:MNE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Martinez:2013:MNE,
author = "J. Martinez",
title = "A Message from the New {Editor-in-Chief} and
Introduction of New {Associate} Editors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "2--4",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.12",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The incoming Editor-in-Chief states that his goal
during his tenure with IEEE Computer Architecture
Letters (CAL) will be to further increase its
visibility in our research community, and to attract
more submissions from computer architecture leaders.
The {"Best} of {CAL"} session at HPCA, which has taken
place for the last couple of years, is a good step in
this direction. He is also committed to continue
improving the coordination with authors and conference
program chairs, and to consolidate CAL's unique place
in the publication pipeline as the prime venue for
quick dissemination of high-quality novel ideas and
early results.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Tavakkol:2013:NSS,
author = "Arash Tavakkol and Mohammad Arjomand and Hamid
Sarbazi-Azad",
title = "{Network-on-SSD}: a Scalable and High-Performance
Communication Design Paradigm for {SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In recent years, flash memory solid state disks (SSDs)
have shown a great potential to change storage
infrastructure because of their advantages of high speed
and high throughput random access. This promising
storage, however, greatly suffers from performance loss
because of frequent ``erase-before-write'' and
``garbage collection'' operations. Thus, novel
circuit-level, architectural, and algorithmic
techniques are currently explored to address these
limitations. In parallel with others, current study
investigates replacing shared buses in multi-channel
architecture of SSDs with an interconnection network to
achieve scalable, high throughput, and reliable SSD
storage systems. Roughly speaking, such a communication
scheme provides superior parallelism that allows us to
compensate the main part of the performance loss
related to the aforementioned limitations through
increasing data storage and retrieval processing
throughput.",
acknowledgement = ack-nhfb,
affiliation = "Tavakkol, A (Reprint Author), Sharif Univ Technol,
Dept Comp Engn, HPCAN Lab, Tehran, Iran. Tavakkol,
Arash; Arjomand, Mohammad; Sarbazi-Azad, Hamid, Sharif
Univ Technol, Dept Comp Engn, HPCAN Lab, Tehran, Iran.
Sarbazi-Azad, Hamid, Inst Res Fundamental Sci IPM, Sch
Comp Sci, Tehran, Iran.",
author-email = "tavakkol@ce.sharif.edu arjomand@ce.sharif.edu
azad@sharif.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "algorithmic technique; architectural technique;
Bandwidth; Buffer storage; circuit-level technique;
Complexity theory; Data storage systems; data storage
throughput; flash memories; Flash memory; flash memory
solid state disks; frequent erase-before-write
operations; garbage collection operations; high speed
random access; high throughput random access;
high-performance communication design paradigm;
integrated circuit design; integrated circuit
reliability; Inter-package parallelism; interconnection
network; Interconnection network; interconnection
network; Interconnections (Subsystems); Mass storage;
memory architecture; multichannel architecture;
multiprocessor interconnection networks;
network-on-chip; network-on-SSD; parallel memories;
Parallel processing; parallel storage; performance
evaluation; performance loss; retrieval processing
throughput; scalable communication design paradigm;
Solid state disk; SSD storage system reliability;
storage infrastructure; storage management; system
buses; Throughput",
keywords-plus = "MEMORY",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Tavakkol:2013:NSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sun:2013:NWC,
author = "Guang Sun and Chia-Wei Chang and Bill Lin",
title = "A New Worst-Case Throughput Bound for Oblivious
Routing in Odd Radix Mesh Network",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "1/2 network capacity is often believed to be the limit
of worst-case throughput for mesh networks. However,
this letter provides a new worst-case throughput bound,
which is higher than 1/2 network capacity, for odd
radix two-dimensional mesh networks. In addition, we
propose a routing algorithm called U2TURN that can
achieve this worst-case throughput bound. U2TURN
considers all routing paths with at most 2 turns and
distributes the traffic loads uniformly in both X and Y
dimensions. Theoretical analysis and simulation results
show that U2TURN outperforms existing routing
algorithms in worst-case throughput. Moreover, U2TURN
achieves good average-throughput at the expense of
approximately 1.5x the minimal average hop count.",
acknowledgement = ack-nhfb,
affiliation = "Sun, G (Reprint Author), Tsinghua Univ, Beijing,
Peoples R China. Sun, Guang, Tsinghua Univ, Beijing,
Peoples R China. Chang, Chia-Wei; Lin, Bill, Univ Calif
San Diego, San Diego, CA 92103 USA.",
da = "2019-06-20",
doc-delivery-number = "172HT",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; average-case
throughput; Computer architecture; Interconnection
architectures; mesh; Mesh networks; network capacity;
network-on-chip; Networks-on-Chip (NoC); oblivious
routing; odd radix mesh network; odd radix
two-dimensional mesh network; On-chip interconnection
networks; Parallel algorithms; Routing; routing;
Routing; Routing protocols; Throughput; traffic load;
U2TURN; Worst-case analysis; worst-case throughput;
worst-case throughput bound",
number-of-cited-references = "10",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009",
times-cited = "1",
unique-id = "Sun:2013:NWC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
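
A toy sketch of routing with at most two turns while spreading load in
both dimensions, as the abstract describes: it enumerates X-Y-X and Y-X-Y
candidate routes through an intermediate column or row and picks one
uniformly. This only approximates the idea behind U2TURN and is not the
algorithm itself.

# Hedged sketch: candidate routes with at most two turns in a k x k mesh,
# chosen uniformly to spread load in both X and Y.
import random

random.seed(0)

def two_turn_routes(src, dst, k):
    (sx, sy), (dx, dy) = src, dst
    routes = []
    for ix in range(k):                      # X -> Y -> X through column ix
        routes.append([(sx, sy), (ix, sy), (ix, dy), (dx, dy)])
    for iy in range(k):                      # Y -> X -> Y through row iy
        routes.append([(sx, sy), (sx, iy), (dx, iy), (dx, dy)])
    return routes

def u2turn_like(src, dst, k):
    return random.choice(two_turn_routes(src, dst, k))

print(u2turn_like((0, 0), (2, 3), k=5))      # list of turning points
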
@Article{Karsli:2013:EDT,
author = "I. Burak Karsli and Pedro Reviriego and M. Fatih Balli
and O{\u{g}}uz Ergin and J. A. Maestro",
title = "Enhanced Duplication: a Technique to Correct Soft
Errors in Narrow Values",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Soft errors are transient errors that can alter the
logic value of a register bit causing data corruption.
They can be caused by radiation particles such as
neutrons or alpha particles. Narrow values are commonly
found in the data consumed or produced by processors.
Several techniques have recently been proposed to
exploit the unused bits in narrow values to protect
them against soft errors. These techniques replicate
the narrow value over the unused register bits such
that errors can be detected when the value is
duplicated and corrected when the value is tripled. In
this letter, a technique that can correct errors when
the narrow value is only duplicated is presented. The
proposed approach stores a modified duplicate of the
narrow value such that errors on the original value and
the duplicate can be distinguished and therefore
corrected. The scheme has been implemented at the
circuit level to evaluate its speed and also at the
architectural level to assess the benefits in
correcting soft errors. The results show that the
scheme is significantly faster than a parity check and
can substantially increase the number of soft errors
that are corrected compared to existing techniques.",
acknowledgement = ack-nhfb,
affiliation = "Karsli, IB (Reprint Author), TOBB Univ Econ \&
Technol, Ankara, Turkey. Karsli, I. Burak; Balli, M.
Fatih; Ergin, O{\u{g}}uz, TOBB Univ Econ \& Technol,
Ankara, Turkey. Reviriego, Pedro; Maestro, J. A., Univ
Antonio de Nebrija, Madrid, Spain.",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Science and Education
[AYA2009-13300-C03]; Scientific and Technological
Research Council of Turkey (TUBITAK) [112E004]",
funding-text = "This work was supported in part by the Spanish
Ministry of Science and Education under Grant
AYA2009-13300-C03 and by the Scientific and
Technological Research Council of Turkey (TUBITAK)
under Grant 112E004. The work is a collaboration in the
framework of COST ICT Action 1103 ``Manufacturable and
Dependable Multicore Architectures at Nanoscale''.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "alpha particles; architectural level; Benchmark
testing; computer architecture; Data Cache; data
corruption; Data processing; enhanced duplication;
Error correction; Error Correction; Error correction;
Error-checking; Logic gates; logic value;
microprocessor chips; narrow values; Narrow Values;
narrow values; neutrons; Parity check codes;
processors; Program processors; radiation hardening
(electronics); radiation particles; Redundant design;
register bit; Registers; soft errors; Soft Errors; soft
errors",
number-of-cited-references = "11",
ORCID-numbers = "Sousa, Leonel/0000-0002-8066-221X Ergin,
O{\u{g}}uz/0000-0003-2701-3787 Maestro, Juan
Antonio/0000-0001-7133-9026 Reviriego,
Pedro/0000-0001-6805-6519",
research-areas = "Computer Science",
researcherid-numbers = "Sousa, Leonel/B-2749-2009 Ergin,
O{\u{g}}uz/E-5717-2010 Maestro, Juan
Antonio/L-6091-2014 Reviriego, Pedro/B-8353-2009",
times-cited = "2",
unique-id = "Karsli:2013:EDT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
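
One concrete ``modified duplicate'' with the distinguish-and-correct
property described above is sketched below; the paper's actual encoding
may differ. Storing v XOR rotl(v, 1) in the unused upper bits yields a
single-error-correcting code: an upset in the duplicate produces a one-bit
syndrome, while an upset in the original produces a two-bit, cyclically
adjacent syndrome that locates the flipped bit.

# Hedged sketch of a modified-duplicate encoding for an N-bit narrow value.
N = 16                                   # width of the narrow value

def rotl(v, r):
    return ((v << r) | (v >> (N - r))) & ((1 << N) - 1)

def encode(v):
    return v, v ^ rotl(v, 1)             # (original, modified duplicate)

def decode(a, b):
    s = b ^ (a ^ rotl(a, 1))             # syndrome
    if s == 0 or bin(s).count("1") == 1:
        return a                         # no error, or error in the duplicate
    for i in range(N):                   # 2-bit syndrome: locate bit i in a
        if s == (1 << i) | (1 << ((i + 1) % N)):
            return a ^ (1 << i)
    raise ValueError("uncorrectable multi-bit error")

a, b = encode(0x00AB)
print(hex(decode(a ^ 0x0010, b)))        # flip a bit of the value    -> 0xab
print(hex(decode(a, b ^ 0x0400)))        # flip a bit of the duplicate -> 0xab
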
@Article{Lyons:2013:SFF,
author = "Michael Lyons and Gu-Yeon Wei and David Brooks",
title = "{Shrink-Fit}: a Framework for Flexible Accelerator
Sizing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "RTL design complexity discouraged adoption of
reconfigurable logic in general purpose systems,
impeding opportunities for performance and energy
improvements. Recent improvements to HLS compilers
simplify RTL design and are easing this barrier. A new
challenge will emerge: managing reconfigurable
resources between multiple applications with custom
hardware designs. In this paper, we propose a method to
``shrink-fit'' accelerators within widely varying fabric
budgets. Shrink-fit automatically shrinks existing
accelerator designs within small fabric budgets and
grows designs to increase performance when larger
budgets are available. Our method takes advantage of
current accelerator design techniques and introduces a
novel architectural approach based on fine-grained
virtualization. We evaluate shrink-fit using a
synthesized implementation of an IDCT for decoding
JPEGs and show the IDCT accelerator can shrink by a
factor of 16x with minimal performance and area
overheads. Using shrink-fit, application designers can
achieve the benefits of hardware acceleration with
single RTL designs on FPGAs large and small.",
acknowledgement = ack-nhfb,
affiliation = "Lyons, M (Reprint Author), Harvard Univ, Sch Engn \&
Appl Sci, Cambridge, MA 02138 USA. Lyons, Michael; Wei,
Gu-Yeon; Brooks, David, Harvard Univ, Sch Engn \& Appl
Sci, Cambridge, MA 02138 USA.",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accelerators; computational complexity; Computer
applications; custom hardware design; Decoding;
discrete cosine transforms; fabric budget; field
programmable gate arrays; Field programmable gate
arrays; fine grained virtualization; flexible
accelerator sizing; FPGA; general purpose computers;
general purpose system; hardware acceleration;
Heterogeneous (hybrid) systems; HLS compiler; IDCT
accelerator; inverse transforms; JPEG decoding; program
compilers; Program processors; reconfigurable
architectural approach; reconfigurable architectures;
Reconfigurable hardware; reconfigurable logic;
reconfigurable resource management; RTL design
complexity; Runtime; shrink fit accelerator;
Special-Purpose and Application-Based Systems; temporal
logic; virtual machines; virtualisation",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Lyons:2013:SFF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Duong:2013:CAS,
author = "Nam Duong and Alexander V. Veidenbaum",
title = "Compiler-Assisted, Selective Out-Of-Order Commit",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes an out-of-order instruction commit
mechanism using a novel compiler/architecture
interface. The compiler creates instruction ``blocks''
guaranteeing some commit conditions and the processor
uses the block information to commit certain
instructions out of order. Micro-architectural support
for the new commit mode is made on top of the standard,
ROB-based processor and includes out-of-order
instruction commit with register and load queue entry
release. The commit mode may be switched multiple times
during execution. Initial results for a 4-wide
processor show that, on average, 52\% of instructions are
committed out of order, resulting in 10\% to 26\%
speedups over in-order commit, with minimal hardware
overhead. The performance improvement is a result of an
effectively larger instruction window that allows more
cache misses to be overlapped for both L1 and L2
caches.",
acknowledgement = ack-nhfb,
affiliation = "Duong, N (Reprint Author), Univ Calif Irvine, Dept
Comp Sci, Irvine, CA 92717 USA. Duong, Nam; Veidenbaum,
Alexander V., Univ Calif Irvine, Dept Comp Sci, Irvine,
CA 92717 USA.",
author-email = "nlduong@ics.uci.edu alexv@ics.uci.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architecture/compiler co-design; Benchmark testing;
block information; cache misses; cache storage; Cache
storage; cache storage; Cache storage; commit
conditions; compiler-architecture interface;
compiler-assisted selective out-of-order commit;
computer architecture; Computer architecture; computer
architecture; dynamically-scheduled and
statically-scheduled implementation; Hardware/software
interfaces; instruction blocks; instruction sets; L1
cache; L2 cache; load queue entry release;
microarchitectural support; minimal hardware overhead;
Out of order instruction; Out-of-order commit;
out-of-order instruction commit mechanism; overlapping
cache misses; performance evaluation; performance
improvement; Pipeline implementation; Pipeline
processors; program compilers; Program processors;
register; resource release; RISC/CISC; ROB-based
processor; Superscalar; VLIW architectures; Von Neumann
architectures",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Duong:2013:CAS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Nilakantan:2013:MES,
author = "Siddharth Nilakantan and Steven Battle and Mark
Hempstead",
title = "Metrics for Early-Stage Modeling of Many-Accelerator
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The term ``Dark Silicon'' has been coined to describe
the threat to microprocessor performance caused by
increasing transistor power density. Improving energy
efficiency is now the primary design goal for all
market segments of microprocessors from mobile to
server. Specialized hardware accelerators, designed to
run only a subset of workloads with orders of magnitude
energy efficiency improvement, are seen as a potential
solution. Selecting an ensemble of accelerators to best
cover the workloads run on a platform remains a
challenge. We propose metrics for accelerator selection
derived from a detailed communication-aware performance
model and present an automated methodology to populate
this model. Employing a combination of characterized
RTL and our selection metrics, we evaluate a set of
accelerators for a sample application and compare
performance to selections based on execution time and
Pollack's rule. We find that the architecture selected
by our communication-aware metric shows improved
performance over architectures selected based on
execution time and Pollack's rule, as they do not
account for speedup being limited by communication.",
acknowledgement = ack-nhfb,
affiliation = "Nilakantan, S (Reprint Author), Drexel Univ, Dept
Elect \& Comp Engn, Philadelphia, PA 19104 USA.
Nilakantan, Siddharth; Battle, Steven; Hempstead, Mark,
Drexel Univ, Dept Elect \& Comp Engn, Philadelphia, PA
19104 USA.",
author-email = "sn446@drexel.edu sjb328@drexel.edu mdh77@drexel.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accelerators; Code Profiling; communication-aware
performance model; Computer architecture; computer
architecture; Computer Systems Organization; dark
silicon; General; hardware accelerators; Heterogeneous
(hybrid) systems; Heterogeneous Architectures;
magnitude energy efficiency improvement;
many-accelerator architectures; microprocessor;
microprocessor chips; Modeling; Modeling of computer
architecture; modelling; Multiprocessing systems; Other
Architecture Styles; performance evaluation; Pollack
rule; Processor Architectures; Program processors; RTL;
transistor power density; transistors",
number-of-cited-references = "16",
ORCID-numbers = "Nilakantan, Siddharth/0000-0003-1067-700X",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Nilakantan:2013:MES",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Delimitrou:2013:NCD,
author = "Christina Delimitrou and Christos Kozyrakis",
title = "The {Netflix} Challenge: Datacenter Edition",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.10",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The hundreds of thousands of servers in modern
warehouse scale systems make performance and efficiency
optimizations pressing design challenges. These systems
are traditionally considered homogeneous. However, that
is not typically the case. Multiple server generations
compose a heterogeneous environment, whose performance
opportunities have not been fully explored since
techniques that account for platform heterogeneity
typically do not scale to the tens of thousands of
applications hosted in large-scale cloud providers. We
present ADSM, a scalable and efficient recommendation
system for application-to-server mapping in large-scale
datacenters (DCs) that is QoS-aware. ADSM overcomes the
drawbacks of previous techniques, by leveraging robust
and computationally efficient analytical methods to
scale to tens of thousands of applications with minimal
overheads. It is also QoS-aware, mapping applications
to platforms while enforcing strict QoS guarantees.
ADSM is derived from validated analytical models, has
low and bounded prediction errors, is simple to
implement and scales to thousands of applications
without significant changes to the system. Over 390
real DC workloads, ADSM improves performance by 16\% on
average and up to 2.5x and efficiency by 22\% in a DC
with 10 different server configurations.",
acknowledgement = ack-nhfb,
affiliation = "Delimitrou, C (Reprint Author), Stanford Univ,
Stanford, CA 94305 USA. Delimitrou, Christina;
Kozyrakis, Christos, Stanford Univ, Stanford, CA 94305
USA.",
author-email = "cdel@stanford.edu kozyraki@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "172HT",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ADSM; application mapping; Application studies
resulting in better multiple-processor systems;
application-to-server mapping; Computer architecture;
computer centres; Computer System Implementation;
Computer Systems Organization; Data centers;
datacenter; design challenge; Design studies;
evaluation; Heterogeneous (hybrid) systems; Large and
Medium ( Mainframe ) Computers; Large-scale systems;
Measurement; modeling; Multiprocessing systems; Netflix
challenge; Other Architecture Styles; Parallel
Architectures; Performance of Systems; Processor
Architectures; QoS-aware; quality of service;
Scheduling; Scheduling and task partitioning; server
generation; simulation of multiple-processor systems;
Special-Purpose and Application-Based Systems; Super
(very large) computers; warehouse-scale system",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Delimitrou:2013:NCD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2013:RL,
author = "Anonymous",
title = "2012 reviewers list",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "33--34",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.11",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The publication offers a note of thanks and lists its
reviewers.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "IEEE publishing",
}
@Article{Anonymous:2013:IOAa,
author = "Anonymous",
title = "{IEEE} Open Access Publishing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "35--35",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.13",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement: This publication offers open access
options for authors. IEEE open access publishing.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:ITN,
author = "Anonymous",
title = "{{\booktitle{IEEE Transactions}}} Newsletter",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "1",
pages = "36--36",
month = jan # "\slash " # jun,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.14",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Advertisement: Stay connected with the IEEE Computer
Society Transactions by signing up for our new
Transactions Connection newsletter. It is free and
contains valuable information.",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Martinez:2013:E,
author = "J. F. Martinez",
title = "Editorial",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "37--38",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.32",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Jian:2013:HPE,
author = "Xun Jian and John Sartori and Henry Duwe and Rakesh
Kumar",
title = "High Performance, Energy Efficient Chipkill Correct
Memory with Multidimensional Parity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "39--42",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "It is well-known that a significant fraction of server
power is consumed in memory; this is especially the
case for servers with chipkill correct memories. We
propose a new chipkill correct memory organization that
decouples correction of errors due to local faults that
affect a single symbol in a word from correction of
errors due to device-level faults that affect an entire
column, sub-bank, or device. By using a combination of
two codes that separately target these two fault modes,
the proposed chipkill correct organization reduces code
overhead by half as compared to conventional chipkill
correct memories for the same rank size. Alternatively,
this allows the rank size to be reduced by half while
maintaining roughly the same total code overhead.
Simulations using PARSEC and SPEC benchmarks show that,
compared to a conventional double chipkill correct
baseline, the proposed memory organization, by
providing double chipkill correct at half the rank
size, reduces power by up to 41\% (32\% on average) over
a conventional baseline with the same chipkill correct
strength and access granularity that relies on linear
block codes alone, at only 1\% additional code
overhead.",
acknowledgement = ack-nhfb,
affiliation = "Jian, X (Reprint Author), Univ Illinois, Urbana, IL
USA. Jian, Xun; Sartori, John; Duwe, Henry; Kumar,
Rakesh, Univ Illinois, Urbana, IL USA.",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "block codes; chipkill correct; chipkill correct memory
organization; code overhead reduction; Computer
architecture; device level fault; DRAM; DRAM chips;
error correction; error correction codes; fault mode;
fault tolerant computing; granular computing;
granularity access; linear block code; linear codes;
low power; Low power electronics; PARSEC; Random access
memory; rank size; reliable memory; server power
consumption; Servers; SPEC; storage management",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Jian:2013:HPE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
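A rough illustration to accompany the entry above: the letter combines two codes to decouple local single-symbol faults from device-level faults, and the details of that organization are not reproduced here. The Python sketch below shows only the generic two-dimensional parity idea that the title evokes, in which row and column parities over a bit matrix locate and correct a single flipped bit at the intersection of the failing row and column. All function names and the example data are assumptions for illustration, not the scheme of Jian et al.

    # Illustrative two-dimensional parity sketch (hypothetical, not the
    # organization proposed in the letter): row and column parities over a
    # bit matrix can locate and correct a single flipped bit.

    def parities(grid):
        """Return (row_parities, column_parities) for a matrix of 0/1 bits."""
        rows = [sum(r) % 2 for r in grid]
        cols = [sum(c) % 2 for c in zip(*grid)]
        return rows, cols

    def correct_single_bit(grid, stored_rows, stored_cols):
        """Correct at most one flipped bit in place; return its (row, col) or None."""
        rows, cols = parities(grid)
        bad_rows = [i for i, (a, b) in enumerate(zip(rows, stored_rows)) if a != b]
        bad_cols = [j for j, (a, b) in enumerate(zip(cols, stored_cols)) if a != b]
        if len(bad_rows) == 1 and len(bad_cols) == 1:
            i, j = bad_rows[0], bad_cols[0]
            grid[i][j] ^= 1          # flip the erroneous bit back
            return i, j
        return None                  # no error, or more errors than 2-D parity corrects

    if __name__ == "__main__":
        data = [[1, 0, 1, 1], [0, 1, 1, 0], [1, 1, 0, 0]]
        r, c = parities(data)
        data[1][2] ^= 1              # inject a single-bit error
        print(correct_single_bit(data, r, c))   # -> (1, 2)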
@Article{Maddah:2013:DDS,
author = "Rakan Maddah and Sangyeun Cho and Rami Melhem",
title = "Data Dependent Sparing to Manage Better-Than-Bad
Blocks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "43--46",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We forecast that proper handling of unreliable storage
blocks (e.g., ``bad block management'' in solid-state
drives) will remain critical for future systems built
with advanced and emerging memory technologies. This
paper argues that the conventional block retirement and
sparing approach --- a block is retired as soon as it
shows faulty behavior --- is overly conservative and
inefficient. We observe that it is highly unlikely that
all faulty bits in a storage block manifest errors.
Consequently, we propose data dependent sparing, a
relaxed block retirement and sparing approach that
recycles faulty storage blocks. At small management
cost and with less than 1\% sparing, data dependent
sparing achieves the same lifetime as the conventional
approach with 20\% sparing.",
acknowledgement = ack-nhfb,
affiliation = "Maddah, R (Reprint Author), Univ Pittsburgh, Dept Comp
Sci, Pittsburgh, PA 15260 USA. Maddah, Rakan; Cho,
Sangyeun; Melhem, Rami, Univ Pittsburgh, Dept Comp Sci,
Pittsburgh, PA 15260 USA.",
author-email = "rmaddah@cs.pitt.edu cho@cs.pitt.edu
melhem@cs.pitt.edu",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CCF-1064976, CCF-1059283,
CNS-1012070]",
funding-text = "This work is supported in part by NSF grants
CCF-1064976, CCF-1059283, and CNS-1012070.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "better-than-bad block management; data dependent
sparing; data dependent sparing approach; Data storage
systems; fault tolerant computing; faulty bits; faulty
storage blocks; flash memory; Flash memory; flash
memory; management cost; memory technologies; phase
change memories; phase-change memory; phase-change
memory (PCM); relaxed block retirement approach;
solid-state drive; solid-state drive (SSD); Solid-state
drives; solid-state drives; Sparing; sparing; storage
block; storage management; stuck-at faults; unreliable
storage block handling",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Maddah:2013:DDS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
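As a loose illustration of the "data dependent" idea in the entry above (a hypothetical sketch, not the letter's mechanism): a block whose cells contain stuck-at faults can still hold data whose bits happen to agree with the stuck values, so retirement can be deferred until an actual conflict occurs. The function names, the stuck-at map, and the spare-pool handling below are all assumptions made for the example.

    # Hypothetical sketch of data-dependent block sparing: a block with known
    # stuck-at cells is retired only when the incoming data actually conflicts
    # with a stuck bit, rather than at the first sign of a fault.

    def compatible(data_bits, stuck_at):
        """True if every stuck-at cell already holds the value the data needs.

        data_bits: list of 0/1 values to be written to the block.
        stuck_at:  dict mapping bit position -> stuck value (0 or 1).
        """
        return all(data_bits[pos] == val for pos, val in stuck_at.items())

    def write_block(data_bits, stuck_at, spare_pool):
        """Write to the faulty block if possible, otherwise fall back to a spare."""
        if compatible(data_bits, stuck_at):
            return "wrote to faulty-but-usable block"
        if spare_pool:
            spare_pool.pop()
            return "conflict with stuck bit: remapped to spare block"
        return "write failed: no spares left"

    if __name__ == "__main__":
        stuck = {3: 1, 7: 0}                 # two cells stuck at fixed values
        spares = ["spare0"]
        print(write_block([0, 1, 0, 1, 1, 0, 1, 0], stuck, spares))  # compatible
        print(write_block([0, 1, 0, 0, 1, 0, 1, 1], stuck, spares))  # conflicts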
@Article{Kim:2013:CFC,
author = "Hanjoon Kim and Yonggon Kim and John Kim",
title = "Clumsy Flow Control for High-Throughput Bufferless
On-Chip Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "47--50",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Bufferless on-chip networks are an alternative type of
on-chip network organization that can improve the
cost-efficiency of an on-chip network by removing
router input buffers. However, bufferless on-chip
network performance degrades at high load because of
the increased network contention and the large number of
deflected packets. The energy benefit of a bufferless
network is also reduced because of the increased
deflection. In this work, we propose a novel flow
control for bufferless on-chip networks in
high-throughput manycore accelerator architectures to
reduce the impact of deflection routing. By using a
clumsy flow control (CFC), instead of the per-hop flow
control that is commonly used in buffered on-chip
networks, we are able to reduce the amount of
deflection by up to 92\% on high-throughput workloads.
As a result, on average, CFC can approximately match
the performance of a baseline buffered router while
reducing the energy consumption by approximately
39\%.",
acknowledgement = ack-nhfb,
affiliation = "Kim, H (Reprint Author), Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon, South Korea. Kim,
Hanjoon; Kim, Yonggon; Kim, John, Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon, South Korea.",
author-email = "hanj@kaist.ac.kr ilios@kaist.ac.kr jjk12@kaist.ac.kr",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "MKE, Korea, under the ITRC
[NIPA-2012-H0301-12-1011]; BST program through the NRF
of Korea; MEST [2012-0003579]",
funding-text = "This research was supported in part by the MKE, Korea,
under the ITRC support program supervised by the NIPA
(NIPA-2012-H0301-12-1011) and in part by BST program
through the NRF of Korea funded by the
MEST(2012-0003579).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bufferless NoC; bufferless router; CFC; clumsy flow
control; computer architecture; Computer architecture;
Computer Systems Organization; cost-efficiency
improvement; Data processing; deflection routing;
deflection routing impact reduction; energy benefit;
energy consumption reduction; flow control;
high-throughput bufferless on-chip networks;
high-throughput manycore accelerator architectures;
high-throughput workloads; Interconnection
architectures; microprocessor chips; Multiple Data
Stream Architectures (Multiprocessors); Multiprocessing
systems; network contention; network routing;
network-on-chip; On-chip interconnection networks;
on-chip network organization; on-chip networks;
Parallel architectures; Parallel Architectures;
performance evaluation; Processor Architectures; router
input buffer removal; System-on-chip",
number-of-cited-references = "14",
research-areas = "Computer Science",
researcherid-numbers = "Kim, John/C-1792-2011",
times-cited = "7",
unique-id = "Kim:2013:CFC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kai:2013:GRP,
author = "Yi Kai and Yi Wang and Bin Liu",
title = "{GreenRouter}: Reducing Power by Innovating Router's
Architecture",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "51--54",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "High-speed routers in the Internet are becoming more
powerful, as well as more energy hungry. In this paper,
we present a new router architecture named
GreenRouter, which separates a line-card into two parts:
a network interface card (DB) and a packet processing card
(MB), connected by a two-stage switch fabric in the traffic
flows' ingress and egress directions, respectively.
Traffic from all DBs shares all the MBs in GreenRouter,
so it can be aggregated onto a few active MBs on demand
and the other MBs can be shut down to save power. Several
key issues in this new architecture are addressed. We
evaluate the power saving efficiency and give
preliminary simulation results. GreenRouter adapts well
to traffic fluctuations, and real-trace evaluations over
one week show that up to 63.7\% power saving can be
achieved while QoS constraints are guaranteed.",
acknowledgement = ack-nhfb,
affiliation = "Liu, B (Reprint Author), Tsinghua Univ, Dept Comp Sci
\& Technol, Beijing 100084, Peoples R China. Kai, Yi;
Wang, Yi; Liu, Bin, Tsinghua Univ, Dept Comp Sci \&
Technol, Beijing 100084, Peoples R China.",
author-email = "kaiyi02@gmail.com pig020623@gmail.com
lmyujie@gmail.com",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSFC [61073171]; Tsinghua University
Initiative Scientific Research Program [20121080068];
Specialized Research Fund for the Doctoral Program of
Higher Education of China [20100002110051]",
funding-text = "This work is supported by NSFC (61073171), Tsinghua
University Initiative Scientific Research Program
(20121080068), Specialized Research Fund for the
Doctoral Program of Higher Education of China
(20100002110051).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; DB; Energy efficiency;
energy-aware system; green computing; Green design;
GreenRouter; High-speed networks; Internet; line-card;
low power design; MB; network interface card; packet
processing card; power reduction; power saving
efficiency; QoS constraints; router; router
architecture innovation; Routers; telecommunication
network routing; Telecommunication traffic;
telecommunication traffic; traffic flow egress
direction; traffic flow ingress direction; traffic
fluctuation; two-stage switch fabric",
number-of-cited-references = "6",
ORCID-numbers = "Wang, Yi/0000-0002-9095-6879",
research-areas = "Computer Science",
researcherid-numbers = "Wang, Yi/A-8884-2015",
times-cited = "1",
unique-id = "Kai:2013:GRP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Joo:2013:HPS,
author = "Yongsoo Joo and Sangsoo Park",
title = "A Hybrid {PRAM} and {STT--RAM} Cache Architecture for
Extending the Lifetime of {PRAM} Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "55--58",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "To extend the lifetime of phase change RAM (PRAM)
caches, we propose a hybrid cache architecture that
integrates a relatively small capacity of spin transfer
torque RAM (STT--RAM) write buffer with a PRAM cache.
Our hybrid cache improves the endurance limitation of
the PRAM cache by judiciously redirecting the write
traffic from an upper memory layer to the STT--RAM
write buffer. We have demonstrated through simulation
that the proposed hybrid cache outperforms existing
write-traffic reduction schemes with the same area
overhead. Moreover, our approach is orthogonal to the
existing schemes, providing an effective way of
investing die area for cache lifetime extension by
being used in combination with them.",
acknowledgement = ack-nhfb,
affiliation = "Joo, Y (Reprint Author), Ewha Womans Univ, Dept Comp
Sci \& Engn, Seoul 120750, South Korea. Joo, Yongsoo;
Park, Sangsoo, Ewha Womans Univ, Dept Comp Sci \& Engn,
Seoul 120750, South Korea.",
author-email = "ysjoo@ewha.ac.kr sangsoo.park@ewha.ac.kr",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Ewha Womans University",
funding-text = "We thank Guangyu Sun and Cong Xu for their helpful
comments on NVRAM characteristics. This research was
supported by RP-Grant 2010 of Ewha Womans University.
Sangsoo Park is the corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache; cache lifetime extension; Cache memories; Cache
storage; cache storage; Computer architecture;
concurrency theory; Design Styles; endurance; Fault
tolerance; Hardware; hybrid cache architecture; hybrid
PRAM caches; investing die area; lifetime; memory
layer; Memory Structures; phase change memories; phase
change RAM; PRAM; Random access memory; Redundancy;
Redundant design; Reliability; spin transfer torque
RAM; STT RAM cache architecture; STT RAM write buffer;
STT--RAM; Testing and Fault-Tolerance; write traffic
reduction schemes",
number-of-cited-references = "14",
research-areas = "Computer Science",
times-cited = "10",
unique-id = "Joo:2013:HPS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Blem:2013:MMA,
author = "Emily Blem and Hadi Esmaeilzadeh and Renee St Amant
and Karthikeyan Sankaralingam and Doug Burger",
title = "Multicore Model from Abstract Single Core Inputs",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "59--62",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper describes a first order multicore model to
project a tighter upper bound on performance than
previous Amdahl's Law based approaches. The speedup
over a known baseline is a function of the core
performance, microarchitectural features, application
parameters, chip organization, and multicore topology.
The model is flexible enough to consider both CPU and
GPU like organizations as well as modern topologies
from symmetric to aggressive heterogeneous (asymmetric,
dynamic, and fused) designs. This extended model
incorporates first-order effects, exposing more
bottlenecks than previous applications of Amdahl's
Law, while remaining simple and flexible enough to be
adapted for many applications.",
acknowledgement = ack-nhfb,
affiliation = "Blem, E (Reprint Author), Univ Wisconsin, Madison, WI
53706 USA. Blem, Emily; Sankaralingam, Karthikeyan,
Univ Wisconsin, Madison, WI 53706 USA. Esmaeilzadeh,
Hadi, Univ Washington, Seattle, WA 98195 USA. St Amant,
Renee, Univ Texas Austin, Austin, TX 78712 USA.",
author-email = "blem@cs.wisc.edu hadianeh@cs.washington.edu
stamant@cs.utexas.edu karu@cs.wisc.edu
dburger@microsoft.com",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "abstract single core inputs; aggressive heterogeneous
designs; Amdahl law based approach; application
parameters; chip organization; Computer Systems
Organization; CPU like organizations; first order
multicore model; General; GPU like organizations;
graphics processing units; microarchitectural features;
Modeling of computer architecture; multicore topology;
multicores; Multiple Data Stream Architectures
(Multiprocessors); multiprocessing systems; network
topology; parallelism; performance evaluation;
Performance modeling; Processor Architectures",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Blem:2013:MMA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
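For orientation alongside the entry above: the letter extends Amdahl's-Law-style reasoning with core, application, and topology parameters, and that extended model is not reproduced here. The sketch below shows only the classical symmetric-multicore Amdahl bound that such models refine; the function name and the 95\% parallel-fraction example are assumptions made for illustration, not results from Blem et al.

    # Baseline Amdahl's-Law speedup for a symmetric multicore, shown only as
    # the simple bound that richer first-order models refine (this is NOT the
    # extended model described in the letter).

    def amdahl_speedup(parallel_fraction, cores):
        """Upper bound on speedup when a fraction of the work is parallelizable."""
        serial = 1.0 - parallel_fraction
        return 1.0 / (serial + parallel_fraction / cores)

    if __name__ == "__main__":
        for n in (1, 4, 16, 64, 256):
            print(n, round(amdahl_speedup(0.95, n), 2))
        # With 95% parallel work the bound saturates near 1/0.05 = 20x, which
        # is why first-order models that expose additional bottlenecks (memory,
        # topology, heterogeneity) give tighter projections.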
@Article{Michaud:2013:DMT,
author = "Pierre Michaud",
title = "Demystifying Multicore Throughput Metrics",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "63--66",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Several different metrics have been proposed for
quantifying the throughput of multicore processors.
There is no clear consensus about which metric should
be used. Some studies even use several throughput
metrics. We show that there exists a relation between
single-thread average performance metrics and
throughput metrics, and that throughput metrics inherit
the meaning or lack of meaning of the corresponding
single-thread metric. We show that two popular
throughput metrics, the weighted speedup and the
harmonic mean of speedups, are inconsistent: they do
not give equal importance to all benchmarks. Moreover
we demonstrate that the weighted speedup favors
unfairness. We show that the harmonic mean of IPCs, a
seldom used throughput metric, is actually consistent
and has a physical meaning. We explain under which
conditions the arithmetic mean or the harmonic mean of
IPCs can be used as a strong indicator of throughput
increase.",
acknowledgement = ack-nhfb,
affiliation = "Michaud, P (Reprint Author), INRIA Rennes, Rennes,
France. INRIA Rennes, Rennes, France.",
author-email = "Pierre.Michaud@inria.fr",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; Computer Systems Organization;
evaluation; Measurement; Modeling; modeling;
Multi-core/single-chip multiprocessors; Multicore
processing; multicore processors; multicore throughput;
multicore throughput metrics; multiprocessing systems;
Parallel Architectures; Parallel architectures;
Performance evaluation; performance metric; Performance
of Systems; Processor Architectures; Program
processors; simulation of multiple-processor systems;
single thread metric; software metrics",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Michaud:2013:DMT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
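As a quick reference for the metrics compared in the entry above, the sketch below uses the formulations of weighted speedup, harmonic mean of speedups, and harmonic/arithmetic mean of IPCs that are common in the literature; the letter's own notation and arguments may differ, and the variable names and sample numbers are assumptions for illustration only.

    # Common formulations of multicore throughput metrics. ipc_multi[i] is
    # benchmark i's IPC when co-running; ipc_single[i] is its IPC when running
    # alone on the same machine.

    def weighted_speedup(ipc_multi, ipc_single):
        return sum(m / s for m, s in zip(ipc_multi, ipc_single))

    def harmonic_mean_of_speedups(ipc_multi, ipc_single):
        n = len(ipc_multi)
        return n / sum(s / m for m, s in zip(ipc_multi, ipc_single))

    def harmonic_mean_of_ipcs(ipc_multi):
        n = len(ipc_multi)
        return n / sum(1.0 / m for m in ipc_multi)

    def arithmetic_mean_of_ipcs(ipc_multi):
        return sum(ipc_multi) / len(ipc_multi)

    if __name__ == "__main__":
        multi = [0.8, 0.5, 1.2, 0.3]     # co-running IPCs (made-up numbers)
        single = [1.0, 1.0, 2.0, 0.5]    # solo IPCs (made-up numbers)
        print(round(weighted_speedup(multi, single), 3))
        print(round(harmonic_mean_of_speedups(multi, single), 3))
        print(round(harmonic_mean_of_ipcs(multi), 3))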
@Article{Tembey:2013:SSS,
author = "Priyanka Tembey and Augusto Vega and Alper
Buyuktosunoglu and Dilma {Da Silva} and Pradip Bose",
title = "{SMT} Switch: Software Mechanisms for Power Shifting",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "67--70",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Simultaneous multithreading (SMT) as a processor
design to achieve higher levels of system and
application throughput is a well-accepted and deployed
technique in most desktop and server processors. We
study the power implications of varying SMT levels,
i.e., thread counts per core, for various multi-threaded
applications on a real SMT multicore platform, and
introduce a novel software mechanism of changing SMT
level of a core to tune platform power. Power-shifting
policies by varying per core SMT levels for performance
benefits within a power cap are introduced. Projected
power savings (of 15\%) for a streaming parallel
benchmark can be attained using SMT-level power
shifting mechanisms.",
acknowledgement = ack-nhfb,
affiliation = "Tembey, P (Reprint Author), Georgia Tech, Atlanta, GA
30332 USA. Tembey, Priyanka, Georgia Tech, Atlanta, GA
30332 USA.",
da = "2019-06-20",
doc-delivery-number = "279CD",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application throughput; Computer architecture;
Computer Systems Organization; Hardware;
multi-threading; Multicore platforms; multiprocessing
systems; Multithreaded processors; Multithreading;
Operating Systems; Other Architecture Styles; Parallel
processing; power aware computing; Power Management;
Power shifting; Power system management; Process
Management; Processor Architectures; processor design;
Program processors; Scheduling; simultaneous
multithreading; SMT; SMT multicore platform; SMT
switch; SMT-level power shifting mechanism; Software
engineering; software mechanisms; Software/Software
Engineering; streaming parallel benchmark; tune
platform power",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Tembey:2013:SSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2013:IOAb,
author = "Anonymous",
title = "{IEEE} Open Access Publishing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "71--71",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.33",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:SCI,
author = "Anonymous",
title = "Stay Connected to the {IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "72--72",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.34",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:BC,
author = "Anonymous",
title = "[{Back} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c4--c4",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:BIC,
author = "Anonymous",
title = "[{Back} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c3--c3",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:FC,
author = "Anonymous",
title = "[{Front} cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c1--c1",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2013:FIC,
author = "Anonymous",
title = "[{Front} inside cover]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "12",
number = "2",
pages = "c2--c2",
month = jul # "\slash " # dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Arelakis:2014:CVA,
author = "Angelos Arelakis and Per Stenstr{\"o}m",
title = "A Case for a Value-Aware Cache",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.31",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Replication of values causes poor utilization of
on-chip cache memory resources. This paper addresses
the question: How much cache resources can be
theoretically and practically saved if value
replication is eliminated? We introduce the concept of
value-aware caches and show that a sixteen times
smaller value-aware cache can yield the same miss rate
as a conventional cache. We then make a case for a
value-aware cache design using Huffman-based
compression. Since the value set is rather stable
across the execution of an application, one can afford
to reconstruct the coding tree in software. The
decompression latency is kept short by our proposed
novel pipelined Huffman decoder that uses canonical
codewords. While the (loose) upper-bound compression
factor is 5.2X, we show that, by eliminating
cache-block alignment restrictions, it is possible to
achieve a compression factor of 3.4X for practical
designs.",
acknowledgement = ack-nhfb,
affiliation = "Arelakis, A (Reprint Author), Chalmers, Gothenburg,
Sweden. Arelakis, Angelos; Stenstrom, Per, Chalmers,
Gothenburg, Sweden.",
author-email = "angelos@chalmers.se per.stenstrom@chalmers.se",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Swedish Research Council",
funding-text = "This research is supported by the Swedish Research
Council. The simulations ran on the resources provided
by the Swedish National Infrastructure for Computing
(SNIC) at C3SE.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.b Cache memories; cache storage;
cache-block alignment restriction elimination; Clocks;
coding tree reconstruction; data compression; data
handling; Decoding; decompression latency; E Data; E.4
Coding and Information Theory; E.4.a Data compaction
and compression; Engines; Huffman codes; Huffman
coding; Huffman-based compression; Indexes; on-chip
cache memory resources; System-on-a-chip; tree codes;
value replication; value-aware cache design",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Arelakis:2014:CVA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
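To give a feel for why the value replication discussed in the entry above compresses well, the following toy sketch builds Huffman code lengths over a set of cache words and estimates a compression factor. It is only an illustration of frequency-based value coding; it is not the canonical, pipelined Huffman decoder of Arelakis and Stenstrom, it ignores code-table overhead, and the word width and sample values are assumptions.

    # Toy sketch of Huffman-based value compression over cache words.

    import heapq
    from collections import Counter

    def huffman_code_lengths(values):
        """Return {value: code length in bits} for a Huffman code over 'values'."""
        freq = Counter(values)
        if len(freq) == 1:                      # degenerate single-symbol case
            return {next(iter(freq)): 1}
        # Heap entries: (weight, tie_breaker, {value: depth_so_far}).
        heap = [(w, i, {v: 0}) for i, (v, w) in enumerate(freq.items())]
        heapq.heapify(heap)
        counter = len(heap)
        while len(heap) > 1:
            w1, _, d1 = heapq.heappop(heap)
            w2, _, d2 = heapq.heappop(heap)
            merged = {v: d + 1 for v, d in d1.items()}
            merged.update({v: d + 1 for v, d in d2.items()})
            heapq.heappush(heap, (w1 + w2, counter, merged))
            counter += 1
        return heap[0][2]

    def compression_factor(values, word_bits=32):
        """Uncompressed bits / compressed bits, ignoring code-table overhead."""
        lengths = huffman_code_lengths(values)
        freq = Counter(values)
        compressed_bits = sum(freq[v] * lengths[v] for v in freq)
        return (len(values) * word_bits) / compressed_bits

    if __name__ == "__main__":
        # A heavily replicated value set, as is typical of cached data.
        words = [0] * 60 + [0xFFFFFFFF] * 20 + list(range(1, 21))
        print(round(compression_factor(words), 2))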
@Article{Chen:2014:PEC,
author = "Zheng Chen and Huaxi Gu and Yintang Yang and Luying
Bai and Hui Li",
title = "A Power Efficient and Compact Optical Interconnect for
Network-on-Chip",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.5",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Optical interconnect is a promising alternative to
electrical interconnect for intra-chip
communications. The topology of an optical Network-on-Chip
(ONoC) has a great impact on the network performance.
However, the size of an ONoC is limited by the power
consumption and crosstalk noise, which mainly
result from the waveguide crossings in the topology.
In this paper, a diagonal Mesh topology (DMesh) is
proposed to relieve this scalability limitation by
reducing the number of waveguide crossings, which is
only 20\% of that in Mesh. In addition, the number of
optical routers in DMesh is less than half of that in
Mesh-based ONoC. Due to its compact architecture and
favorable scalability, DMesh topology is suitable for
large-scale ONoC design.",
acknowledgement = ack-nhfb,
affiliation = "Chen, Z (Reprint Author), Xidian Univ Xian, State Key
Lab Integrated Serv Networks, Xian, Peoples R China.
Chen, Zheng; Gu, Huaxi; Bai, Luying; Li, Hui, Xidian
Univ Xian, State Key Lab Integrated Serv Networks,
Xian, Peoples R China. Yang, Yintang, Xidian Univ Xian,
Inst Microelect, Xian, Peoples R China.",
author-email = "chenzheng8331@stu.xidian.edu.cn hxgu@xidian.edu.cn
ytyang@xidian.edu.cn",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation of China
[61070046, 60803038]; State Key Lab [ISN1104001];
Fundamental Research Funds for the Central Universities
[K5051301003]; 111 Project [B08038]",
funding-text = "This work is supported by the National Science
Foundation of China Grant No. 61070046 and 60803038,
the special fund from State Key Lab Grant No.
ISN1104001, the Fundamental Research Funds for the
Central Universities Grant No. K5051301003, the 111
Project Grant No. B08038.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "compact optical interconnect; crosstalk noise;
diagonal mesh topology; DMesh topology; integrated
optoelectronics; intra-chip communications; large-scale
ONoC design; mesh-based ONoC; multiprocessors; network
performance; Network topology; network-on-chip; optical
interconnections; Optical interconnects; optical
network-on-chip; optical router; Optical routers;
optical routers; power consumption; power efficient
interconnect; Topology; topology; Topology; waveguide
crossings; wavelength division multiplexing; Wavelength
division multiplexing; wavelength division
multiplexing",
number-of-cited-references = "9",
ORCID-numbers = "Gu, Huaxi/0000-0002-6409-2229",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Chen:2014:PEC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Cota:2014:AMR,
author = "Emilio G. Cota and Paolo Mantovani and Michele
Petracca and Mario R. Casu and Luca P. Carloni",
title = "Accelerator Memory Reuse in the Dark Silicon Era",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.29",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Accelerators integrated on-die with General-Purpose
CPUs (GP-CPUs) can yield significant performance and
power improvements. Their extensive use, however, is
ultimately limited by their area overhead; due to their
high degree of specialization, the opportunity cost of
investing die real estate on accelerators can become
prohibitive, especially for general-purpose
architectures. In this paper we present a novel
technique aimed at mitigating this opportunity cost by
allowing GP-CPU cores to reuse accelerator memory as a
non-uniform cache architecture (NUCA) substrate. On a
system with a last level-2 cache of 128kB, our
technique achieves on average a 25\% performance
improvement when reusing four 512 kB accelerator memory
blocks to form a level-3 cache. Making these blocks
reusable as NUCA slices incurs on average a 1.89\%
area overhead with respect to equally-sized ad hoc
cache slices.",
acknowledgement = ack-nhfb,
affiliation = "Cota, EG (Reprint Author), Columbia Univ, New York, NY
10027 USA. Cota, Emilio G.; Mantovani, Paolo; Carloni,
Luca P., Columbia Univ, New York, NY 10027 USA.
Petracca, Michele, Cadence Design Syst Inc, San Jose,
CA USA. Casu, Mario R., Politecn Torino, Turin,
Italy.",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [1018236,
1219001]; ONR Young Investigator Award; Gigascale
Systems Research Center; Focus Center Research Program
(FCRP), a Semiconductor Research Corporation entity",
funding-text = "This research is partially supported by the National
Science Foundation under Awards \#: 1018236 and
1219001, an ONR Young Investigator Award, and the
Gigascale Systems Research Center, one of six research
centers funded under the Focus Center Research Program
(FCRP), a Semiconductor Research Corporation entity.
The authors thank John Demme and the anonymous
reviewers for their insightful comments.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; accelerator architectures; Accelerator
architectures; accelerator architectures; accelerator
memory reuse; cache formation; Cache memory; cache
slice; cache storage; dark silicon era; general purpose
CPU; general-purpose architecture; GP-CPU; Memory
management; nonuniform cache architecture; NUCA
substrate; Power demand; Silicon; Transform coding",
keywords-plus = "CACHES",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Cota:2014:AMR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Chou:2014:EPE,
author = "Yu-Liang Chou and Shaoshan Liu and Eui-Young Chung and
Jean-Luc Gaudiot",
title = "An Energy and Performance Efficient {DVFS} Scheme for
Irregular Parallel Divide-and-Conquer Algorithms on the
{Intel SCC}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.1",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The divide-and-conquer paradigm can be used to express
many computationally significant problems, but an
important subset of these applications is inherently
load-imbalanced. Load balancing is a challenge for
irregular parallel divide-and-conquer algorithms and
efficiently solving these applications will be a key
requirement for future many-core systems. To address
the load imbalance issue, instead of attempting to
dynamically balance the workloads, this paper
proposes an energy and performance efficient Dynamic
Voltage and Frequency Scaling (DVFS) scheduling scheme,
which takes into account the load imbalance behavior
exhibited by these applications. More specifically, we
examine the core of the divide-and-conquer paradigm and
determine that the base-case-reached point where
recursion stops is a suitable place in a
divide-and-conquer paradigm to apply the proposed DVFS
scheme. To evaluate the proposed scheme, we implement
four representative irregular parallel
divide-and-conquer algorithms, tree traversal,
quicksort, finding primes, and n-queens puzzle, on the
Intel Single-chip Cloud Computer (SCC) many-core
machine. We demonstrate that, on average, the proposed
scheme can improve performance by 41\% while reducing
energy consumption by 36\% compared to the baseline
running the whole computation with the default
frequency configuration (400MHz).",
acknowledgement = ack-nhfb,
affiliation = "Chou, YL (Reprint Author), Univ Calif Irvine, Irvine,
CA 92697 USA. Chou, Yu-Liang; Gaudiot, Jean-Luc, Univ
Calif Irvine, Irvine, CA 92697 USA. Liu, Shaoshan,
Microsoft Corp, Redmond, WA 98052 USA. Chung,
Eui-Young, Yonsei Univ, Seoul 120749, South Korea.",
author-email = "d943010010@gmail.com shaoliu@microsoft.com
eychung@yonsei.ac.kr gaudiot@uci.edu",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation
[CCF-1065448]; National Research Foundation of Korea
(NRF) [2012S1A2A1A01031420]; Ministry of Education,
Science and Technology [2012-047670]; National Science
Council [NSC 101-2917-I-564-079]",
funding-text = "This work is partly supported by the US National
Science Foundation under Grant No. CCF-1065448, by the
National Research Foundation of Korea (NRF) under Grant
No. 2012S1A2A1A01031420, by the Ministry of Education,
Science and Technology under Grant No. 2012-047670, and
by the National Science Council under Grant No. NSC
101-2917-I-564-079. Any opinions, findings, and
conclusions expressed in this material are those of the
authors and do not necessarily reflect the views of
these sponsors.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "base-case-reached point; D Software/Software
Engineering; D.4 Operating Systems; D.4 Operating
Systems < D.4.7 Organization and Design; D.4.7.b
Distributed systems; D.4.7.f Parallel systems; D.4.8
Performance < D.4.8.a Measurements < Distributed
processing; divide and conquer methods;
Divide-and-conquer; DVFS; dynamic voltage and frequency
scaling; energy conservation; energy consumption
reduction; energy efficient DVFS scheme; finding
primes; frequency 400 MHz; Intel SCC; Intel single-chip
cloud computer; irregular parallel divide-and-conquer
algorithms; Load Imbalance; load imbalance behavior;
many-core machine; microprocessor chips;
multiprocessing systems; n-queens puzzle; Operating
systems; parallel algorithms; Parallel processing;
performance efficient DVFS scheme; Performance
evaluation; power aware computing; processor
scheduling; quicksort; recursion stops; resource
allocation; Software engineering; tree traversal",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Chou:2014:EPE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
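
The scheduling idea above can be pictured with a short C sketch: run the
divide phase at the default operating point and request a lower one once the
base-case-reached point is hit. The frequency-control call below is a
hypothetical stub (the letter targets the Intel SCC, whose API is not
reproduced here), and the cutoff and frequencies are illustrative.

/* Minimal sketch of lowering DVFS at the base-case-reached point of a
 * recursive divide-and-conquer kernel. set_core_frequency_mhz() is a
 * hypothetical placeholder for a platform-specific DVFS call; it is not
 * the authors' code. */
#include <stdio.h>

#define BASE_CASE 32                   /* assumed recursion cutoff */
static int requested_mhz = 800;        /* last frequency requested */

static void set_core_frequency_mhz(int mhz)
{
    requested_mhz = mhz;               /* real code would program the V/f island */
}

static long conquer(int n)
{
    if (n <= BASE_CASE) {
        set_core_frequency_mhz(400);   /* base case reached: slow down */
        long s = 0;
        for (int i = 0; i < n; i++)
            s += i;
        return s;
    }
    return conquer(n / 2) + conquer(n - n / 2);   /* divide phase */
}

int main(void)
{
    set_core_frequency_mhz(800);       /* start at the default operating point */
    long r = conquer(1 << 12);
    printf("result=%ld, last DVFS request=%d MHz\n", r, requested_mhz);
    return 0;
}
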
@Article{Rotem:2014:BUI,
author = "Nadav Rotem and Yosi {Ben Asher}",
title = "Block Unification {IF}-conversion for High Performance
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.28",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Graphics Processing Units accelerate data-parallel
graphic calculations using wide SIMD vector units.
Compiling programs to use the GPU's SIMD architectures
requires converting multiple control flow paths into a
single stream of instructions. IF-conversion is a
compiler transformation that converts control
dependencies into data dependencies; it is used by
vectorizing compilers to eliminate control flow and
enable efficient code generation. In this work we
enhance the IF-conversion transformation by using a
block unification method to improve the currently used
block flattening method. Our experimental results
demonstrate that our IF-conversion method is effective
in reducing the number of predicated instructions and
in boosting kernel execution speed.",
acknowledgement = ack-nhfb,
affiliation = "Rotem, N (Reprint Author), Univ Haifa, Dept Comp Sci,
IL-31999 Haifa, Israel. Rotem, Nadav; Ben Asher, Yosi,
Univ Haifa, Dept Comp Sci, IL-31999 Haifa, Israel.",
author-email = "rotemn@cs.haifa.ac.il yosi@cs.haifa.ac.il",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "block flattening method; block unification
IF-conversion; block unification method; code
generation; Code generation; compiler transformation;
Compilers; Computer architecture; data-parallel graphic
calculations; GPU SIMD architectures; Graphics
processing unit; graphics processing units; high
performance architectures; Kernel; Merging; multiple
control flow paths; parallel processing; Processors;
program compilers; Programming Languages; Registers;
Software/Software Engineering; vectorizing compilers;
Vectors; wide SIMD vector units",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Rotem:2014:BUI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
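
For readers unfamiliar with the underlying transformation, a minimal
hand-written C sketch of classic IF-conversion is shown below; the letter's
block unification method refines how converted blocks are merged, which this
sketch does not attempt to reproduce.

/* Hand-written illustration of IF-conversion, not compiler output. */
#include <stdio.h>

/* Before: divergent control flow, which serializes SIMD/SIMT lanes. */
static int with_branch(int x, int a, int b)
{
    int r;
    if (x > 0)
        r = a + b;
    else
        r = a - b;
    return r;
}

/* After IF-conversion: both paths become straight-line code guarded by a
 * predicate, and a select picks the live result. */
static int if_converted(int x, int a, int b)
{
    int p = (x > 0);          /* predicate                  */
    int t = a + b;            /* then-path, always computed */
    int f = a - b;            /* else-path, always computed */
    return p ? t : f;         /* select instead of a branch */
}

int main(void)
{
    printf("%d %d\n", with_branch(1, 5, 3), if_converted(1, 5, 3));
    printf("%d %d\n", with_branch(-1, 5, 3), if_converted(-1, 5, 3));
    return 0;
}
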
@Article{Ilic:2014:CAR,
author = "Aleksandar Ilic and Frederico Pratas and Leonel
Sousa",
title = "Cache-aware Roofline model: Upgrading the loft",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "21--24",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.6",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The Roofline model graphically represents the
attainable upper bound performance of a computer
architecture. This paper analyzes the original Roofline
model and proposes a novel approach to provide a more
insightful performance modeling of modern architectures
by introducing cache-awareness, thus significantly
improving the guidelines for application optimization.
The proposed model was experimentally verified for
different architectures by taking advantage of built-in
hardware counters with a curve fitness above 90\%.",
acknowledgement = ack-nhfb,
affiliation = "Ilic, A (Reprint Author), Univ Tecn Lisboa, INESC ID
IST, Lisbon, Portugal. Ilic, Aleksandar; Pratas,
Frederico; Sousa, Leonel, Univ Tecn Lisboa, INESC ID
IST, Lisbon, Portugal.",
author-email = "ilic@inesc-id.pt fcpp@inesc-id.pt las@inesc-id.pt",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "national funds through FCT (Fundacao para a
Ciencia e a Tecnologia) [PTDC/EEI-ELC/3152/2012,
PEst-OE/EEI/LA0021/2011, PTDC/EEA-ELC/117329/2010]; FCT
[SFRH/BPD/87734/2012]",
funding-text = "This work was supported by national funds through FCT
(Fundacao para a Ciencia e a Tecnologia), under
projects PTDC/EEI-ELC/3152/2012,
PEst-OE/EEI/LA0021/2011, and PTDC/EEA-ELC/117329/2010.
F. Pratas also acknowledges the FCT scholarship
SFRH/BPD/87734/2012.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Application optimization; application optimization;
Application optimization; built-in hardware counters;
C.0.d Modeling of computer architecture < C.0 General <
C Computer Systems Organization; C.0.e System
architectures; C.4.d Modeling techniques < C.4
Performance of Systems < C Computer Systems
Organization; C.4.g Measurement; cache storage;
cache-aware Roofline model; cache-awareness; computer
architecture; computer architecture upper bound
performance; curve fitness; evaluation; integration and
modeling < C.0 General < C Computer Systems
Organization; Modeling; modeling; Multicore computer
architectures; Multiprocessing systems; multiprocessing
systems; Performance evaluation; Performance modeling;
Simulation; simulation of multiple-processor systems <
C.4 Performance of Systems < C Computer Syst",
number-of-cited-references = "10",
ORCID-numbers = "Ilic, Aleksandar/0000-0002-8594-3539 Sousa,
Leonel/0000-0002-8066-221X",
research-areas = "Computer Science",
researcherid-numbers = "Ilic, Aleksandar/L-1943-2014 Sousa,
Leonel/B-2749-2009",
times-cited = "24",
unique-id = "Ilic:2014:CAR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
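
For orientation, the original Roofline bound that the letter upgrades is
commonly written as

    F_a(I) = \min\bigl( F_{\mathrm{peak}},\; B \cdot I \bigr),
    \qquad I = \frac{\mathrm{flops}}{\mathrm{bytes\ transferred}},

where $F_a(I)$ is the attainable performance, $F_{\mathrm{peak}}$ the peak
compute throughput, and $B$ the peak memory bandwidth; the cache-aware model
described in the abstract refines the memory term so that it reflects the
full cache hierarchy rather than DRAM bandwidth alone.
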
@Article{Efraim:2014:EAR,
author = "Rotem Efraim and Ran Ginosar and C. Weiser and Avi
Mendelson",
title = "Energy Aware Race to Halt: a Down to {EARtH} Approach
for Platform Energy Management",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "25--28",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.32",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The EARtH algorithm finds the optimal voltage and
frequency operational point of the processor in order
to achieve minimum energy of the computing platform.
The algorithm is based on a theoretical model employing
a small number of parameters, which are extracted from
real systems using off-line and run-time methods. The
model and algorithm have been validated on real systems
using 45 nm, 32 nm, and 22 nm Intel(R) Core processors.
The algorithm can save up to 44\% energy compared with
the commonly used fixed frequency policies.",
acknowledgement = ack-nhfb,
affiliation = "Efraim, R (Reprint Author), Intel Corp, Santa Clara,
CA 95051 USA. Efraim, Rotem, Intel Corp, Santa Clara,
CA 95051 USA. Ginosar, Ran; Weiser, C.; Mendelson, Avi,
Technion Israeli Inst Technol, Haifa, Israel.",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; B Hardware; B.9 Power
Management; B.9.2 Energy-aware systems; C Computer
Systems Organization; C.4 Performance of Systems; C.5
Computer System Implementation; C.5.4 VLSI Systems;
C.5.5 Servers; Computational modeling; Earth; EARtH
algorithm; energy aware race to halt; Energy
management; Energy measurement; fixed frequency
policies; Frequency measurement; frequency operational
point; Heterogeneous cores; Intel core processors;
microprocessor chips; off-line methods; optimal
voltage; platform energy management; power aware
computing; Power Management; run-time methods; size 22
nm; size 32 nm; size 45 nm; Voltage measurement",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Efraim:2014:EAR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
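
The race-to-halt tradeoff behind the abstract can be illustrated with a
generic energy model (an assumption of this sketch, not the EARtH model):
run time has a frequency-scalable part and a memory-bound part, while power
has an active part scaling roughly with f*V^2 plus a constant platform part.
Sweeping a few operating points then shows that the minimum-energy point can
sit between the slowest and the fastest setting; all constants below are
illustrative.

#include <stdio.h>
#include <math.h>

int main(void)
{
    const double freqs[] = {0.8e9, 1.2e9, 1.6e9, 2.0e9, 2.4e9}; /* Hz */
    const double cycles = 2.0e9;      /* frequency-scalable work, cycles */
    const double t_mem  = 0.3;        /* memory-bound time, seconds      */
    const double p_plat = 2.0;        /* constant platform power, W      */
    double best_e = INFINITY, best_f = 0.0;

    for (int i = 0; i < 5; i++) {
        double f = freqs[i];
        double v = 0.6 + 0.3 * (f / 2.4e9);        /* assumed V-f relation */
        double t = cycles / f + t_mem;             /* run time             */
        double p_core = 1.5e-9 * f * v * v;        /* ~ C * f * V^2        */
        double e = (p_core + p_plat) * t;          /* platform energy      */
        if (e < best_e) { best_e = e; best_f = f; }
        printf("f=%.1f GHz  E=%.3f J\n", f / 1e9, e);
    }
    printf("minimum-energy point: %.1f GHz (%.3f J)\n", best_f / 1e9, best_e);
    return 0;
}
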
@Article{Cakmakci:2014:EVA,
author = "Yaman {\c{C}}akmak{\c{c}}i and O{\u{g}}uz Ergin",
title = "Exploiting Virtual Addressing for Increasing
Reliability",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "29--32",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.2",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A novel method to protect a system against errors
resulting from soft errors occurring in the virtual
address (VA) storing structures such as translation
lookaside buffers (TLB), physical register file (PRF)
and the program counter (PC) is proposed in this paper.
The work is motivated by showing how soft errors impact
the structures that store virtual page numbers (VPN). A
solution is proposed by employing linear block encoding
methods to be used as a virtual addressing scheme at
link time. Using the encoding scheme to assign VPNs for
VAs, it is shown that the system can tolerate soft
errors using software with the help of the discussed
decoding techniques applied to the page fault handler.
The proposed solution can be used on all of the
architectures using virtually indexed addressing. The
main contribution of this paper is a reduction of the
architectural vulnerability factor (AVF) for the data TLB
by 42.5\%, the instruction TLB by 40.3\%, the PC by
69.2\%, and the PRF by 33.3\%.",
acknowledgement = ack-nhfb,
affiliation = "{\c{C}}akmak{\c{c}}i, Y (Reprint Author), TOBB Univ
Econ \& Technol, Dept Comp Engn, Ankara, Turkey.
{\c{C}}akmak{\c{c}}i, Yaman; Ergin, O{\u{g}}uz, TOBB
Univ Econ \& Technol, Dept Comp Engn, Ankara, Turkey.",
author-email = "ycakmakci@etu.edu.tr oergin@etu.edu.tr",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Scientific and Technological Research
Council of Turkey (TUBITAK) [112E004]",
funding-text = "This work was supported in part by the Scientific and
Technological Research Council of Turkey (TUBITAK)
under Grant 112E004. The work is in the framework of
COST ICT Action 1103 Manufacturable and Dependable
Multicore Architectures at Nanoscale.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AVF; B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.h Virtual memory; B.3.4 Reliability,
Testing and Fault-Tolerance; buffer storage; decoding
techniques; encoding; Fault tolerance; Hardware; linear
block encoding methods; Memory management; page fault
handler; PC; physical register file; PRF; program
counter; soft errors; TLB; translation lookaside
buffers; virtual address storing structures; virtual
addressing; virtual addressing scheme; Virtual memory;
virtual page numbers; virtually indexed addressing;
VPN",
keywords-plus = "SOFT ERRORS",
number-of-cited-references = "10",
ORCID-numbers = "Ergin, O{\u{g}}uz/0000-0003-2701-3787",
research-areas = "Computer Science",
researcherid-numbers = "Ergin, O{\u{g}}uz/E-5717-2010",
times-cited = "1",
unique-id = "Cakmakci:2014:EVA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
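
A minimal C sketch of the general idea of error-tolerant VPN assignment (the
code construction and handler below are hypothetical, not the paper's linear
block code): if allocated virtual page numbers are kept at a Hamming
distance of at least 3 from each other, a page-fault handler can map a
singly flipped VPN back to the unique nearby codeword.

#include <stdio.h>
#include <stdint.h>

static int hamming(uint32_t a, uint32_t b)
{
    uint32_t x = a ^ b;
    int d = 0;
    while (x) { d += x & 1u; x >>= 1; }
    return d;
}

/* Hypothetical allocated VPNs, chosen so any two differ in >= 3 bits. */
static const uint32_t vpn_codewords[] = {0x000, 0x00F, 0x0F0, 0x0FF};

static int correct_vpn(uint32_t observed, uint32_t *fixed)
{
    for (unsigned i = 0; i < sizeof vpn_codewords / sizeof vpn_codewords[0]; i++) {
        if (hamming(observed, vpn_codewords[i]) <= 1) {
            *fixed = vpn_codewords[i];   /* unique because distance >= 3 */
            return 1;
        }
    }
    return 0;                            /* uncorrectable: more than one flip */
}

int main(void)
{
    uint32_t faulty = 0x0F0 ^ (1u << 2); /* single soft error in a stored VPN */
    uint32_t fixed;
    if (correct_vpn(faulty, &fixed))
        printf("recovered VPN 0x%03X from 0x%03X\n", fixed, faulty);
    return 0;
}
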
@Article{Zhu:2014:EWC,
author = "Yuhao Zhu and Aditya Srikanth and Jingwen Leng and
Vijay Janapa Reddi",
title = "Exploiting Webpage Characteristics for
Energy-Efficient Mobile {Web} Browsing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "33--36",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.33",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Web browsing on mobile devices is undoubtedly the
future. However, with the increasing complexity of
webpages, the mobile device's computation capability
and energy consumption become major pitfalls for a
satisfactory user experience. In this paper, we propose
a mechanism to effectively leverage processor frequency
scaling in order to balance the performance and energy
consumption of mobile web browsing. This mechanism
explores the performance and energy tradeoff in webpage
loading, and schedules webpage loading according to the
webpages' characteristics, using different processor
frequencies. The proposed solution achieves 20.3\%
energy saving compared to the performance mode, and
improves webpage loading performance by 37.1\% compared
to the battery saving mode.",
acknowledgement = ack-nhfb,
affiliation = "Zhu, YH (Reprint Author), Univ Texas Austin, Dept
Elect \& Comp Engn, Austin, TX 78712 USA. Zhu, Yuhao;
Srikanth, Aditya; Leng, Jingwen; Reddi, Vijay Janapa,
Univ Texas Austin, Dept Elect \& Comp Engn, Austin, TX
78712 USA.",
author-email = "yzhu@utexas.edu aditya.srik@utexas.edu
jingwen@utexas.edu vj@ece.utexas.edu",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "C Computer Systems Organization; C.2
Communication/Networking and Information Technology;
C.2.8 Mobile Computing; Cascading style sheets; Cutoff;
EDP; Energy; energy conservation; energy consumption;
Energy consumption; energy-efficient mobile Web
browsing; HTML; Internet; Load modeling; Loading;
Market research; Mobile communication; mobile
computing; mobile device computation capability;
Performance; power aware computing; processor frequency
scaling; user experience; Web page characteristics; Web
page loading performance; Webpages",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Zhu:2014:EWC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Morad:2014:GMO,
author = "Amir Morad and Tomer Y. Morad and Leonid Yavits and
Ran Ginosar and Uri Weiser",
title = "Generalized {MultiAmdahl}: Optimization of
Heterogeneous Multi-Accelerator {SoC}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "37--40",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.34",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Consider a workload comprising a consecutive sequence
of program execution segments, where each segment can
either be executed on a general purpose processor or
offloaded to a hardware accelerator. An analytical
optimization framework, based on the MultiAmdahl
framework and Lagrange multipliers, is proposed for
selecting the optimal set of accelerators and for
allocating resources among them under an area constraint. Due to the
practical implementation of accelerators, the optimal
architecture under area constraints may exclude some of
the accelerators. As the fraction of the workload that
can be accelerated decreases, resources (e.g. area) may
shift from accelerators into the general purpose
processor. The framework can be extended in a number of
ways, spanning SoC partitioning, bandwidth, power
distribution, energy, and other constrained
resources.",
acknowledgement = ack-nhfb,
affiliation = "Morad, A (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Morad, Amir; Morad, Tomer Y.; Yavits, Leonid; Ginosar,
Ran; Weiser, Uri, Technion Israel Inst Technol, Dept
Elect Engn, IL-32000 Haifa, Israel.",
author-email = "amirm@tx.technion.ac.il tomerm@tx.technion.ac.il
yavits@tx.technion.ac.il ran@ee.technion.ac.il
uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; analytical optimization framework; Chip
Multiprocessors; general purpose processor; generalized
MultiAmdahl framework; Hardware; hardware accelerator;
heterogeneous multiaccelerator SoC partitioning;
Lagrange multiplier; Mathematical model; Modeling of
computer architecture; MultiAmdahl; Multicore
processing; optimisation; Optimization; power
distribution bandwidth; program execution segment;
resource allocation; Resource management;
System-on-a-chip; system-on-chip",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Morad:2014:GMO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
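
The constrained-allocation structure the abstract describes can be sketched,
in the spirit of the MultiAmdahl formulation and with purely illustrative
symbols, as follows: let $t_i(a_i)$ be the execution time of segment $i$
when its accelerator receives area $a_i$, and let $A$ be the area budget.
Then

    \min_{a_1,\dots,a_n} \sum_i t_i(a_i)
    \quad\text{subject to}\quad \sum_i a_i \le A,
    \qquad
    \mathcal{L} = \sum_i t_i(a_i) + \lambda \Bigl( \sum_i a_i - A \Bigr).

At an interior optimum the condition $\partial t_i / \partial a_i = -\lambda$
equalizes the marginal benefit of area across accelerators; an accelerator
whose marginal benefit never reaches that level receives no area, which
matches the abstract's observation that the optimal design may exclude some
of the accelerators.
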
@Article{Kvatinsky:2014:MBM,
author = "Shahar Kvatinsky and Yuval H. Nacson and Yoav Etsion
and Eby G. Friedman and Avinoam Kolodny and Uri C.
Weiser",
title = "Memristor-Based Multithreading",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "41--44",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.3",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "Switch on Event Multithreading (SoE MT, also known as
coarse-grained MT and block MT) processors run multiple
threads on a pipeline machine, while the pipeline
switches threads on stall events (e.g., cache miss).
The thread switch penalty is determined by the number
of stages in the pipeline that are flushed of in-flight
instructions. In this paper, Continuous Flow
Multithreading (CFMT), a new architecture of SoE MT, is
introduced. In CFMT, a multistate pipeline register
(MPR) holds the microarchitectural state of multiple
different threads within the execution pipeline stages,
where only one thread is active at a time. The MPRs
eliminate the need to flush in-flight instructions and
therefore significantly improve performance. In recent
years, novel memory technologies such as Resistive RAM
(RRAM) and Spin Torque Transfer Magnetoresistive RAM
(STT-MRAM) have been developed. All of these
technologies are nonvolatile, store data as resistance,
and can be described as ``memristors''. Memristors are
power efficient, dense, and fast as compared to
standard memory technologies such as SRAM, DRAM, and
Flash. Memristors therefore provide the opportunity to
place the MPRs physically within the pipeline stages. A
performance analysis comparing CFMT to conventional SoE
MT processors demonstrates up to a 2X performance
improvement, while the operational mechanism, owing to
the use of memristors, remains low power and low
complexity.",
acknowledgement = ack-nhfb,
affiliation = "Kvatinsky, S (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Kvatinsky, Shahar; Etsion, Yoav; Kolodny, Avinoam;
Weiser, Uri C., Technion Israel Inst Technol, Dept
Elect Engn, IL-32000 Haifa, Israel. Etsion, Yoav,
Technion Israel Inst Technol, Dept Comp Sci, IL-32000
Haifa, Israel. Friedman, Eby G., Univ Rochester, Dept
Elect \& Comp Engn, Rochester, NY 14627 USA.",
author-email = "skva@tx.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Hasso Plattner Institute",
funding-text = "This work was supported by the Hasso Plattner
Institute. The authors thank Ravi Patel for his
comments and area overhead estimation and to Nimrod
Wald and Guy Satat for their help in evaluating the
architecture.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.7 Integrated
Circuits; B.7.1 Types and Design Styles; B.7.1.e Memory
technologies; C Computer Systems Organization; C.0
General; C.0.a Emerging technologies; C.0.d Modeling of
computer architecture; CFMT; Computer architecture;
continuous flow multithreading; in-flight instructions;
Integrated circuits; Memory management; memristor;
memristor-based multithreading; memristors; MPR;
multi-threading; multistate pipeline register;
multithreaded processors; Multithreading; novel memory
technologies; phase change memory; random-access
storage; resistive RAM; RRAM; SoE MT
processors; spin torque transfer magnetoresistive RAM;
STT-MRAM; switch on event multithreading
processors; Systems design and analysis",
keywords-plus = "RESISTIVE SWITCHING MEMORIES",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "10",
unique-id = "Kvatinsky:2014:MBM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
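
A toy C model of the multistate-pipeline-register idea (an illustration
only, not the letter's microarchitecture): each stage keeps a latched state
per thread, so a switch-on-event simply selects another thread's stored
state instead of flushing in-flight instructions.

#include <stdio.h>

#define STAGES  5
#define THREADS 4

struct stage_state { int valid; int instr; };   /* per-thread latch per stage */
static struct stage_state mpr[STAGES][THREADS]; /* the multistate registers   */
static int active = 0;                          /* currently active thread    */

static void switch_thread_on_stall(void)
{
    /* Round-robin to another thread; no flush is needed because each
     * thread's in-flight state remains held in the MPRs. */
    active = (active + 1) % THREADS;
    printf("switched to thread %d without flushing %d stages\n",
           active, STAGES);
}

int main(void)
{
    mpr[2][0].valid = 1;
    mpr[2][0].instr = 42;            /* thread 0 has work parked in stage 2   */
    switch_thread_on_stall();        /* e.g. thread 0 missed in the cache     */
    switch_thread_on_stall();
    printf("thread 0 state preserved in stage 2: instr=%d valid=%d\n",
           mpr[2][0].instr, mpr[2][0].valid);
    return 0;
}
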
@Article{Wingbermuehle:2014:OAS,
author = "Joseph G. Wingbermuehle and Ron K. Cytron and Roger D.
Chamberlain",
title = "Optimization of Application-Specific Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "45--48",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.7",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory access times are the primary bottleneck for
many applications today. This ``memory wall'' is due to
the performance disparity between processor cores and
main memory. To address the performance gap, we propose
the use of custom memory subsystems tailored to the
application rather than attempting to optimize the
application for a fixed memory subsystem. Custom
subsystems can take advantage of application-specific
properties as well as memory-specific properties to
improve access times or write-backs given constraints
on size or power.",
acknowledgement = ack-nhfb,
affiliation = "Wingbermuehle, JG (Reprint Author), Washington Univ,
Dept Comp Sci \& Engn, St Louis, MO 63130 USA.
Wingbermuehle, Joseph G.; Cytron, Ron K.; Chamberlain,
Roger D., Washington Univ, Dept Comp Sci \& Engn, St
Louis, MO 63130 USA.",
author-email = "wingbej@wustl.edu cytron@wustl.edu roger@wustl.edu",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "National Science Foundation [CNS-09095368,
CNS-0931693]",
funding-text = "This work is supported by the National Science
Foundation under grants CNS-09095368 and CNS-0931693.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access time improvement; application-specific memory
optimization; B Hardware; B.3 Memory Structures; B.3.2
Design Styles; B.3.3 Performance Analysis and Design
Aids; B.3.3.b Simulation; C Computer Systems
Organization; C.1 Processor Architectures; C.1.5
Micro-architecture implementation considerations;
C.1.5.e Memory hierarchy; cache; cache storage;
Computer architecture; custom memory subsystems; fixed
memory subsystem; Hardware; memory access times; Memory
management; memory wall; memory-specific properties;
Multiprocessing systems; performance disparity;
Performance evaluation; performance gap; processor
cores; write-backs given constraints",
number-of-cited-references = "21",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Wingbermuehle:2014:OAS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Xu:2014:STM,
author = "Yunlong Xu and Rui Wang and Nilanjan Goswami and Tao
Li and Depei Qian",
title = "Software Transactional Memory for {GPU}
Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "49--52",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.4",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "To make applications with dynamic data sharing among
threads benefit from GPU acceleration, we propose a
novel software transactional memory system for GPU
architectures (GPU-STM). The major challenges include
ensuring good scalability with respect to the massively
multithreading of GPUs, and preventing livelocks caused
by the SIMT execution paradigm of GPUs. To this end, we
propose (1) a hierarchical validation technique and (2)
an encounter-time lock-sorting mechanism to deal with
the two challenges, respectively. Evaluation shows that
GPU-STM outperforms coarse-grain locks on GPUs by up to
20x.",
acknowledgement = ack-nhfb,
affiliation = "Xu, YL (Reprint Author), Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Xu, Yunlong; Qian, Depei, Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Wang, Rui; Qian, Depei, Beihang Univ, Sch Engn \& Comp
Sci, Beijing, Peoples R China. Goswami, Nilanjan; Li,
Tao, Univ Florida, ECE Dept, Gainesville, FL USA.",
author-email = "xjtu.ylxu@stu.xjtu.edu.cn rui.wang@jsi.buaa.edu.cn
nil@ufl.edu taoli@ece.ufl.edu depeiq@xjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF of China [61133004, 61128004,
61073011]; 863 Program of China [2012AA010902]",
funding-text = "This work is supported by NSF of China under grant
61133004, 61128004 and 61073011, and 863 Program of
China under grant 2012AA010902.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "dynamic data sharing; encounter-time lock-sorting
mechanism; GPU acceleration; GPU architectures;
GPU-STM; graphics processing units; hierarchical
validation technique; multi-threading; Multicore
processing; multicore processor; Multicore Processors;
multiprocessing systems; Multiprocessing systems;
multithreading; parallel architectures; Parallel
processing; Parallel Programming; parallel programming;
Run-time Environments; Runtime
environment; SIMD processor; SIMD Processors; SIMT
execution paradigm; software transactional memory
system; sorting",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Xu:2014:STM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
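
Encounter-time lock sorting belongs to the family of canonical-order
locking. The short pthreads sketch below shows the underlying idea on a CPU
(it is not the GPU-STM implementation): if every transaction acquires the
locks it has encountered in one global order, here sorted by address,
circular waits cannot form, which is what removes lock-induced
livelock/deadlock among contending threads.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <pthread.h>

static int cmp_lock(const void *a, const void *b)
{
    uintptr_t x = (uintptr_t)*(pthread_mutex_t *const *)a;
    uintptr_t y = (uintptr_t)*(pthread_mutex_t *const *)b;
    return (x > y) - (x < y);
}

static void acquire_in_canonical_order(pthread_mutex_t **locks, size_t n)
{
    qsort(locks, n, sizeof locks[0], cmp_lock);  /* one global order for all */
    for (size_t i = 0; i < n; i++)
        pthread_mutex_lock(locks[i]);
}

int main(void)
{
    static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t *need[] = { &b, &a };        /* locks in encounter order  */

    acquire_in_canonical_order(need, 2);         /* taken in sorted order     */
    puts("locks held in canonical (sorted) order");
    pthread_mutex_unlock(need[1]);
    pthread_mutex_unlock(need[0]);
    return 0;
}
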
@Article{Shim:2014:TMP,
author = "Keun Sup Shim and Mieszko Lis and Omer Khan and
Srinivas Devadas",
title = "Thread Migration Prediction for Distributed Shared
Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "53--56",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2012.30",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Chip-multiprocessors (CMPs) have become the mainstream
parallel architecture in recent years; for scalability
reasons, designs with high core counts tend towards
tiled CMPs with physically distributed shared caches.
This naturally leads to a Non-Uniform Cache Access
(NUCA) design, where on-chip access latencies depend on
the physical distances between requesting cores and
home cores where the data is cached. Improving data
locality is thus key to performance, and several
studies have addressed this problem using data
replication and data migration. In this paper, we
consider another mechanism, hardware-level thread
migration. This approach, we argue, can better exploit
shared data locality for NUCA designs by effectively
replacing multiple round-trip remote cache accesses
with a smaller number of migrations. High migration
costs, however, make it crucial to use thread
migrations judiciously; we therefore propose a novel,
on-line prediction scheme which decides whether to
perform a remote access (as in traditional NUCA
designs) or to perform a thread migration at the
instruction level. For a set of parallel benchmarks,
our thread migration predictor improves the performance
by 24\% on average over the shared-NUCA design that
only uses remote accesses.",
acknowledgement = ack-nhfb,
affiliation = "Shim, KS (Reprint Author), MIT, 77 Massachusetts Ave,
Cambridge, MA 02139 USA. Shim, Keun Sup; Lis, Mieszko;
Devadas, Srinivas, MIT, Cambridge, MA 02139 USA. Khan,
Omer, Univ Connecticut, Storrs, CT USA.",
da = "2019-06-20",
doc-delivery-number = "AT5MU",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; B.3.2 Design
Styles; B.3.2.g Shared memory; Benchmark testing; C
Computer Systems Organization; C.1 Processor
Architectures; C.1.4 Parallel Architectures; Cache
Coherence; cache storage; chip-multiprocessors; CMPs;
Coherence; Computer architecture; Context; core counts;
Data Locality; data locality improvement; data
migration; data replication; Distributed Caches;
hardware-level thread migration prediction; home cores;
Instruction sets; integrated circuit design; mainstream
parallel architecture; microprocessor chips;
multiprocessing systems; nonuniform cache access
design; on-chip access latencies; online prediction
scheme; Parallel Architecture; parallel architectures;
physical distributed shared caches; Protocols;
Registers; requesting cores; shared-NUCA design",
number-of-cited-references = "13",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Shim:2014:TMP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
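
A simplified decision rule in the spirit of the letter's migration predictor
(the table layout and threshold are assumptions of this sketch): per
load/store PC, remember how long the last run of consecutive accesses to the
same remote home core was, and migrate instead of issuing remote accesses
only when that predicted run is long enough to amortize the migration cost.

#include <stdio.h>

#define MIGRATION_THRESHOLD 3   /* assumed: accesses needed to pay for a migration */
#define TABLE_SIZE 256

static unsigned char predicted_run[TABLE_SIZE];   /* indexed by hashed PC */

static int should_migrate(unsigned pc)
{
    return predicted_run[pc % TABLE_SIZE] >= MIGRATION_THRESHOLD;
}

static void train(unsigned pc, unsigned observed_run)
{
    /* Remember the length of the last same-home run that began at this PC. */
    unsigned char *e = &predicted_run[pc % TABLE_SIZE];
    *e = observed_run > 255 ? 255 : (unsigned char)observed_run;
}

int main(void)
{
    train(0x400123, 5);   /* past run of 5 remote accesses to one home core */
    train(0x400200, 1);   /* isolated remote access                         */
    printf("pc 0x400123 -> %s\n",
           should_migrate(0x400123) ? "migrate" : "remote access");
    printf("pc 0x400200 -> %s\n",
           should_migrate(0x400200) ? "migrate" : "remote access");
    return 0;
}
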
@Article{Anonymous:2014:TCa,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C1--C4",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360655",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ITPa,
author = "Anonymous",
title = "{{\booktitle{IEEE Transactions on Pattern Analysis and
Machine Intelligence}} Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C2--C2",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360656",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ITPb,
author = "Anonymous",
title = "{{\booktitle{IEEE Transactions on Pattern Analysis and
Machine Intelligence}}} Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C3--C3",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360657",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "1",
pages = "C4--C4",
month = jan # "\slash " # jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360658",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Lavasani:2014:FBL,
author = "Maysam Lavasani and Hari Angepat and Derek Chiou",
title = "An {FPGA}-based In-Line Accelerator for {Memcached}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "57--60",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.17",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present a method for accelerating server
applications using a hybrid CPU+FPGA architecture and
demonstrate its advantages by accelerating Memcached, a
distributed key-value system. The accelerator,
implemented on the FPGA fabric, processes request
packets directly from the network, avoiding the CPU in
most cases. The accelerator is created by profiling the
application to determine the most commonly executed
trace of basic blocks, which are then extracted. Traces
are executed speculatively within the FPGA. If the
control flow exits the trace prematurely, the side
effects of the computation are rolled back and the
request packet is passed to the CPU. When compared to
the best reported software numbers, the Memcached
accelerator is 9.15x more energy efficient for common
case requests.",
acknowledgement = ack-nhfb,
affiliation = "Lavasani, M (Reprint Author), Univ Texas Austin, Dept
Elect \& Comp Engn, Austin, TX 78712 USA. Lavasani,
Maysam; Angepat, Hari; Chiou, Derek, Univ Texas Austin,
Dept Elect \& Comp Engn, Austin, TX 78712 USA.",
author-email = "maysamlavasani@utexas.edu hangepat@utexas.edu
derek@utexas.edu",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "accelerating server; C.1.3.f Heterogeneous (hybrid)
systems; C.2.4.a Client/server; cache storage;
Client-server systems; Computer architecture; control
flow; distributed key-value system; distributed
processing; field programmable gate arrays; Field
programmable gate arrays; FPGA-based in-line
accelerator; hybrid CPU+FPGA architecture; Hybrid
systems; Memcached accelerator; Program processors;
reconfigurable architectures; request packet; rolled
back; software numbers",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "24",
unique-id = "Lavasani:2014:FBL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
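
The speculative-trace-with-rollback control flow can be mirrored in a small
software analogy (names and the toy request handling are hypothetical; the
letter implements this in the FPGA fabric): the hot trace runs against a
private copy of the state, commits it if control stays on the trace, and
otherwise discards it and defers to the general path.

#include <stdio.h>
#include <string.h>

struct store { char value[32]; };

static struct store committed;                 /* architected state          */

static int hot_trace(const char *key, struct store *specbuf)
{
    /* Common-case handling compiled from the frequent basic blocks; a rare
     * request (here, an over-long key) exits the trace. */
    if (strlen(key) >= sizeof specbuf->value)
        return 0;
    snprintf(specbuf->value, sizeof specbuf->value, "hit:%s", key);
    return 1;                                   /* stayed on the trace        */
}

static void slow_path(const char *key)
{
    snprintf(committed.value, sizeof committed.value, "cpu:%.20s", key);
}

static void handle_request(const char *key)
{
    struct store specbuf = committed;           /* speculative copy           */
    if (hot_trace(key, &specbuf))
        committed = specbuf;                    /* commit side effects        */
    else
        slow_path(key);                         /* roll back: specbuf dropped */
}

int main(void)
{
    handle_request("user42");
    printf("%s\n", committed.value);
    handle_request("a-key-that-is-far-too-long-for-the-fast-path");
    printf("%s\n", committed.value);
    return 0;
}
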
@Article{Song:2014:AFB,
author = "Xiang Song and Jian Yang and Haibo Chen",
title = "Architecting Flash-based Solid-State Drive for
High-performance {I/O} Virtualization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "61--64",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.22",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Flash-based solid-state drive (SSD) is now being
widely deployed in cloud computing platforms due to the
potential advantages of better performance and less
energy consumption. However, current virtualization
architecture lacks support for high-performance I/O
virtualization over persistent storage, which results
in sub-optimal I/O performance for guest virtual
machines (VMs) on SSD. Further, current software-based
I/O virtualization violates the ``don't hide power''
principle due to inefficient support for some advanced
SSD commands (e.g., TRIM) and constrained parallelism,
leading to sub-optimal performance and life cycle. This
paper observes that the massive internal parallelism
and the block emulation in the flash translation layer
(FTL) make flash-based SSD an ideal candidate to
support high-performance I/O virtualization for
persistent storage. Based on this observation, we
propose VFlash, the first storage I/O virtualization
architecture that extends existing SSDs with trivial
hardware changes to directly expose multiple virtual
SSDs to guest VMs. Performance evaluation using a
modified FlashSim with two FTL schemes (i.e., DFTL and
FAST) shows that VFlash incurs only small performance
overhead over native SSDs and can efficiently exploit
parallelism.",
acknowledgement = ack-nhfb,
affiliation = "Chen, HB (Reprint Author), Shanghai Jiao Tong Univ,
Sch Software, Inst Parallel \& Distributed Syst,
Shanghai 200030, Peoples R China. Song, Xiang; Yang,
Jian; Chen, Haibo, Shanghai Jiao Tong Univ, Sch
Software, Inst Parallel \& Distributed Syst, Shanghai
200030, Peoples R China.",
author-email = "haibochen@sjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "China National Natural Science Foundation
[61003002]; Intel",
funding-text = "This work was supported by China National Natural
Science Foundation under grant numbered 61003002 and a
grant from Intel.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B.4.4 Performance Analysis and Design Aids; C.4.g
Measurement; cloud computing; Cloud computing; cloud
computing platforms; Computer architecture; energy
consumption; evaluation; flash memories; flash-based
solid-state drive; high performance I/O virtualization
architecture; I/O virtualization; modeling;
Multiprocessing systems; Parallel processing;
Performance evaluation; performance evaluation; Random
access memory; simulation of multiple-processor
systems; software-based I/O virtualization; Solid state
circuits; Solid State Drive; SSD commands; virtual
machines; virtualisation; VM",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Song:2014:AFB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wu:2014:ATE,
author = "Carole-Jean Wu",
title = "Architectural Thermal Energy Harvesting Opportunities
for Sustainable Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "65--68",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.16",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Increased power dissipation in computing devices has
led to a sharp rise in thermal hotspots, creating
thermal runaway. To reduce the additional power
requirement caused by increased temperature, current
approaches apply cooling mechanisms to remove heat or
apply management techniques to avoid thermal
emergencies by slowing down heat generation. This paper
proposes to tackle the heat management problem of
computing platforms with a fundamentally new approach:
instead of heat removal using cooling mechanisms and
heat avoidance using dynamic thermal/power management
techniques, this work investigates the mechanisms to
recover wasted heat into reusable energy for
sustainable computing. Through recent advancements in
thermoelectric materials, we allow wasted heat energy
generated by computing devices to be recovered,
transformed, and harvested as electricity that can be
directly used within the system. We demonstrate a
real-system setup where we recover 0.3 to 1 watt of
power with the CPU running at 70 to 105 degrees C,
using a COTS thermoelectric device on top of the CPU.
Through this research, we hope to motivate more
in-depth efforts to explore heat energy harvesting
opportunities on computing devices and inspire
plausible solutions to overcome the technical
challenges discussed in this paper.",
acknowledgement = ack-nhfb,
affiliation = "Wu, CJ (Reprint Author), Arizona State Univ, Sch Comp,
Dept Comp Sci Engn, Tempe, AZ 85281 USA. Arizona State
Univ, Sch Comp, Dept Comp Sci Engn, Tempe, AZ 85281
USA.",
author-email = "carole-jean.wu@asu.edu",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural thermal energy harvesting; cooling;
Cooling; cooling mechanisms; dynamic thermal-power
management technique; Energy conservation; energy
harvesting; Energy-aware systems; heat generation; heat
management problem; power dissipation; Power
distribution; power engineering computing; Resistance
heating; sustainable computing; Temperature
measurement; Temperature-aware design; thermal energy
storage; thermal runaway; Waste heat",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Wu:2014:ATE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Yavits:2014:CHO,
author = "Leonid Yavits and Amir Morad and Ran Ginosar",
title = "Cache Hierarchy Optimization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "69--72",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.18",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power consumption, off-chip memory bandwidth, chip
area and Network on Chip (NoC) capacity are among the main
chip resources limiting the scalability of Chip
Multiprocessors (CMP). A closed form analytical
solution for optimizing the CMP cache hierarchy and
optimally allocating area among hierarchy levels under
such constrained resources is developed. The
optimization framework is extended by incorporating the
impact of data sharing on cache miss rate. An
analytical model for cache access time as a function of
cache size is proposed and verified using CACTI
simulation.",
acknowledgement = ack-nhfb,
affiliation = "Yavits, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Yavits, Leonid; Morad, Amir; Ginosar, Ran, Technion
Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
Israel.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "ICRI-CI; Hasso-Plattner-Institut",
funding-text = "We thank Prof. Uri Weiser and Yaniv Ben Itzhak for
their review and remarks. This research was partially
funded by the ICRI-CI and Hasso-Plattner-Institut.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Analytical models; Analytical Performance Models;
Bandwidth; Cache Hierarchy; cache hierarchy
optimization; cache storage; CACTI simulation; chip
area; Chip Multiprocessor; chip multiprocessors; CMP;
Computational modeling; data sharing; Integrated
circuit modeling; Multiprocessing systems; network on
chip; network-on-chip; NoC; off-chip memory bandwidth;
optimisation; Optimization; power consumption; Resource
Allocation Optimization; Resource Allocation
Optimizations; Resource management",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Yavits:2014:CHO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
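
As a hedged illustration of the kind of model the abstract mentions (not
necessarily the paper's expression), cache access time is often approximated
as a constant plus a term that grows with the square root of capacity,
reflecting wire delay across the data array:

    t_{\mathrm{acc}}(S) \approx t_0 + k \sqrt{S},

where $S$ is the cache size and $t_0$, $k$ are technology-dependent constants
fitted, for example, against CACTI.
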
@Article{Yazdanshenas:2014:CLL,
author = "Sadegh Yazdanshenas and Marzieh Ranjbar Pirbasti and
Mahdi Fazeli and Ahmad Patooghy",
title = "Coding Last Level {STT-RAM} Cache For High Endurance
And Low Power",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "73--76",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.8",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "STT-RAM technology has recently emerged as one of the
most promising memory technologies. However, its major
problems, limited write endurance and high write
energy, are still preventing it from being used as a
drop-in replacement of SRAM cache. In this paper, we
propose a novel coding scheme for STT-RAM last level
cache based on the concept of value locality. We reduce
switching probability in cache by swapping common
patterns with limited weight codes (LWC) to make writes
less frequent as well as more uniform. We also define some
policies for swapping these patterns. Our evaluation
shows that bit write variance in memory cells can be
reduced by about 20\% on average resulting in a more
uniform wear-out directly enhancing lifetime and
improving cell reliability. In addition, writes in
cache lines can be reduced by about 12\% compared to
one of the most effective circuit level techniques
known as early write termination (EWT) [12]. Our method
increases memory hierarchy access time by about 0.08\%
on average, which is negligible. We have shown that our
method does not adversely affect last level cache
energy-delay$^2$. The non-uniformity caused by the
coding scheme can be used for another coding scheme at
main memory or L1 cache depending on their
technologies.",
acknowledgement = ack-nhfb,
affiliation = "Yazdanshenas, S (Reprint Author), Iran Univ Sci \&
Technol, Sch Comp Engn, Tehran, Iran. Yazdanshenas,
Sadegh; Pirbasti, Marzieh Ranjbar; Fazeli, Mahdi;
Patooghy, Ahmad, Iran Univ Sci \& Technol, Sch Comp
Engn, Tehran, Iran.",
author-email = "sadegh\_yazdanshenas@comp.iust.ac.ir
m\_ranjbar@comp.iust.ac.ir m\_fazeli@iust.ac.ir
patooghy@iust.ac.ir",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B Hardware; B.3 Memory Structures; bit write variance;
C Computer Systems Organization; C.1 Processor
Architectures; cache; cache storage; cell reliability;
circuit level technique; coding scheme; Computer
architecture; early write termination; Encoding;
limited weight code; limited weight codes; memory
endurance; memory technology; nonvolatile memory;
Nonvolatile memory; probability; Random access memory;
random-access storage; STT-RAM; STT-RAM cache;
switching probability; Three-dimensional displays;
write energy; write hotspot",
keywords-plus = "MEMORY; CIRCUIT; ENERGY; MRAM",
number-of-cited-references = "13",
ORCID-numbers = "Fazeli, Mahdi/0000-0002-2874-6256 Patooghy,
Ahmad/0000-0003-2647-2797",
research-areas = "Computer Science",
researcherid-numbers = "Fazeli/S-9574-2018",
times-cited = "14",
unique-id = "Yazdanshenas:2014:CLL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
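
The pattern-swapping idea can be sketched in a few lines of C (the pattern
table and codes are hypothetical, not the paper's): frequently written words
are replaced by low-weight codewords plus an encoded flag, so the cells that
store common values toggle far fewer bits per write.

#include <stdio.h>
#include <stdint.h>

struct coded_word { uint32_t bits; int encoded; };

/* Hypothetical table of frequently written patterns. */
static const uint32_t common[] = {0x00000000u, 0xFFFFFFFFu, 0xDEADBEEFu};

static struct coded_word lwc_encode(uint32_t w)
{
    for (size_t i = 0; i < sizeof common / sizeof common[0]; i++)
        if (w == common[i])
            return (struct coded_word){ (uint32_t)i, 1 };  /* low-weight code */
    return (struct coded_word){ w, 0 };                    /* stored raw      */
}

static uint32_t lwc_decode(struct coded_word c)
{
    return c.encoded ? common[c.bits] : c.bits;
}

int main(void)
{
    struct coded_word c = lwc_encode(0xFFFFFFFFu);  /* 32 set bits -> weight 1 */
    printf("stored 0x%08X (encoded=%d), decodes to 0x%08X\n",
           c.bits, c.encoded, lwc_decode(c));
    return 0;
}
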
@Article{Martinsen:2014:HTL,
author = "Jan Kasper Martinsen and Hakan Grahn and Anders
Isberg",
title = "Heuristics for Thread-Level Speculation in {Web}
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "77--80",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.26",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/java2010.bib",
abstract = "JavaScript is a sequential programming language, and
Thread-Level Speculation has been proposed to
dynamically extract parallelism in order to take
advantage of parallel hardware. In previous work, we
have shown significant speed-ups with a simple on/off
speculation heuristic. In this paper, we propose and
evaluate three heuristics for dynamically adapting the
speculation: a 2-bit heuristic, an exponential
heuristic, and a combination of these two. Our results
show that the combined heuristic is able to both
increase the number of successful speculations and
decrease the execution time for 15 popular web
applications.",
acknowledgement = ack-nhfb,
affiliation = "Martinsen, JK (Reprint Author), Blekinge Inst Technol,
Sch Comp, SE-37179 Karlskrona, Sweden. Martinsen, Jan
Kasper; Grahn, Hakan, Blekinge Inst Technol, Sch Comp,
SE-37179 Karlskrona, Sweden. Isberg, Anders, Sony
Mobile Commun AB, SE-22188 Lund, Sweden.",
author-email = "Jan.Kasper.Martinsen@bth.se Hakan.Grahn@bth.se
Anders.Isberg@sonymobile.com",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Industrial Excellence Center EASE -
Embedded Applications Software Engineering; BESQ+
research project --- Knowledge Foundation in Sweden
[20100311]",
funding-text = "This work was partly funded by the Industrial
Excellence Center EASE --- Embedded Applications
Software Engineering, (http://ease.cs.lth.se), and the
BESQ+ research project funded by the Knowledge
Foundation (grant number 20100311) in Sweden.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "2-bit heuristic; Automatic Parallelization; Benchmark
testing; C.1.4 Parallel Architectures; C.1.4.f
Speculative multi-threading; exponential heuristic;
Instruction sets; Internet; Java; JavaScript; Multicore
processors; Multithreading; Parallel Computing;
parallel hardware; Parallel processing; parallel
programming; sequential programming language; Social
network services; thread-level speculation; Web
applications",
number-of-cited-references = "12",
oa = "Green Published",
ORCID-numbers = "Martinsen, Jan Kasper/0000-0001-8915-3633 Grahn,
Hakan/0000-0001-9947-1088",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Martinsen:2014:HTL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
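
The 2-bit heuristic is in the family of 2-bit saturating counters familiar
from branch prediction; a minimal C sketch of that generic mechanism (not
the authors' exact policy or their exponential variant) follows.

#include <stdio.h>

static unsigned char ctr = 2;                 /* states 0..3, start weakly-yes */

static int should_speculate(void) { return ctr >= 2; }

static void report(int success)
{
    /* Saturating update: success pushes toward speculating, rollback away. */
    if (success && ctr < 3) ctr++;
    else if (!success && ctr > 0) ctr--;
}

int main(void)
{
    int outcomes[] = {1, 0, 0, 0, 1, 1, 1};   /* rollback = 0, success = 1 */
    for (int i = 0; i < 7; i++) {
        printf("speculate=%d outcome=%d\n", should_speculate(), outcomes[i]);
        report(outcomes[i]);
    }
    return 0;
}
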
@Article{Nandakumar:2014:OKS,
author = "Vivek S. Nandakumar and Ma{\l}gorzata Marek-Sadowska",
title = "On Optimal Kernel Size for Integrated {CPU--GPUs} ---
a Case Study",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "81--84",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.27",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Integrated CPU-GPU architectures with a fully
addressable shared memory completely eliminate any
CPU-GPU data transfer overhead. Since such
architectures are relatively new, it is unclear what
level of interaction between the CPU and GPU attains
the best energy efficiency. Too coarse grained or
larger kernels with fairly low CPU--GPU interaction
could cause poor utilization of the shared resources
while too fine grained kernels could cause frequent
interrupts of GPU computation and performance
degradation. Also larger kernels require larger shared
resources causing increase in area and parasitics which
affect the latency sensitive CPU cores. In this paper,
we show the effect of granularity on the overall
system's energy efficiency using a synthetic workload.
We describe how our framework models a truly unified
shared memory in integrated architectures with frequent
CPU--GPU communication.",
acknowledgement = ack-nhfb,
affiliation = "Nandakumar, VS (Reprint Author), Univ Calif Santa
Barbara, Dept Elect \& Comp Engn, Santa Barbara, CA
93106 USA. Nandakumar, Vivek S.; Marek-Sadowska,
Malgorzata, Univ Calif Santa Barbara, Dept Elect \&
Comp Engn, Santa Barbara, CA 93106 USA.",
author-email = "vivek@ece.ucsb.edu mms@ece.uscb.edu",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "SRC grant [2236]",
funding-text = "This work was supported by SRC grant \#2236.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B.3.2.g Shared memory; B.4.4.b Simulation; B.9.2
Energy-aware systems; C.1.3.f Heterogeneous (hybrid)
systems; C.4.g Measurement; Central Processing Unit;
Computational modeling; CPU-GPU communication; CPU-GPU
data transfer overhead; CPU-GPU interaction; D.4.4
Communications Management; energy efficiency; Energy
efficiency; evaluation; fine grained kernels; fully
addressable shared memory; GPU computation; graphics
processing units; Graphics processing units; integrated
CPU-GPU architectures; latency sensitive CPU cores;
Memory management; modeling; optimal kernel size;
overall system energy efficiency; performance
degradation; performance evaluation; power aware
computing; shared memory systems; simulation of
multiple-processor systems",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Nandakumar:2014:OKS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Liu:2014:PTE,
author = "Qixiao Liu and Victor Jimenez and Miquel Moreto and
Jaume Abella and Francisco J. Cazorla and Mateo
Valero",
title = "Per-task Energy Accounting in Computing Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "85--88",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.24",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We present for the first time the concept of per-task
energy accounting (PTEA) and relate it to per-task
energy metering (PTEM). We show the benefits of
supporting both in future computing systems. Using the
shared last-level cache (LLC) as an example: (1) We
illustrate the complexities in providing PTEM and PTEA;
(2) we present an idealized PTEM model and an accurate
and low-cost implementation of it; and (3) we introduce
a hardware mechanism to provide accurate PTEA in the
cache.",
acknowledgement = ack-nhfb,
affiliation = "Liu, QX (Reprint Author), Univ Politecn Cataluna,
E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
Moreto, Miquel; Valero, Mateo, Univ Politecn Cataluna,
E-08028 Barcelona, Spain. Liu, Qixiao; Jimenez, Victor;
Moreto, Miquel; Abella, Jaume; Cazorla, Francisco J.;
Valero, Mateo, Barcelona Supercomp Ctr, Barcelona,
Spain. Cazorla, Francisco J., Spanish Natl Res Council
IIIA CSIC, Barcelona, Spain.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Spanish Ministry of Science and Innovation
[TIN2012-34557]; HiPEAC Network of Excellence; Chinese
Scholarship Council [2010608015]",
funding-text = "This work has been partially supported by the Spanish
Ministry of Science and Innovation under grant
TIN2012-34557 and the HiPEAC Network of Excellence.
Qixiao Liu has also been funded by the Chinese
Scholarship Council under grant 2010608015.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; cache storage; Computational
modeling; computing systems; Energy consumption; Energy
management; Monitoring; Multicore processing; per-task
energy accounting; per-task energy metering; power
aware computing; PTEA; PTEM model; Radiation detectors;
shared last-level cache",
number-of-cited-references = "20",
oa = "Green Published",
ORCID-numbers = "Cazorla, Francisco/0000-0002-3344-376X Moreto Planas,
Miquel/0000-0002-9848-8758 Valero,
Mateo/0000-0003-2917-2482 Abella,
Jaume/0000-0001-7951-4028 Liu,
Qixiao/0000-0002-8196-7584",
research-areas = "Computer Science",
researcherid-numbers = "Cazorla, Francisco/D-7261-2016 Moreto Planas,
Miquel/C-1823-2016 Valero, Mateo/L-5709-2014 Abella,
Jaume/B-7422-2016",
times-cited = "2",
unique-id = "Liu:2014:PTE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
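The per-task energy metering idea in the entry above can be illustrated
with a small model. The sketch below is not the authors' mechanism; it is
a minimal Python reading of the idea, under the assumption that a shared
LLC's leakage energy is attributed to tasks in proportion to their cache
occupancy over time and its dynamic energy in proportion to their
accesses. All names and numbers (meter_llc_energy, the power and
per-access energy values) are hypothetical.

  # Minimal per-task LLC energy-metering sketch (illustrative assumption,
  # not the mechanism proposed in the entry above).
  def meter_llc_energy(intervals, leakage_power_w, dyn_energy_per_access_j):
      """intervals: list of (duration_s, occupancy_frac_by_task, accesses_by_task)."""
      energy = {}
      for duration_s, occupancy, accesses in intervals:
          for task, frac in occupancy.items():
              # Leakage attributed by the fraction of LLC space the task occupies.
              energy[task] = energy.get(task, 0.0) + leakage_power_w * duration_s * frac
          for task, n in accesses.items():
              # Dynamic energy attributed by the task's own LLC accesses.
              energy[task] = energy.get(task, 0.0) + dyn_energy_per_access_j * n
      return energy

  # Example: two tasks sharing the LLC for a single 1 ms interval.
  print(meter_llc_energy(
      [(1e-3, {"A": 0.75, "B": 0.25}, {"A": 1000, "B": 4000})],
      leakage_power_w=0.5, dyn_energy_per_access_j=1e-9))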
@Article{Mahmoodi:2014:RCC,
author = "Hamid Mahmoodi and Sridevi Srinivasan Lakshmipuram and
Manish Arora and Yashar Asgarieh and Houman Homayoun
and Bill Lin and Dean M. Tullsen",
title = "Resistive Computation: a Critique",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "89--92",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.23",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Resistive Computation was suggested by [6] as an idea
for tackling the power wall by replacing conventional
CMOS logic with Magnetic Tunnel Junction (MTJ) based
Look-Up Tables (LUTs). Spin Transfer Torque RAM
(STTRAM) is an emerging CMOS-compatible non-volatile
memory technology based on Magnetic Tunnel Junctions as
a memory bit [3]. The principal advantage of STTRAM is
that it is leakage-resistant, which is an important
characteristic beyond the 45nm technology node, where
leakage concerns are becoming a limiting factor in
microprocessor performance. Although STTRAM is a good
candidate for replacing SRAM for on-chip memory, we
argue in this article that MTJ-based LUTs are unnecessarily
expensive in terms of area, power, and performance when
implementing fixed combinational logic that does not
require the reprogramming ability provided by MTJs.",
acknowledgement = ack-nhfb,
affiliation = "Mahmoodi, H (Reprint Author), San Francisco State
Univ, San Francisco, CA 94132 USA. Arora, Manish;
Asgarieh, Yashar; Lin, Bill; Tullsen, Dean M., Univ
Calif San Diego, La Jolla, CA 92093 USA. Mahmoodi,
Hamid; Lakshmipuram, Sridevi Srinivasan, San Francisco
State Univ, San Francisco, CA 94132 USA. Homayoun,
Houman, George Mason Univ, Fairfax, VA 22030 USA.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "B.2.1 Design Styles; B.6.1.e Memory used as logic;
B.7.1.a Advanced technologies; B.9.1 Low-power design;
C.0.a Emerging technologies; CMOS integrated circuits;
CMOS-compatible nonvolatile memory technology; Delays;
dynamic current-mode logic; fixed combinational logic;
leakage power; leakage-resistance; Logic gates; look-up
tables; Low power electronics; magnetic tunnel
junction; Magnetic tunneling; magnetic tunnelling;
magnetic-tunnel junctions; memory bit; MRAM; MTJ-based
LUT; Power distribution; random-access storage;
Resistive computation; resistive computation; Resistive
computation; spin transfer torque RAM; STTRAM; Table
lookup; table lookup; Transistors",
keywords-plus = "TECHNOLOGY; CIRCUIT",
number-of-cited-references = "10",
ORCID-numbers = "Lin, Binshan/0000-0002-8481-302X",
research-areas = "Computer Science",
researcherid-numbers = "Lin, Binshan/A-9772-2009",
times-cited = "4",
unique-id = "Mahmoodi:2014:RCC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Eyerman:2014:RCW,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Restating the Case for Weighted-{IPC} Metrics to
Evaluate Multiprogram Workload Performance",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "93--96",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.9",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Weighted speedup is nowadays the most commonly used
multiprogram workload performance metric. Weighted
speedup is a weighted-IPC metric, i.e., the
multiprogram IPC of each program is first weighted with
its isolated IPC. Recently, Michaud questions the
validity of weighted-IPC metrics by arguing that they
are inconsistent and that weighted speedup favors
unfairness [4]. Instead, he advocates using the
arithmetic or harmonic mean of the raw IPC values of
the programs in the multiprogram workload. We show that
weighted-IPC metrics are not inconsistent, and that
weighted speedup is fair in giving equal importance to
each program. We argue that, in contrast to raw-IPC
metrics, weighted-IPC metrics have a system-level
meaning, and that raw-IPC metrics are affected by the
inherent behavior of the programs. We also show that
the choice of a metric may adversely affect the
conclusions from an experiment. We suggest using two
weighted-IPC metrics, system throughput (STP) and
average normalized turnaround time (ANTT), for
evaluating multiprogram workload performance, and
avoiding raw-IPC metrics.",
acknowledgement = ack-nhfb,
affiliation = "Eyerman, S (Reprint Author), Univ Ghent, B-9000 Ghent,
Belgium. Eyerman, Stijn; Eeckhout, Lieven, Univ Ghent,
B-9000 Ghent, Belgium.",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Research Foundation --- Flanders (FWO);
European Research Council under the European Community
[259295]",
funding-text = "Stijn Eyerman is supported through a postdoctoral
fellowship by the Research Foundation --- Flanders
(FWO). Additional support is provided by the European
Research Council under the European Community's Seventh
Framework Programme (FP7/2007-2013) / ERC Grant
agreement no. 259295.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ANTT; average normalized turnaround time; Benchmark
testing; C Computer Systems Organization; C.1 Processor
Architectures; C.1.3 Other Architecture Styles; C.1.3.h
Multithreaded processors; C.1.4 Parallel Architectures;
C.1.4.e Multi-core/single-chip multiprocessors; C.4
Performance of Systems; C.4.c Measurement techniques;
Degradation; Harmonic analysis; harmonic mean;
Multicore processing; multiprocessing systems;
multiprogram IPC; multiprogram workload performance
metric; multiprogramming; raw-IPC metrics; STP; system
throughput; system-level meaning; Throughput; Weight
measurement; weighted speedup; weighted-IPC metric",
number-of-cited-references = "6",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "Eyerman:2014:RCW",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
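For reference, the two weighted-IPC metrics recommended in the entry
above, system throughput (STP) and average normalized turnaround time
(ANTT), are commonly defined as follows; this is added only as a reading
aid, with $\mathrm{IPC}_i^{SP}$ denoting program $i$'s isolated
(single-program) IPC and $\mathrm{IPC}_i^{MP}$ its IPC in the
$n$-program mix:

\[
  \mathrm{STP} = \sum_{i=1}^{n} \frac{\mathrm{IPC}_i^{MP}}{\mathrm{IPC}_i^{SP}},
  \qquad
  \mathrm{ANTT} = \frac{1}{n} \sum_{i=1}^{n} \frac{\mathrm{IPC}_i^{SP}}{\mathrm{IPC}_i^{MP}}.
\]

Higher STP and lower ANTT are better; STP coincides with the weighted
speedup discussed in the abstract.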
@Article{Wolff:2014:RUR,
author = "Sonya R. Wolff and Ronald D. Barnes",
title = "Revisiting Using the Results of Pre-Executed
Instructions in Runahead Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "97--100",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.21",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Long-latency cache accesses cause significant
performance-impacting delays for both in-order and
out-of-order processor systems. To address these
delays, runahead pre-execution has been shown to
produce speedups by warming-up cache structures during
stalls caused by long-latency memory accesses. While
improving cache related performance, basic runahead
approaches do not otherwise utilize results from
accurately pre-executed instructions during normal
operation. This simple model of execution is
potentially inefficient and performance constraining.
However, a previous study showed that exploiting the
results of accurately pre-executed runahead
instructions for out-of-order processors provides little
performance improvement over simple re-execution. This
work will show that, unlike out-of-order runahead
architectures, the performance improvement from
runahead result use for an in-order pipeline is more
significant, on average, and in some situations
provides dramatic performance improvements. For a set
of SPEC CPU2006 benchmarks which experience performance
improvement from basic runahead, the addition of result
use to the pipeline provided an additional speedup of
1.14X (high --- 1.48X) for an in-order processor model
compared to only 1.05X (high --- 1.16X) for an
out-of-order one. When considering benchmarks with poor
data cache locality, the average speedup increased to
1.21X for in-order compared to only 1.10X for
out-of-order.",
acknowledgement = ack-nhfb,
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; C.1.5.c Superscalar
dynamically-scheduled and statically-scheduled
implementation; C.1.5.e Memory hierarchy; cache
storage; data cache locality; Hidden Markov models;
in-order processor systems; long-latency cache
accesses; long-latency memory accesses; Memory Wall;
multiprocessing systems; Out of order; out-of-order
processor systems; out-of-order runahead architectures;
Pipeline processing; Pre-Execution; preexecuted
runahead instructions; Registers; Runahead; runahead
processors; SPEC CPU2006 benchmarks",
keywords-plus = "PIPELINES",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Wolff:2014:RUR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2014:SGA,
author = "Youngsok Kim and Jaewon Lee and Donggyu Kim and
Jangwoo Kim",
title = "{ScaleGPU}: {GPU} Architecture for Memory-Unaware
{GPU} Programming",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "101--104",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.19",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Programmer-managed GPU memory is a major challenge in
writing GPU applications. Programmers must rewrite and
optimize existing code for a different GPU memory
size for both portability and performance.
Alternatively, they can achieve only portability by
disabling GPU memory at the cost of significant
performance degradation. In this paper, we propose
ScaleGPU, a novel GPU architecture to enable
high-performance memory-unaware GPU programming.
ScaleGPU uses GPU memory as a cache of CPU memory to
provide programmers a view of CPU memory-sized
programming space. ScaleGPU also achieves high
performance by minimizing the amount of CPU-GPU data
transfers and by utilizing the GPU memory's high
bandwidth. Our experiments show that ScaleGPU can run a
GPU application on any GPU memory size and also
improves performance significantly. For example,
ScaleGPU improves the performance of the hotspot
application by approximately 48\% using the same size of
GPU memory and reduces its memory size requirement by
approximately 75\% while maintaining the target performance.",
acknowledgement = ack-nhfb,
affiliation = "Kim, Y (Reprint Author), POSTECH, Dept Comp Sci \&
Engn, Pohang, South Korea. Kim, Youngsok; Lee, Jaewon;
Kim, Donggyu; Kim, Jangwoo, POSTECH, Dept Comp Sci \&
Engn, Pohang, South Korea.",
author-email = "elixir@postech.ac.kr spiegel0@postech.ac.kr
vteori@postech.ac.kr jangwoo@postech.ac.kr",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea (NRF) ---
Ministry of Education, Science and Technology
[2011-0014817]; NRF Grant --- Korean Government
(NRF-Global Ph.D. Fellowship Program)",
funding-text = "This research was supported by Basic Science Research
Program through the National Research Foundation of
Korea (NRF) funded by the Ministry of Education,
Science and Technology (2011-0014817) and NRF Grant
funded by the Korean Government (NRF-2012-Global Ph.D.
Fellowship Program).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "C.1.2.j SIMD processors; C.1.4.e
Multi-core/single-chip multiprocessors; C.1.5.e Memory
hierarchy; cache; cache storage; code rewrite; CPU
memory-sized programming space; CPU-GPU data transfers;
Data transfer; GPU applications; GPU architecture; GPU
memory high bandwidth; GPU memory size; graphics
processing units; Graphics processing units; graphics
processing units; high-performance memory-unaware GPU
programming; I.3.1.a Graphics processors; Instruction
sets; memory architecture; Memory management; memory
size requirement; programmer-managed GPU memory;
Programming; Random access memory; ScaleGPU",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Kim:2014:SGA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Sankar:2014:SFL,
author = "Sriram Sankar and Sudhanva Gurumurthi",
title = "Soft Failures in Large Datacenters",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "105--108",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.25",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "A major problem in managing large-scale datacenters is
diagnosing and fixing machine failures. Most large
datacenter deployments have a management infrastructure
that can help diagnose failure causes, and manage
assets that were fixed as part of the repair process.
Previous studies identify only actual hardware
replacements to calculate Annualized Failure Rate (AFR)
and component reliability. In this paper, we show that
service availability is significantly affected by soft
failures and that this class of failures is becoming an
important issue at large datacenters that operate with
minimal human intervention. Soft failures in the
datacenter do not
require actual hardware replacements, but still result
in service downtime, and are equally important because
they disrupt normal service operation. We show failure
trends observed in a large datacenter deployment of
commodity servers and motivate the need to modify
conventional datacenter designs to help reduce soft
failures and increase service availability.",
acknowledgement = ack-nhfb,
affiliation = "Sankar, S (Reprint Author), Microsoft Corp, Redmond,
WA 98052 USA. Sankar, Sriram, Microsoft Corp, Redmond,
WA 98052 USA. Sankar, Sriram; Gurumurthi, Sudhanva,
Univ Virginia, Charlottesville, VA 22903 USA.
Gurumurthi, Sudhanva, Adv Micro Devices Inc, AMD Res,
Sunnyvale, CA 94088 USA.",
author-email = "sriram.sankar@microsoft.com
Sudhanva.Gurumurthi@amd.com",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AFR; annualized failure rate; asset management; C.4
Performance of Systems; C.5.5 Servers;
Characterization; Client-server systems; commodity
servers; component reliability; computer centres; Data
centers; Datacenter; datacenter deployments; datacenter
designs; datacenter management; failure cause
diagnosis; fault diagnosis; Hard disks; hardware
replacements; Large-scale systems; machine failure
diagnosis; machine failure fixing; Maintenance
engineering; Management; management infrastructure;
Market research; Reliability; repair process; service
availability; soft failures; Transient analysis",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Sankar:2014:SFL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2014:VPT,
author = "Daehoon Kim and Hwanju Kim and Jaehyuk Huh",
title = "{vCache}: Providing a Transparent View of the {LLC} in
Virtualized Environments",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "109--112",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/L-CA.2013.20",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Since most of the current multi-core processors use a
large last-level cache (LLC), efficient use of an LLC
is critical for the overall performance of multi-cores.
To improve the caching efficiency, page coloring is a
representative software-based approach to allow the OS
to control placement of pages on an LLC to improve
their cache utility and to avoid conflicts among cores.
However, system virtualization, with additional address
translation by the hypervisor, can make page coloring
techniques used by the guest OS ineffective, as guest
physical addresses used by the guest OS for coloring
differ from real addresses used for cache indexing in
the LLCs. In this paper, we propose a novel LLC
architecture to provide the guest OS with a flexible
control over LLC placement in virtualized systems. The
proposed vCache architecture can preserve coloring
information set by the guest OS. In addition to color
preservation, vCache can potentially eliminate the
traditional limitation of page coloring, the cost of
dynamic color changes for memory pages. By using the
pollute buffer mechanism, one of the color-based cache
optimization techniques, vCache improves the performance
of benchmark applications by up to 33\% without degrading
the performance of another co-running application in the
VM.",
acknowledgement = ack-nhfb,
affiliation = "Kim, D (Reprint Author), Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon, South Korea. Kim,
Daehoon; Kim, Hwanju; Huh, Jaehyuk, Korea Adv Inst Sci
\& Technol, Dept Comp Sci, Taejon, South Korea.",
author-email = "daehoon@calab.kaist.ac.kr hjukim@calab.kaist.ac.kr
jhuh@calab.kaist.ac.kr",
da = "2019-06-20",
doc-delivery-number = "AX5PM",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "SW Computing R\&D Program of
KEIT(UX-oriented Mobile SW Platform) --- Ministry of
Trade, Industry, and Energy [2011-10041313]",
funding-text = "This research was supported by the SW Computing R\&D
Program of KEIT(2011-10041313, UX-oriented Mobile SW
Platform) funded by the Ministry of Trade, Industry,
and Energy.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "address translation; B.3.2.b Cache memories; benchmark
applications; buffer mechanism; C.1.4.e
Multi-core/single-chip multiprocessors; C.1.5.e Memory
hierarchy; cache indexing; Cache partitioning; cache
storage; Cache storage; cache utility improvement;
caching efficiency improvement; co-running application;
color-based cache optimization techniques; coloring
information preservation; core conflict avoidance;
dynamic color cost; guest OS; guest physical address;
hypervisor; last-level cache; LLC architecture; LLC
placement; Memory management; memory pages; Multicore
processing; multicore processor performance;
multiprocessing systems; operating systems (computers);
Page coloring; page coloring; page placement control;
paged storage; software-based approach; system
virtualization; transparent LLC view; vCache
architecture; Virtual machine monitors; virtual
machines; virtualisation; Virtualization; virtualized
environments; VM",
number-of-cited-references = "8",
research-areas = "Computer Science",
researcherid-numbers = "Huh, Jaehyuk/C-1716-2011",
times-cited = "2",
unique-id = "Kim:2014:VPT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
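As background for the page-coloring discussion in the entry above, the
sketch below shows, in Python, how an OS typically derives a page's cache
color from its physical page number and why an extra hypervisor
translation can defeat it: the color the guest computes from a
guest-physical page number need not match the color of the host-physical
page actually used for LLC indexing. The parameters and helper names are
illustrative, not taken from the paper.

  # Page-coloring sketch: the color is the part of the physical page number
  # that falls inside the cache-set index (illustrative parameters).
  PAGE_SIZE     = 4096                       # 4 KiB pages
  LINE_SIZE     = 64                         # cache line size in bytes
  NUM_SETS      = 8192                       # e.g., an 8 MiB, 16-way LLC
  SETS_PER_PAGE = PAGE_SIZE // LINE_SIZE
  NUM_COLORS    = NUM_SETS // SETS_PER_PAGE  # 128 colors here

  def color_of(physical_page_number):
      return physical_page_number % NUM_COLORS

  # The guest OS colors pages using *guest-physical* page numbers ...
  guest_ppn   = 0x1234
  guest_color = color_of(guest_ppn)

  # ... but the hypervisor maps that guest page to some host-physical page,
  # and only the host-physical address reaches the LLC index logic.
  host_ppn   = 0x9F07                        # arbitrary hypervisor mapping
  host_color = color_of(host_ppn)

  print(guest_color, host_color)             # usually differ, so guest coloring is ineffective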
@Article{Anonymous:2014:TCb,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C1--C1",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368891",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICAa,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}
Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C2--C2",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368892",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICAb,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}}
Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C3--C3",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368893",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2014:ICSb,
author = "Anonymous",
title = "{IEEE Computer Society} [advertisement]",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "13",
number = "2",
pages = "C4--C4",
month = jul # "\slash " # dec,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368894",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Liao:2015:AWL,
author = "Jianwei Liao and Fengxiang Zhang and Li Li and
Guoqiang Xiao",
title = "Adaptive Wear-Leveling in Flash-Based Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "1--4",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329871",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The paper presents an adaptive wear-leveling scheme
based on several wear-thresholds in different periods.
The basic idea behind this scheme is that blocks can
have different wear-out speeds and the wear-leveling
mechanism does not conduct data migration until the
erasure counts of some hot blocks hit a threshold.
Through a series of emulation experiments based on
several realistic disk traces, we show that the
proposed wear-leveling mechanism can reduce total
erasure counts and yield uniform erasure counts among
all blocks late in the lifetime of the storage devices.
As a result, not only can the performance of storage
systems be improved, but the lifespan of the flash-based
memory can also be extended to a certain degree.",
acknowledgement = ack-nhfb,
affiliation = "Liao, JW (Reprint Author), Southwest Univ, Coll Comp
\& Informat Sci, Chongqing, Peoples R China. Liao,
Jianwei; Zhang, Fengxiang; Li, Li; Xiao, Guoqiang,
Southwest Univ, Coll Comp \& Informat Sci, Chongqing,
Peoples R China.",
author-email = "liaojianwei@il.is.s.u-okyo.ac.jp zhangfx@swu.edu.cn
lily@swu.edu.cn gqxiao@swu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adaptive systems; adaptive wear-leveling; Ash;
Benchmark testing; data migration; delayed migration;
disk traces; emulation experiments; Equations; erasure
evenness; extending lifetime; flash memories;
flash-based memory; Flash-based storage devices; Market
research; Servers; Standards; total erasure count
reduction; wear; wear-leveling; wear-leveling
mechanism; wear-out speeds; wear-thresholds",
number-of-cited-references = "11",
ORCID-numbers = "Liao, Jianwei/0000-0001-6149-6650",
research-areas = "Computer Science",
researcherid-numbers = "Liao, Jianwei/C-5339-2016",
times-cited = "4",
unique-id = "Liao:2015:AWL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
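The adaptive scheme summarized in the entry above can be illustrated with
a small sketch: data migration is deferred until some block's erase count
reaches the current wear threshold, after which the threshold is raised
for the next period. This is only a schematic reading of the abstract,
with hypothetical names and a simplistic migration policy, not the
authors' algorithm.

  # Threshold-triggered wear-leveling sketch (schematic, not the paper's algorithm).
  class AdaptiveWearLeveler:
      def __init__(self, num_blocks, initial_threshold, threshold_step):
          self.erase_counts = [0] * num_blocks
          self.threshold = initial_threshold       # wear threshold for the current period
          self.threshold_step = threshold_step

      def erase(self, block):
          self.erase_counts[block] += 1
          # No migration until a hot block hits the current threshold.
          if self.erase_counts[block] >= self.threshold:
              self._migrate(block)
              self.threshold += self.threshold_step  # enter the next period

      def _migrate(self, hot_block):
          # Swap the hot block's data with the coldest (least-erased) block.
          cold_block = min(range(len(self.erase_counts)),
                           key=self.erase_counts.__getitem__)
          print(f"migrate data: hot block {hot_block} <-> cold block {cold_block}")

  wl = AdaptiveWearLeveler(num_blocks=8, initial_threshold=3, threshold_step=3)
  for _ in range(3):
      wl.erase(0)   # hammer one block until migration triggers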
@Article{Anonymous:2015:IIC,
author = "Anonymous",
title = "2014 Index {{\booktitle{IEEE Computer Architecture
Letters}}} Vol. 13",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "1--5",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2387774",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Indexes",
}
@Article{Chen:2015:HSC,
author = "Jie Chen and Guru Venkataramani",
title = "A Hardware-Software Cooperative Approach for
Application Energy Profiling",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "5--8",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2323711",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Energy consumption by software applications is a
critical issue that determines the future of multicore
software development. In this article, we propose a
hardware-software cooperative approach that uses
hardware support to efficiently gather the
energy-related hardware counters during program
execution, and utilizes parameter estimation models in
software to compute the energy consumed by
instructions at a finer granularity (say, a basic block).
We design mechanisms to minimize collinearity in
profiler data, and present results to validate our
energy estimation methodology.",
acknowledgement = ack-nhfb,
affiliation = "Chen, J (Reprint Author), George Washington Univ, Dept
Elect \& Comp Engn, Washington, DC 20052 USA. Chen,
Jie; Venkataramani, Guru, George Washington Univ, Dept
Elect \& Comp Engn, Washington, DC 20052 USA.",
author-email = "jiec@gwu.edu guruv@gwu.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "application energy profiling; Benchmark testing;
Energy consumption; energy consumption; energy
debugging; energy estimation; energy estimation
methodology; Energy profiling; energy-related hardware
counters; Estimation; Hardware; hardware-software
codesign; hardware-software cooperative approach;
Mathematical model; multicore software development;
multiprocessing systems; Parameter estimation;
parameter estimation models; power aware computing;
profiler data collinearity; program execution;
Software; software applications",
keywords-plus = "POWER",
number-of-cited-references = "12",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Chen:2015:HSC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
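The cooperative profiling approach in the entry above amounts to fitting
a parametric model that maps hardware-counter readings to measured
energy. A minimal sketch of such a fit is given below, using ordinary
least squares over per-interval counter samples; the counter names and
data are made up, and the collinearity-minimization mechanisms mentioned
in the abstract are not modeled here.

  # Least-squares energy model from hardware-counter samples (illustrative only).
  import numpy as np

  # Per-interval counter readings: [instructions, L2 misses, FP ops] (made-up data).
  counters = np.array([
      [1.0e6, 2.0e3, 1.0e5],
      [2.0e6, 8.0e3, 0.5e5],
      [1.5e6, 1.0e3, 3.0e5],
      [3.0e6, 5.0e3, 2.0e5],
  ])
  measured_energy_j = np.array([0.012, 0.021, 0.018, 0.031])

  # Fit per-event energy coefficients: energy is approximately dot(counters, coeffs).
  coeffs, *_ = np.linalg.lstsq(counters, measured_energy_j, rcond=None)

  # Estimate the energy of a finer-grained region (say, a basic block) from its counters.
  block_counters = np.array([4.0e4, 1.2e2, 5.0e3])
  print("estimated energy (J):", float(np.dot(block_counters, coeffs)))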
@Article{Kim:2015:ASM,
author = "Dae-Hyun Kim and Prashant J. Nair and Moinuddin K.
Qureshi",
title = "Architectural Support for Mitigating Row Hammering in
{DRAM} Memories",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "9--12",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2332177",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "DRAM scaling has been the prime driver of increasing
capacity of main memory systems. Unfortunately, lower
technology nodes worsen the cell reliability as they
increase the coupling between adjacent DRAM cells,
thereby exacerbating different failure modes. This
paper investigates the reliability problem due to Row
Hammering, whereby frequent activations of a given row
can cause data loss for its neighboring rows. As DRAM
scales to lower technology nodes, the threshold for the
number of row activations that causes data loss for the
neighboring rows reduces, making Row Hammering a
challenging problem for future DRAM chips. To overcome
Row Hammering, we propose two architectural solutions:
First, Counter-Based Row Activation (CRA), which uses a
counter with each row to count the number of row
activations. If the count exceeds the row hammering
threshold, a dummy activation is sent to neighboring
rows proactively to refresh the data. Second,
Probabilistic Row Activation (PRA), which obviates the
storage overhead of tracking and simply allows the
memory controller to proactively issue dummy
activations to neighboring rows with a small
probability for all memory accesses. Our evaluations show
that these solutions are effective at mitigating Row
hammering while causing negligible performance loss (<
1 percent).",
acknowledgement = ack-nhfb,
affiliation = "Kim, DH (Reprint Author), Georgia Inst Technol, Dept
ECE, Atlanta, GA 30363 USA. Kim, Dae-Hyun; Nair,
Prashant J.; Qureshi, Moinuddin K., Georgia Inst
Technol, Dept ECE, Atlanta, GA 30363 USA.",
author-email = "dhkim@ece.gatech.edu pnair6@ece.gatech.edu
moin@ece.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural support; cell reliability; Computer
architecture; counter-based row activation; data
errors; data retention; DRAM chips; DRAM memories; DRAM
scaling; Dynamic random access memory; Dynamic random
access memory, row hammering, data retention, data
errors; Leakage currents; Logic gates; Microprocessors;
probabilistic row activation; probability; Radiation
detectors; Random access memory; reliability;
reliability problem; row hammering; Transistors",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "23",
unique-id = "Kim:2015:ASM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
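The two mitigations described in the entry above are easy to sketch.
Below is a minimal Python model of both: Counter-Based Row Activation (a
per-row activation counter that triggers a dummy refresh of the
neighboring rows when a threshold is crossed) and Probabilistic Row
Activation (refreshing the neighbors with a small probability on every
activation). Parameter values and function names are illustrative, not
taken from the paper.

  # Sketch of the two row-hammer mitigations described above (illustrative).
  import random

  ROW_HAMMER_THRESHOLD = 50_000     # illustrative value
  PRA_PROBABILITY      = 0.001      # small per-activation refresh probability

  activation_count = {}             # CRA state: per-row activation counters

  def refresh_neighbors(row):
      # Stand-in for issuing dummy activations to rows row-1 and row+1.
      print(f"refresh rows {row - 1} and {row + 1}")

  def activate_row_cra(row):
      activation_count[row] = activation_count.get(row, 0) + 1
      if activation_count[row] >= ROW_HAMMER_THRESHOLD:
          refresh_neighbors(row)            # proactively refresh the victim rows
          activation_count[row] = 0         # restart the count

  def activate_row_pra(row):
      if random.random() < PRA_PROBABILITY: # no per-row tracking needed
          refresh_neighbors(row)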
@Article{Nathan:2015:AGC,
author = "Ralph Nathan and Daniel J. Sorin",
title = "{Argus-G}: Comprehensive, Low-Cost Error Detection for
{GPGPU} Cores",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "13--16",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2298391",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "We have developed and evaluated Argus-G, an error
detection scheme for general purpose GPU (GPGPU) cores.
Argus-G is a natural extension of the Argus error
detection scheme for CPU cores, and we demonstrate how
to modify Argus such that it is compatible with GPGPU
cores. Using an RTL prototype, we experimentally show
that Argus-G can detect the vast majority of injected
errors at relatively low performance, area, and power
costs.",
acknowledgement = ack-nhfb,
affiliation = "Nathan, R (Reprint Author), Duke Univ, Durham, NC
27708 USA. Nathan, Ralph; Sorin, Daniel J., Duke Univ,
Durham, NC 27708 USA.",
author-email = "ralph.nathan@duke.edu sorin@ee.duke.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Argus-G; Benchmark testing; Conferences; CPU cores;
error detection; fault tolerance; general purpose GPU
cores; GPGPU cores; Graphics processing units; graphics
processing units; Graphics processors; Hardware;
Hardware design languages; Instruction sets; low-cost
error detection; Registers",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Nathan:2015:AGC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{O:2015:CCI,
author = "Seongil O and Sanghyuk Kwon and Young Hoon Son and
Yujin Park and Jung Ho Ahn",
title = "{CIDR}: a Cache Inspired Area-Efficient {DRAM}
Resilience Architecture against Permanent Faults",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2324894",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "area overhead; area-efficient DRAM resilience
architecture; Arrays; augmented cache; bit errors;
Bloom filter; cache data array; cache storage; cache
tags; cache-inspired DRAM resilience architecture;
CIDR; Circuit faults; cost-sensitive main-memory DRAM
devices; data structures; Decoding; device failure
rates; DRAM arrays; DRAM chips; DRAM, error resilience,
permanent faults, row and column sparing, Bloom filter,
DRAM-side caching; energy overhead minimization; error
statistics; fault diagnosis; faulty cells; I/O pads;
memory architecture; permanent faults; processor-memory
interfaces; Random access memory; Resilience;
single-bit error rates; Testing; testing phase",
}
@Article{Seongil:2015:CCI,
author = "O. Seongil and Sanghyuk Kwon and Young Hoon Son and
Yujin Park and Jung Ho Ahn",
title = "{CIDR}: a Cache Inspired Area-Efficient {DRAM}
Resilience Architecture against Permanent Faults",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "17--20",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2324894",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Thu Jun 20 17:18:18 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Faulty cells have become major problems in
cost-sensitive main-memory DRAM devices. Conventional
solutions to reduce device failure rates due to cells
with permanent faults, such as populating spare rows
and relying on error-correcting codes, have had limited
success due to high area overheads. In this paper, we
propose CIDR, a novel cache-inspired DRAM resilience
architecture, which substantially reduces the area
overhead of handling bit errors from these faulty
cells. A DRAM device adopting CIDR has a small cache
next to its I/O pads to replace accesses to the
addresses that include the faulty cells with ones that
correspond to the cache data array. We minimize the
energy overhead of accessing the cache tags for every
read or write by adding a Bloom filter in front of the
cache. The augmented cache is programmed once during
the testing phase and is out of the critical path on
normal accesses because both cache and DRAM arrays are
accessed in parallel, making CIDR transparent to
existing processor-memory interfaces. Compared to the
conventional architecture relying on spare rows, CIDR
lowers the area overhead of achieving equal failure
rates over a wide range of single-bit error rates, such
as 23.6x lower area overhead for a bit-error rate of
$10^{-5}$ and a device failure rate of $10^{-3}$.",
acknowledgement = ack-nhfb,
affiliation = "Seongil, O (Reprint Author), Seoul Natl Univ, Dept
Transdisciplinary Studies, Seoul, South Korea. Seongil,
O.; Kwon, Sanghyuk; Son, Young Hoon; Park, Yujin; Ahn,
Jung Ho, Seoul Natl Univ, Dept Transdisciplinary
Studies, Seoul, South Korea.",
author-email = "swdfish@snu.ac.kr kkwon114@snu.ac.kr yhson96@snu.ac.kr
comesay@snu.ac.kr gajh@snu.ac.kr",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bloom filter; DRAM; DRAM-side caching; error
resilience; permanent faults; row and column sparing",
number-of-cited-references = "13",
oa = "Bronze",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Seongil:2015:CCI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
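The CIDR access path described in the entries above, with a Bloom filter
screening accesses before the remap cache's tags are consulted, can be
sketched as follows. The Bloom filter here is a textbook software
implementation and the remap cache a plain dictionary; both are
illustrative stand-ins for the paper's hardware structures, and all names
are hypothetical.

  # Sketch of a Bloom-filter-guarded remap cache for faulty DRAM addresses (illustrative).
  import hashlib

  class BloomFilter:
      def __init__(self, num_bits=1024, num_hashes=3):
          self.bits = [False] * num_bits
          self.num_hashes = num_hashes

      def _positions(self, key):
          for i in range(self.num_hashes):
              digest = hashlib.sha256(f"{i}:{key}".encode()).hexdigest()
              yield int(digest, 16) % len(self.bits)

      def add(self, key):
          for pos in self._positions(key):
              self.bits[pos] = True

      def might_contain(self, key):          # no false negatives, rare false positives
          return all(self.bits[pos] for pos in self._positions(key))

  faulty_filter = BloomFilter()
  remap_cache = {}                           # faulty address -> replacement data

  def program_faulty_address(addr, data):    # done once, during the testing phase
      faulty_filter.add(addr)
      remap_cache[addr] = data

  def read(addr, dram_read):
      # DRAM and the remap cache are accessed in parallel in CIDR; the Bloom
      # filter lets most accesses skip the cache-tag lookup entirely.
      if faulty_filter.might_contain(addr) and addr in remap_cache:
          return remap_cache[addr]
      return dram_read(addr)

  program_faulty_address(0xDEAD, data="repaired")
  print(read(0xDEAD, dram_read=lambda a: "raw"),
        read(0xBEEF, dram_read=lambda a: "raw"))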
@Article{Gupta:2015:CEO,
author = "Ujjwal Gupta and Umit Y. Ogras",
title = "Constrained Energy Optimization in Heterogeneous
Platforms Using Generalized Scaling Models",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "21--25",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2326603",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Platform energy consumption and responsiveness are two
major considerations for mobile systems since they
determine the battery life and user satisfaction,
respectively. We first present models for power
consumption, response time and energy consumption of
heterogeneous mobile platforms. Then, we use these
models to optimize the energy consumption of baseline
platforms under response time and temperature
constraints with and without introducing new resources.
We show that the optimal design choices depend on
the dynamic power management algorithm, and adding new
resources is more energy efficient than scaling
existing resources alone.",
acknowledgement = ack-nhfb,
affiliation = "Gupta, U (Reprint Author), Arizona State Univ, Sch
Elect Comp \& Energy Engn, Tempe, AZ 85281 USA. Gupta,
Ujjwal; Ogras, Umit Y., Arizona State Univ, Sch Elect
Comp \& Energy Engn, Tempe, AZ 85281 USA.",
author-email = "ujjwal@asu.edu umit@asu.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "battery life determine; Computers; constrained energy
optimization; dynamic power management algorithm;
Energy consumption; Energy optimization; generalized
scaling models; heterogeneous architectures;
heterogeneous mobile platforms; Mobile communication;
mobile computing; mobile platforms; mobile systems;
MpSoC; Multicore processing; Optimization; performance;
platform energy consumption; power aware computing;
power consumption; Power demand; response time;
temperature constraints; Time factors; user
satisfaction",
keywords-plus = "AMDAHLS LAW; MULTIAMDAHL; ACCELERATOR; MANAGEMENT;
CPU; ERA",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Gupta:2015:CEO",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Farmahini-Farahani:2015:DAA,
author = "Amin Farmahini-Farahani and Jung Ho Ahn and Katherine
Morrow and Nam Sung Kim",
title = "{DRAMA}: an Architecture for Accelerated Processing
Near Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "26--29",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2333735",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Improving energy efficiency is crucial for both mobile
and high-performance computing systems while a large
fraction of total energy is consumed to transfer data
between storage and processing units. Thus, reducing
data transfers across the memory hierarchy of a
processor (i.e., off-chip memory, on-chip caches, and
register file) can greatly improve the energy
efficiency. To this end, we propose an architecture,
DRAMA, that 3D-stacks coarse-grain reconfigurable
accelerators (CGRAs) atop off-chip DRAM devices. DRAMA
does not require changes to the DRAM device
architecture, apart from through-silicon vias (TSVs)
that connect the DRAM device's internal I/O bus to the
CGRA layer. We demonstrate that DRAMA can reduce the
energy consumption to transfer data across the memory
hierarchy by 66--95 percent while achieving speedups of
up to 18x over a commodity processor.",
acknowledgement = ack-nhfb,
affiliation = "Farmahini-Farahani, A (Reprint Author), Univ
Wisconsin, Dept Elect \& Comp Engn, 1415 Johnson Dr,
Madison, WI 53706 USA. Farmahini-Farahani, Amin;
Morrow, Katherine; Kim, Nam Sung, Univ Wisconsin, Dept
Elect \& Comp Engn, Madison, WI 53706 USA. Ahn, Jung
Ho, Seoul Natl Univ, Dept Transdisciplinary Studies,
Seoul 151742, South Korea.",
author-email = "farmahinifar@wisc.edu gajh@snu.ac.kr
kati@engr.wisc.edu nskim3@wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "3D-stacking; 3D-stacks coarse-grain reconfigurable
accelerators; accelerated near memory processing;
Acceleration; accelerator; Arrays; data transfers;
DRAM; DRAM chips; DRAM devices; DRAMA architecture;
dynamic random access memory; energy conservation;
energy consumption reduction; energy efficiency;
energy-efficient computing; high-performance computing
systems; Kernel; memory hierarchy; Memory management;
mobile computing systems; Near memory processing; Near
memory processing, DRAM, 3D-stacking, energy-efficient
computing, accelerator; processing units; Random access
memory; Registers; storage management; storage units;
through-silicon vias; total energy fraction; TSV",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Farmahini-Farahani:2015:DAA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Carlson:2015:EPM,
author = "Trevor E. Carlson and Siddharth Nilakantan and Mark
Hempstead and Wim Heirman",
title = "Epoch Profiles: Microarchitecture-Based Application
Analysis and Optimization",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "30--33",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329873",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The performance of data-intensive applications, when
running on modern multi-and many-core processors, is
largely determined by their memory access behavior. Its
most important contributors are the frequency and
latency of off-chip accesses and the extent to which
long-latency memory accesses can be overlapped with
useful computation or with each other. In this paper we
present two methods to better understand application
and microarchitectural interactions. An epoch profile
is an intuitive way to understand the relationships
between three important characteristics: the on-chip
cache size, the size of the reorder window of an
out-of-order processor, and the frequency of processor
stalls caused by long-latency, off-chip requests
(epochs). By relating these three quantities one can
more easily understand an application's memory
reference behavior and thus significantly reduce the
design space. While epoch profiles help to provide
insight into the behavior of a single application,
developing an understanding of a number of applications
in the presence of area and core count constraints
presents additional challenges. Epoch-based
microarchitectural analysis is presented as a better
way to understand the trade-offs for memory-bound
applications in the presence of these physical
constraints. Through epoch profiling and optimization,
one can significantly reduce the multidimensional
design space for hardware/software optimization through
the use of high-level model-driven techniques.",
acknowledgement = ack-nhfb,
affiliation = "Carlson, TE (Reprint Author), Univ Ghent, Sint
Pietersnieuwstr 41, B-9000 Ghent, East Flanders,
Belgium. Carlson, Trevor E., Univ Ghent, B-9000 Ghent,
East Flanders, Belgium. Nilakantan, Siddharth;
Hempstead, Mark, Drexel Univ, Dept Elect \& Comp Engn,
Bossone Res Ctr, Philadelphia, PA 19104 USA. Heirman,
Wim, Intel Corp, Leuven, Flemish Brabant, Belgium.",
author-email = "trevor.carlson@elis.ugent.be sn446@drexel.edu
mhempstead@drexel.edu wim.heirman@intel.com",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; Computational modeling; Frequency
measurement; memory-level parallelism;
Microarchitecture; Microarchitecture analysis; Out of
order; System-on-chip; visualization",
number-of-cited-references = "6",
oa = "Green Published",
ORCID-numbers = "Carlson, Trevor/0000-0001-8742-134X Nilakantan,
Siddharth/0000-0003-1067-700X Heirman,
Wim/0000-0003-2286-1525",
research-areas = "Computer Science",
researcherid-numbers = "Carlson, Trevor/M-4945-2016",
times-cited = "0",
unique-id = "Carlson:2015:EPM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Power:2015:GGH,
author = "Jason Power and Joel Hestness and Marc S. Orr and Mark
D. Hill and David A. Wood",
title = "{gem5-gpu}: a Heterogeneous {CPU--GPU} Simulator",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "34--36",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2299539",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/pvm.bib",
abstract = "gem5-gpu is a new simulator that models tightly
integrated CPU-GPU systems. It builds on gem5, a
modular full-system CPU simulator, and GPGPU-Sim, a
detailed GPGPU simulator. gem5-gpu routes most memory
accesses through Ruby, which is a highly configurable
memory system in gem5. By doing this, it is able to
simulate many system configurations, ranging from a
system with coherent caches and a single virtual
address space across the CPU and GPU to a system that
maintains separate GPU and CPU physical address spaces.
gem5-gpu can run most unmodified CUDA 3.2 source code.
Applications can launch non-blocking kernels, allowing
the CPU and GPU to execute simultaneously. We present
gem5-gpu's software architecture and a brief
performance validation. We also discuss possible
extensions to the simulator. gem5-gpu is open source
and available at gem5-gpu.cs.wisc.edu.",
acknowledgement = ack-nhfb,
affiliation = "Power, J (Reprint Author), Univ Wisconsin, Dept Comp
Sci, 1210 W Dayton St, Madison, WI 53706 USA. Power,
Jason; Hestness, Joel; Orr, Marc S.; Hill, Mark D.;
Wood, David A., Univ Wisconsin, Dept Comp Sci, Madison,
WI 53706 USA.",
author-email = "powerjg@cs.wisc.edu hestness@cs.wisc.edu
morr@cs.wisc.edu markhill@cs.wisc.edu
david@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Coherence; Computational modeling; Computer
architecture; computer architecture; gem5-gpu
simulator; general-purpose graphics processors;
GPGPUSim; Graphics processing units; graphics
processing units; heterogeneous (hybrid) systems;
heterogeneous CPU-GPU simulator; Kernel; Modeling
techniques; modular full-system CPU simulator;
nonblocking kernels; Object oriented modeling;
Protocols; simulators; software architecture",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "62",
unique-id = "Power:2015:GGH",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Manatunga:2015:HSS,
author = "Dilan Manatunga and Joo Hwan Lee and Hyesoon Kim",
title = "Hardware Support for Safe Execution of Native Client
Applications",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "37--40",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2309601",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Over the past few years, there has been vast growth in
the area of the web browser as an applications
platform. One example of this trend is Google's Native
Client (NaCl) platform, which is a software-fault
isolation mechanism that allows the running of native
x86 or ARM code on the browser. One of the security
mechanisms employed by NaCl is that all branches must
jump to the start of a valid instruction. To meet this
criterion, though, all return instructions
are replaced by a specific branch instruction sequence,
which we call NaCl returns, that are guaranteed to
return to a valid instruction. However, these NaCl
returns lose the advantage of the highly accurate
return-address stack (RAS) in exchange for the less
accurate indirect branch predictor. In this paper, we
propose a NaCl-RAS mechanism that can identify and
accurately predict NaCl returns with 76.9 percent
accuracy on average, compared to 39.5 percent for a
traditional BTB predictor.",
acknowledgement = ack-nhfb,
affiliation = "Manatunga, D (Reprint Author), Georgia Inst Technol,
Sch Comp Sci, Atlanta, GA 30332 USA. Manatunga, Dilan;
Lee, Joo Hwan; Kim, Hyesoon, Georgia Inst Technol, Sch
Comp Sci, Atlanta, GA 30332 USA.",
author-email = "dmanatunga@gatech.edu joohwan.lee@gatech.edu
hyesoon@cc.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; ARM code; Benchmark testing; branch
instruction sequence; branch prediction accuracy; BTB
predictor; Detectors; fault diagnosis; Google;
Hardware; hardware support; NaCl-RAS mechanism; Native
client; native client applications; native x86; online
front-ends; return address prediction; return-address
stack; safe execution; Security; security mechanism;
security of data; Software; software fault isolation;
software-fault isolation mechanism; Web browser",
keywords-plus = "SANDBOX; CODE",
number-of-cited-references = "5",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Manatunga:2015:HSS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Liu:2015:LHP,
author = "Longjun Liu and Chao Li and Hongbin Sun and Yang Hu
and Jingmin Xin and Nanning Zheng and Tao Li",
title = "Leveraging Heterogeneous Power for Improving
Datacenter Efficiency and Resiliency",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "41--45",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2363084",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Power mismatching between supply and demand has
emerged as a top issue in modern datacenters that are
under-provisioned or powered by intermittent power
supplies. Recent proposals are primarily limited to
leveraging uninterruptible power supplies (UPS) to
handle power mismatching, and therefore lack the
capability of efficiently handling the irregular peak
power mismatches. In this paper we propose hPower, the
first heterogeneous energy buffering strategy that
incorporates supercapacitors into existing datacenters
to handle power mismatch. Our technique exploits power
supply diversity and smart load assignment to provide
efficiency-aware and emergency-aware power mismatch
management. We show that hPower could improve energy
efficiency by 30 percent, extend UPS lifetime by 4.3x,
and reduce system downtime by 36 percent. It allows
datacenters to adapt themselves to various power supply
anomalies, thereby improving operational efficiency and
resiliency.",
acknowledgement = ack-nhfb,
affiliation = "Liu, LJ (Reprint Author), Xi An Jiao Tong Univ, Sch
Elect \& Informat Engn, Xian 710049, Peoples R China.
Liu, Longjun; Sun, Hongbin; Xin, Jingmin; Zheng,
Nanning, Xi An Jiao Tong Univ, Sch Elect \& Informat
Engn, Xian 710049, Peoples R China. Li, Chao, Shanghai
Jiao Tong Univ, Dept Comp Sci \& Engn, Shanghai 200030,
Peoples R China. Hu, Yang; Li, Tao, Univ Florida, Dept
Elect \& Comp Engn, Gainesville, FL USA.",
author-email = "longjun.liu@stu.xjtu.edu.cn lichao@cs.sjtu.edu.cn
hsun@mail.xjtu.edu.cn huyang.ece@ufl.edu
jxin@mail.xjtu.edu.cn nnzheng@mail.xjtu.edu.cn
taoli@ece.ufl.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Batteries; computer centres; computer system
implementation; Computer System Implementation;
computer system implementation; data center efficiency;
data center resiliency; efficiency-aware power mismatch
management; emergency-aware power mismatch management;
energy conservation; Energy efficiency; Energy-aware
systems; Energy-Aware Systems; heterogeneous energy
buffering strategy; heterogeneous power; hPower;
performance of systems; Performance of Systems; power
aware computing; Power demand; power mismatching; power
supply anomalies; power supply diversity; Servers;
smart load assignment; Supercapacitors;
supercapacitors; system downtime reduction;
uninterruptible power supplies; Uninterruptible power
systems; UPS",
number-of-cited-references = "16",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Liu:2015:LHP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2015:LNV,
author = "Rui Wang and Wangyuan Zhang and Tao Li and Depei
Qian",
title = "Leveraging Non-Volatile Storage to Achieve Versatile
Cache Optimizations",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "46--49",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2298412",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The efficiency of caches plays a vital role in
microprocessor. In this paper, we introduce a novel and
flexible cache substrate that employs non-volatile yet
versatile SRAM (NV2-SRAM) cell design, which
synergistically integrates new memory devices into the
standard SRAM cells. Our experiments show that it can
achieve a 67 percent energy saving and 3: 1 x
reliability improvement over the SRAM based cache,
outperforming the drowsy cache design in terms of both
power efficiency and reliability. Moreover, the
proposed cache architecture can be used to improve the
performance of prefetching schemes by 10 percent.",
acknowledgement = ack-nhfb,
affiliation = "Wang, R (Reprint Author), Beihang Univ, Sch Comp Sci
\& Engn, State Key Lab Software Dev Environm, Beijing
100191, Peoples R China. Wang, Rui; Qian, Depei,
Beihang Univ, Sch Comp Sci \& Engn, State Key Lab
Software Dev Environm, Beijing 100191, Peoples R China.
Zhang, Wangyuan; Li, Tao, Univ Florida, ECE Dept,
Gainesville, FL 32611 USA.",
author-email = "rui.wang@jsi.buaa.edu.cn zhangwangyuan@gmail.com
taoli@ece.ufl.edu depeiq@buaa.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache architecture; Cache memories; cache storage;
Computer architecture; energy saving; flexible cache
substrate; low-power design; Magnetic tunneling; memory
structures; microprocessor; Microprocessors;
Nonvolatile memory; nonvolatile storage; nonvolatile
yet versatile SRAM cell design; NV2-SRAM cell design;
Prefetching; prefetching schemes; reliability
improvement; SRAM; SRAM based cache; SRAM cells; SRAM
chips; storage management; versatile cache
optimizations",
number-of-cited-references = "19",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Wang:2015:LNV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Mohammadi:2015:DDB,
author = "Milad Mohammadi and Song Han and Tor M. Aamodt and
William J. Dally",
title = "On-Demand Dynamic Branch Prediction",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "50--53",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2330820",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In out-of-order (OoO) processors, speculative
execution with high branch prediction accuracy is
employed to achieve good single thread performance. In
these processors, the branch prediction unit (BPU)
tables are accessed in parallel with the instruction
cache before it is known whether a fetch group contains
branch instructions. For integer applications, we find
85 percent of BPU lookups are done for non-branch
operations and of the remaining lookups, 42 percent are
done for highly biased branches that can be predicted
statically with high accuracy. We evaluate on-demand
branch prediction (ODBP), a novel technique that uses
compiler generated hints to identify those instructions
that can be more accurately predicted statically to
eliminate unnecessary BPU lookups. We evaluate an
implementation of ODBP that combines static and dynamic
branch prediction. For a four wide superscalar
processor, ODBP delivers as much as 9 percent
improvement in average energy-delay (ED) product, 7
percent core average energy saving, and 3 percent
speedup. ODBP also enables the use of large BPUs for a
given power budget.",
acknowledgement = ack-nhfb,
affiliation = "Mohammadi, M (Reprint Author), Stanford Univ, Dept
Elect Engn, Stanford, CA 94305 USA. Mohammadi, Milad;
Han, Song; Dally, William J., Stanford Univ, Dept Elect
Engn, Stanford, CA 94305 USA. Aamodt, Tor M., Univ
British Columbia, Dept Elect \& Comp Engn, Vancouver,
BC V6T 1Z4, Canada.",
author-email = "milad@stanford.edu songhan@stanford.edu
aamodt@ece.ubc.ca dally@stanford.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; ahead prediction; BPU lookup; branch
instruction; branch prediction accuracy; branch
prediction unit table; cache storage; compiler
generated hints; Computer architecture; core average
energy saving; ED product; Energy efficiency;
energy-delay product; energy-delay product
optimization; Equations; instruction cache; instruction
sets; Mathematical model; nonbranch operation; ODBP;
on-demand branch prediction; on-demand dynamic branch
prediction; OoO processor; out-of-order processor;
parallel processing; Pipelines; power budget; program
compilers; Program processors; single thread
performance; speculative execution; static and dynamic
branch prediction hybrid; static branch prediction;
superscalar processor; table lookup; Tin",
keywords-plus = "MICROPROCESSOR; DESIGN",
number-of-cited-references = "27",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Mohammadi:2015:DDB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Azriel:2015:PMT,
author = "Leonid Azriel and Avi Mendelson and Uri Weiser",
title = "Peripheral Memory: a Technique for Fighting Memory
Bandwidth Bottleneck",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "54--57",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2319077",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory bottleneck has always been a major cause for
limiting the performance of computer systems. While in
the past latency was the major concern, today, lack of
bandwidth becomes a limiting factor as well, as a
result of exploiting more parallelism with the growing
number of cores per die, which intensifies the pressure
on the memory bus. In such an environment, any
additional traffic to memory, such as the I/O traffic
may lead to degradation of the overall performance of
the system. This work introduces the concept of
Peripheral Memory, a software controlled memory that
resides in the I/O domain and can be used for
offloading I/O traffic from CPU memory. The Peripheral
Memory handles `I/O exclusive data', data originated
and terminated at I/O domain, and which does not need
any processing by the CPU. The paper analyses the
impact of I/O traffic on the overall performance of the
current systems and demonstrates that in numerous
applications, I/O exclusive data occupies a major part
of memory bandwidth, as a result degrading CPU processing
performance and increasing power. The paper considers
four different implementations of the Peripheral
Memory: pageable, pinned, non-coherent split-traffic
and copy-on-access. Our full-system simulator indicates
that non-coherent split traffic configuration is the
most efficient implementation, which can provide up to
four times speedup in the I/O processing rate for
typical I/O-intensive applications. In addition, based
on a power model and measurement tools, the paper
demonstrates that the Peripheral Memory in a server
system can lead to a reduction of tens of watts in the
overall system power consumption, or 10--20 percent of
the system power budget.",
acknowledgement = ack-nhfb,
affiliation = "Azriel, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-32000 Haifa, Israel.
Azriel, Leonid; Mendelson, Avi; Weiser, Uri, Technion
Israel Inst Technol, Dept Elect Engn, IL-32000 Haifa,
Israel.",
author-email = "leonida@tx.technion.ac.il
avi.mendelson@tce.technion.ac.il
uri.weiser@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Bandwidth; bandwidth allocation; Benchmark testing;
computer system performance; CPU memory; full-system
simulator; I/O domain; I/O traffic offloading;
input/output devices; Instruction sets; interconnection
architectures; main memory; memory bandwidth
bottleneck; memory bus; Memory management; parallelism;
performance evaluation; Performance evaluation;
peripheral memory; Power demand; Power measurement;
server system; software controlled memory; storage
management; system buses",
keywords-plus = "NETWORK; I/O",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "1",
unique-id = "Azriel:2015:PMT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Wang:2015:PTM,
author = "Zhaoguo Wang and Han Yi and Ran Liu and Mingkai Dong
and Haibo Chen",
title = "Persistent Transactional Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "58--61",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329832",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper proposes persistent transactional memory
(PTM), a new design that adds durability to
transactional memory (TM) by incorporating the
emerging non-volatile memory (NVM). PTM dynamically
tracks transactional updates to cache lines to ensure
the ACI (atomicity, consistency and isolation)
properties during cache flushes and leverages an undo
log in NVM to ensure PTM can always consistently
recover transactional data structures from a machine
crash. This paper describes the PTM design based on
Intel's restricted transactional memory. A preliminary
evaluation using a concurrent key/value store and a
database with a cache-based simulator shows that the
additional cache line flushes are small.",
acknowledgement = ack-nhfb,
affiliation = "Wang, ZG (Reprint Author), Shanghai Jiao Tong Univ,
Shanghai Key Lab Scalable Comp \& Syst, Shanghai
200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
Univ, Shanghai Key Lab Scalable Comp \& Syst, Shanghai
200030, Peoples R China. Wang, Zhaoguo; Yi, Han; Liu,
Ran; Dong, Mingkai; Chen, Haibo, Shanghai Jiao Tong
Univ, Inst Parallel \& Distributed Syst, Shanghai
200030, Peoples R China.",
author-email = "tigerwang1986@gmail.com ken.yihan1990@gmail.com
naruilone@gmail.com mingkaidong@gmail.com
haibochen@sjtu.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "ACI properties; Batteries; cache line flushes; cache
storage; cache-based simulator; Computer crashes; Data
structures; Databases; Hardware; Hardware transactional
memory; non-volatile random access memory; Nonvolatile
memory; nonvolatile memory; NVM; persistent
transactional memory; PTM design; Registers",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Wang:2015:PTM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Gibert:2015:PSR,
author = "Enric Gibert and Raul Mart{\'\i}nez and Carlos
Madriles and Josep M. Codina",
title = "Profiling Support for Runtime Managed Code: Next
Generation Performance Monitoring Units",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "62--65",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2321398",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Given the increase of runtime managed code
environments in desktop, server, and mobile segments,
agile, flexible, and accurate performance monitoring
capabilities are required in order to perform wise code
transformations and optimizations. Common profiling
strategies, mainly based on instrumentation and current
performance monitoring units (PMUs), are not adequate
and new innovative designs are necessary. In this
paper, we present the desired characteristics of what
we call next generation PMUs and advocate for
hardware/software collaborative approaches where
hardware implements the profiling hooks and mechanisms
and software implements the complex heuristics. We then
propose a first design in which the hardware uses a
small, yet flexible table to profile specific code
regions and the software decides what/when/how to
profile. This first design meets all of the required
features, and we intend it as the seed for future PMU
extensions that enable novel dynamic code
transformations and optimizations.",
acknowledgement = ack-nhfb,
affiliation = "Gibert, E (Reprint Author), Intel Corp, Intel Labs,
Intel Barcelona Res Ctr IBRC, Edifici Nexus 2, Planta
0-D, Jordi Girona 29, Barcelona, Spain. Gibert, Enric;
Martinez, Raul; Madriles, Carlos; Codina, Josep M.,
Intel Corp, Intel Labs, Intel Barcelona Res Ctr IBRC,
Barcelona, Spain.",
author-email = "enric.gibert.codina@intel.com raul.martinez@intel.com
carlos.madriles.gimeno@intel.com
josep.m.codina@intel.com",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "dynamic code optimizations; dynamic code
transformations; groupware; Hardware; hardware-software
collaborative approaches; instrumentation; Instruments;
just in time (JIT) compiler; Monitoring; next
generation performance monitoring units; optimising
compilers; Optimization; Performance monitoring unit
(PMU); Phasor measurement units; PMUs; profiling;
profiling hooks; profiling support; Runtime; runtime
managed code; runtime managed code environments;
Software; software performance evaluation; system
monitoring",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Gibert:2015:PSR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{You:2015:QSA,
author = "Daecheol You and Ki-Seok Chung",
title = "Quality of Service-Aware Dynamic Voltage and Frequency
Scaling for Embedded {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "66--69",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2319079",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Dynamic voltage and frequency scaling (DVFS) is a key
technique for reducing processor power consumption in
mobile devices. In recent years, mobile system-on-chips
(SoCs) have supported DVFS for embedded graphics
processing units (GPUs) as the processing power of
embedded GPUs has been increasing steadily. The major
challenge of applying DVFS to a processing unit is to
meet the quality of service (QoS) requirement while
achieving a reasonable power reduction. In the case of
GPUs, the QoS requirement can be specified as the
frames per second (FPS) that the target GPU should
achieve. The proposed DVFS technique ensures consistent
GPU performance by scaling the operating clock
frequency in a way that maintains a uniform FPS.",
acknowledgement = ack-nhfb,
affiliation = "You, D (Reprint Author), Hanyang Univ, Dept Elect Comp
\& Commun Engn, Embedded Syst Chip Lab, Seoul 133791,
South Korea. You, Daecheol; Chung, Ki-Seok, Hanyang
Univ, Dept Elect Comp \& Commun Engn, Embedded Syst
Chip Lab, Seoul 133791, South Korea.",
author-email = "khsrdc@hanyang.ac.kr kchung@hanyang.ac.kr",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; Clocks; Correlation; DVFS; dynamic
voltage scaling; embedded GPU; Energy consumption;
energy-aware systems; frequency scaling; graphics
processing unit; Graphics processing units; graphics
processing units; Graphics processors;
hardware/software interfaces; low-power design; mobile
device; mobile system-on-chips; operating clock
frequency; power aware computing; processor power
consumption; Quality of service; quality of service;
SoC; System-on-chip; system-on-chip",
number-of-cited-references = "9",
research-areas = "Computer Science",
times-cited = "9",
unique-id = "You:2015:QSA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lee:2015:RDA,
author = "Sungjin Lee and Jihong Kim and Arvind",
title = "Refactored Design of {I/O} Architecture for Flash
Storage",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "70--74",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2329423",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Flash storage devices behave quite differently from
hard disk drives (HDDs); a page on flash has to be
erased before it can be rewritten, and the erasure has
to be performed on a block which consists of a large
number of contiguous pages. It is also important to
distribute writes evenly among flash blocks to avoid
premature wearing. To achieve interoperability with
existing block I/O subsystems for HDDs, NAND flash
devices employ an intermediate software layer, called
the flash translation layer (FTL), which hides these
differences. Unfortunately, FTL implementations require
powerful processors with a large amount of DRAM in
flash controllers and also incur many unnecessary I/O
operations which degrade flash storage performance and
lifetime. In this paper, we present a refactored design
of I/O architecture for flash storage which
dramatically increases storage performance and lifetime
while decreasing the cost of the flash controller. In
comparison with page-level FTL, our preliminary
experiments show a reduction of 19 percent in I/O
operations, improvement of I/O performance by 9 percent
and storage lifetime by 36 percent. In addition, our
scheme uses only 1/128 of the DRAM memory in the flash
controller.",
acknowledgement = ack-nhfb,
affiliation = "Lee, S (Reprint Author), MIT, 77 Massachusetts Ave,
Cambridge, MA 02139 USA. Lee, Sungjin; Arvind, MIT,
Cambridge, MA 02139 USA. Kim, Jihong, Seoul Natl Univ,
Sch Comp Sci \& Engn, Seoul, South Korea.",
author-email = "chamdoo@gmail.com jihong@davinci.snu.ac.kr
arvind@csail.mit.edu",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Benchmark testing; block I/O subsystems; Computer
architecture; DRAM chips; DRAM memory; file systems;
flash blocks; flash memories; flash storage; flash
translation layer; hard disk drives; HDDs; I/O
architecture; I/O architectures; input-output programs;
intermediate software layer; interoperability; NAND
circuits; NAND flash devices; NAND flash memory;
page-level FTL; Performance evaluation; premature
wearing; Random access memory; Runtime; Storage
management; Storage systems",
number-of-cited-references = "15",
research-areas = "Computer Science",
times-cited = "7",
unique-id = "Lee:2015:RDA",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Yuan:2015:SGR,
author = "Fengkai Yuan and Zhenzhou Ji and Suxia Zhu",
title = "Set-Granular Regional Distributed Cooperative
Caching",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "75--78",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2319258",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The last level cache (LLC) in private configurations
offer lower latency and isolation but extinguishes the
possibility of sharing underutilized cache resources.
Cooperative Caching (CC) provides capacity sharing by
spilling a line evicted from one cache to another.
Current studies focus on efficient capacity sharing,
while the adaptability of CC to manycore environment
deserves more attentions. In this paper, we present
Set-granular Regional Distributed Cooperative Caching
to optimize CC in manycore CMPs with private LLCs. We
achieve efficient capacity sharing by a low-traffic
global receiver tracking mechanism and provide a method
to manage set-grain cache state transitions for
exclusive LLCs. Experiment results show that SRDCC
performs better than baseline system, running different
workloads varying in receiver-spiller number and
distribution, in execution time up to 15.55 percent and
memory access up to 40.25 percent, at a negligible cost
of network traffics (6.21 percent more than baseline
system at worst).",
acknowledgement = ack-nhfb,
affiliation = "Yuan, FK (Reprint Author), Harbin Inst Technol, Sch
Comp Sci \& Technol, Harbin 150006, Heilongjiang,
Peoples R China. Yuan, Fengkai; Ji, Zhenzhou; Zhu,
Suxia, Harbin Inst Technol, Sch Comp Sci \& Technol,
Harbin 150006, Heilongjiang, Peoples R China.",
author-email = "yuan.fengkai@gmail.com jizhenzhou@hit.edu.cn",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache coherence protocol; cache resource sharing;
Cache storage; cache storage; capacity sharing; CC;
chip multiprocessors; cooperative caching; Cooperative
caching; last level cache; LLC; manycore CMP;
multiprocessing systems; on-chip networks; private
cache configuration; Protocols; Radiation detectors;
receiver-spiller distribution; receiver-spiller number;
Receivers; set-grain cache state transition;
set-granular regional distributed cooperative caching;
Telecommunication traffic; Tiled CMP",
keywords-plus = "CHIP MULTIPROCESSORS",
number-of-cited-references = "9",
ORCID-numbers = "Yuan, Fengkai/0000-0003-2615-8642",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Yuan:2015:SGR",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Lee:2015:SSI,
author = "Junghee Lee and Youngjae Kim and Jongman Kim and Galen
M. Shipman",
title = "Synchronous {I/O} Scheduling of Independent Write
Caches for an Array of {SSDs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "79--82",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2298394",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Solid-state drives (SSD) offer a significant
performance improvement over the hard disk drives
(HDD), however, it can exhibit a significant variance
in latency and throughput due to internal garbage
collection (GC) process on the SSD. When the SSDs are
configured in a RAID, the performance variance of
individual SSDs could significantly degrade the overall
performance of the RAID of SSDs. The internal cache on
the RAID controller can help mitigate the performance
variability issues of SSDs in the array; however, the
state-of-the-art cache algorithm of the RAID controller
does not consider the characteristics of SSDs. In this
paper, we examine the most recent write cache algorithm
for the array of disks, and propose a synchronous
independent write cache (SIW) algorithm. We also
present a pre-parity-computation technique for the RAID
of SSDs with parity computations, which calculates
parities of blocks in advance before they are stored in
the write cache. With this new technique, we propose a
complete paradigm shift in the design of write cache.
In our evaluation study, workloads dominated by large
write requests show up to about 50 and 20 percent
improvements in average response times on RAID-0 and
RAID-5, respectively, as compared to the state-of-the-art
write cache algorithm.",
acknowledgement = ack-nhfb,
affiliation = "Lee, J (Reprint Author), Univ Texas San Antonio, San
Antonio, TX 78229 USA. Lee, Junghee, Univ Texas San
Antonio, San Antonio, TX 78229 USA. Kim, Youngjae, Ajou
Univ, Suwon 441749, South Korea. Kim, Jongman, Georgia
Inst Technol, Atlanta, GA 30332 USA. Shipman, Galen M.,
Oak Ridge Natl Lab, Oak Ridge, TN USA.",
author-email = "junghee.lee@utsa.edu youkim@gmail.com
jkim@ece.gatech.edu gshipman@ornl.gov",
da = "2019-06-20",
doc-delivery-number = "CL1QK",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Algorithm design and analysis; Arrays; cache storage;
Delays; disks array; flash memory; GC process; hard
disk drives; HDD; I/O scheduling; independent write
caches; input-output programs; internal cache; internal
garbage collection process; memory architecture;
pre-parity-computation technique; RAID; RAID
controller; Redundant array of independent disks
(RAID); Redundant Array of Independent Disks (RAID);
Redundant array of independent disks (RAID);
scheduling; SIW algorithm; solid-state drive (SSD);
Solid-State Drive (SSD); solid-state drive (SSD);
solid-state drives; SSD; Strips; Synchronization;
synchronous I/O scheduling; synchronous independent
write cache algorithm; Time factors; write cache; Write
cache; write cache; write cache design; write
requests",
number-of-cited-references = "8",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Lee:2015:SSI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Anonymous:2015:RSW,
author = "Anonymous",
title = "Rock Stars of Wearables",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "83--83",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2447192",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:RSC,
author = "Anonymous",
title = "Rock Stars of Cybersecurity 2015 Conference",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "84--84",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2447191",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:TCa,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C1--C1",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446391",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAa,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}
Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C2--C2",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446392",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAb,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}}
Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C3--C3",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446393",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICSa,
author = "Anonymous",
title = "{IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "1",
pages = "C4--C4",
month = jan # "\slash " # jun,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2446394",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Shi:2015:CLM,
author = "Qingchuan Shi and Henry Hoffmann and Omer Khan",
title = "A Cross-Layer Multicore Architecture to Tradeoff
Program Accuracy and Resilience Overheads",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "85--89",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2365204",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "To protect multicores from soft-error perturbations,
resiliency schemes have been developed with high
coverage but high power/performance overheads (similar
to 2x). We observe that not all soft-errors affect
program correctness, some soft-errors only affect
program accuracy, i.e., the program completes with
certain acceptable deviations from soft-error free
outcome. Thus, it is practical to improve processor
efficiency by trading off resilience overheads with
program accuracy. We propose the idea of declarative
resilience that selectively applies resilience schemes
to both crucial and non-crucial code, while ensuring
program correctness. At the application level, crucial
and non-crucial code is identified based on its impact
on the program outcome. The hardware collaborates with
software support to enable efficient resilience with
100 percent soft-error coverage. Only program accuracy
is compromised in the worst-case scenario of a
soft-error strike during non-crucial code execution.
For a set of multithreaded benchmarks, declarative
resilience improves completion time by an average of 21
percent over a state-of-the-art hardware resilience
scheme that protects all executed code. Its performance
overhead is approximately 1.38x over a multicore that
does not support resilience.",
acknowledgement = ack-nhfb,
affiliation = "Shi, QC (Reprint Author), Univ Connecticut, Dept Elect
\& Comp Engn, Storrs, CT 06269 USA. Shi, Qingchuan;
Khan, Omer, Univ Connecticut, Dept Elect \& Comp Engn,
Storrs, CT 06269 USA. Hoffmann, Henry, Univ Chicago,
Dept Comp Sci, Chicago, IL 60637 USA.",
author-email = "qingchuan.shi@uconn.edu hankhoffmann@cs.uchicago.edu
khan@uconn.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accuracy; Benchmark testing; code execution;
Instruction sets; multi-threading; multicore
architecture; Multicore processing; multicores;
multithreaded benchmark; program accuracy; Resilience;
resilience overhead; Soft errors; soft-error
perturbation; soft-errors; software architecture;
software fault tolerance",
number-of-cited-references = "23",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Shi:2015:CLM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Zheng:2015:ACC,
author = "Zhong Zheng and Zhiying Wang and Mikko Lipasti",
title = "Adaptive Cache and Concurrency Allocation on
{GPGPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "90--93",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2359882",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory bandwidth is critical to GPGPU performance.
Exploiting locality in caches can better utilize memory
bandwidth. However, memory requests issued by excessive
threads cause cache thrashing and saturate memory
bandwidth, degrading performance. In this paper, we
propose adaptive cache and concurrency allocation (CCA)
to prevent cache thrashing and improve the utilization
of bandwidth and computational resources, hence
improving performance. According to the locality and
reuse distance of access patterns in a GPGPU program,
warps on a stream multiprocessor are dynamically
divided into three groups: cached, bypassed, and
waiting. The data cache accommodates the footprint of
cached warps. Bypassed warps cannot allocate cache
lines in the data cache, which prevents cache
thrashing, but they are able to take advantage of
available memory bandwidth and computational resources.
Waiting warps are de-scheduled. Experimental results
show that adaptive CCA can significantly improve
benchmark performance, with an 80 percent harmonic mean
IPC improvement over the baseline.",
acknowledgement = ack-nhfb,
affiliation = "Zheng, Z (Reprint Author), Natl Univ Def Technol,
State Key Lab High Performance Comp, Changsha, Hunan,
Peoples R China. Zheng, Zhong; Wang, Zhiying, Natl Univ
Def Technol, State Key Lab High Performance Comp,
Changsha, Hunan, Peoples R China. Zheng, Zhong; Wang,
Zhiying, Natl Univ Def Technol, Sch Comp, Changsha,
Hunan, Peoples R China. Lipasti, Mikko, Univ Wisconsin,
Dept Elect \& Comp Engn, Madison, WI 54706 USA.",
author-email = "zheng\_zhong@nudt.edu.cn zywang@nudt.edu.cn
mikko@engr.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "CSC; 863 Program [2012AA010905]; NSFC
[61070037, 61272143, 61272144, 61103016, 61202121];
NUDT [B120607]; RFDP [20114307120013]; NSF
[CCF-1318298]",
funding-text = "This work was partially supported by CSC, 863 Program
(2012AA010905), NSFC (61070037, 61272143, 61272144,
61103016, 61202121), NUDT(B120607), RFDP
(20114307120013), and NSF (CCF-1318298).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "access patterns; adaptive cache-and-concurrency
allocation; Bandwidth; bandwidth utilization
improvement; benchmark performance improvement;
Benchmark testing; bypassed warps; cache; cache lines;
cache locality; Cache memory; cache storage; cache
thrashing prevention; cached warps; CCA; computational
resource utilization improvement; concurrency;
concurrency control; Concurrent computing; GPGPU; GPGPU
performance improvement; graphics processing units;
harmonic mean IPC improvement; Instruction sets; memory
bandwidth saturation; multi-threading; multiprocessing
systems; performance evaluation; Resource management;
reuse distance; stream multiprocessor; waiting warp
descheduling",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "4",
unique-id = "Zheng:2015:ACC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Nowatzki:2015:GBP,
author = "Tony Nowatzki and Venkatraman Govindaraju and
Karthikeyan Sankaralingam",
title = "A Graph-Based Program Representation for Analyzing
Hardware Specialization Approaches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "94--98",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2476801",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Hardware specialization has emerged as a promising
paradigm for future microprocessors. Unfortunately, it
is natural to develop and evaluate such architectures
within end-to-end vertical silos spanning application,
language/compiler, hardware design and evaluation
tools, leaving little opportunity for
cross-architecture analysis and innovation. This paper
develops a novel program representation suitable for
modeling heterogeneous architectures with specialized
hardware, called the transformable dependence graph
(TDG), which combines semantic information about
program properties and low-level hardware events in a
single representation. We demonstrate, using four
example architectures from the literature, that the TDG
is a feasible, simple, and accurate modeling technique
for transparent specialization architectures, enabling
cross-domain comparison and design-space exploration.",
acknowledgement = ack-nhfb,
affiliation = "Nowatzki, T (Reprint Author), Univ Wisconsin, Dept
Comp Sci, 1210 W Dayton St, Madison, WI 53706 USA.
Nowatzki, Tony; Govindaraju, Venkatraman;
Sankaralingam, Karthikeyan, Univ Wisconsin, Dept Comp
Sci, Madison, WI 53706 USA.",
author-email = "tjn@cs.wisc.edu venkatra@cs.wisc.edu
karu@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Acceleration; accelerators; computer architecture;
Computer architecture; dependence graphs; graph theory;
graph-based program representation; Hardware
specialization; hardware specialization approach;
heterogeneous architecture modeling; Load modeling;
Microarchitecture; microprocessors; Microprocessors;
modelling; program representation; Specialization;
Specialization, accelerators, modelling, program
representation, dependence graphs; TDG; transformable
dependence graph; Transforms",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Nowatzki:2015:GBP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Kim:2015:PEM,
author = "Seung Hun Kim and Dohoon Kim and Changmin Lee and Won
Seob Jeong and Won Woo Ro and Jean-Luc Gaudiot",
title = "A Performance-Energy Model to Evaluate Single Thread
Execution Acceleration",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "99--102",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2368144",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "It is well known that the cost of executing the
sequential portion of a program will limit and
sometimes even eclipse the gains brought by processing
in parallel the rest of the program. This means that
serious consideration should be brought to bear on
accelerating the execution of this unavoidable
sequential part. Such acceleration can be done by
boosting the operating frequency in a symmetric
multicore processor. In this paper, we derive a
performance and power model to describe the
implications of this approach. From our model, we show
that the ratio of performance over energy during the
sequential part improves with an increase in the number
of cores. In addition, we demonstrate how to determine
with the proposed model the optimal frequency boosting
ratio which maximizes energy efficiency.",
acknowledgement = ack-nhfb,
affiliation = "Kim, SH (Reprint Author), Yonsei Univ, Sch Elect \&
Elect Engn, Seoul 120749, South Korea. Kim, Seung Hun;
Kim, Dohoon; Lee, Changmin; Jeong, Won Seob; Ro, Won
Woo, Yonsei Univ, Sch Elect \& Elect Engn, Seoul
120749, South Korea. Gaudiot, Jean-Luc, Univ Calif
Irvine, Dept Elect Engn \& Comp Sci, Irvine, CA USA.",
author-email = "kseunghun@gmail.com dohoon.kim@yonsei.ac.kr
exahz@yonsei.ac.kr ws.jeong@yonsei.ac.kr
wro@yonsei.ac.kr gaudiot@uci.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea (NRF) ---
Ministry of Education [2010-0013202]; National Science
Foundation [CCF-1439165]",
funding-text = "This work was supported in part by the Basic Science
Research Program through the National Research
Foundation of Korea (NRF) funded by the Ministry of
Education (2010-0013202) and by the National Science
Foundation, under award CCF-1439165. Any opinions,
findings, and conclusions expressed in this material
are those of the authors and do not necessarily reflect
the views of the sponsors. W. W. Ro is the
corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "energy efficiency; Energy management; energy-aware
systems; Mathematical model; Microprocessors; Multicore
processing; multiprocessing systems; multiprocessor
systems; optimal frequency boosting ratio; parallel
processing; performance evaluation; Performance
evaluation; Performance modeling; performance-energy
model; power aware computing; Power demand; single
thread execution acceleration; symmetric multicore
processor",
keywords-plus = "AMDAHLS LAW; ERA",
number-of-cited-references = "11",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kim:2015:PEM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Song:2015:ARL,
author = "William Song and Saibal Mukhopadhyay and Sudhakar
Yalamanchili",
title = "Architectural Reliability: Lifetime Reliability
Characterization and Management of Many-Core
Processors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "103--106",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2340873",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "This paper presents a lifetime reliability
characterization of many-core processors based on a
full-system simulation of integrated microarchitecture,
power, thermal, and reliability models. Under normal
operating conditions, our model and analysis reveal
that the mean-time-to-failure of the cores on the die
follows a normal distribution. From the processor-level
perspective, the key insight is that reducing the
variance of the distribution can improve lifetime
reliability by avoiding early failures. Based on this
understanding, we present two variance reduction
techniques for proactive reliability management: (i)
proportional dynamic voltage-frequency scaling (DVFS)
and (ii) coordinated thread swapping. A major advantage
of using variance reduction techniques is that the
improvement of system lifetime reliability can be
achieved without adding design margins or spare
components.",
acknowledgement = ack-nhfb,
affiliation = "Song, W (Reprint Author), Georgia Inst Technol, Sch
Elect \& Comp Engn, Atlanta, GA 30332 USA. Song,
William; Mukhopadhyay, Saibal; Yalamanchili, Sudhakar,
Georgia Inst Technol, Sch Elect \& Comp Engn, Atlanta,
GA 30332 USA.",
author-email = "wjhsong@gatech.edu saibal.mukhopadhyay@ece.gatech.edu
sudha.yalamanchili@ece.gatech.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Semiconductor Research Corporation
[2084.001]; IBM/SRC Graduate Fellowship; Sandia
National Laboratories",
funding-text = "This research was supported by the Semiconductor
Research Corporation under task \#2084.001, IBM/SRC
Graduate Fellowship, and Sandia National
Laboratories.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "architectural reliability; Benchmark testing; Computer
architecture; Computer architecture, lifetime
estimation, modeling, semiconductor device reliability,
simulation; coordinated thread swapping; core
mean-time-to-failure; Degradation; design margins;
DVFS; full-system simulation; Gaussian distribution;
integrated circuit design; Integrated circuit
reliability; integrated microarchitecture; lifetime
estimation; lifetime reliability characterization;
many-core processors; Microarchitecture; microprocessor
chips; modeling; multiprocessing systems; normal
operating conditions; power aware computing; power
models; Program processors; proportional dynamic
voltage-frequency scaling; reliability models;
semiconductor device reliability; simulation; spare
components; thermal models; variance reduction
techniques",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "3",
unique-id = "Song:2015:ARL",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Poluri:2015:SET,
author = "Pavan Poluri and Ahmed Louri",
title = "A Soft Error Tolerant Network-on-Chip Router Pipeline
for Multi-Core Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "107--110",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360686",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Network-on-Chip (NoC) paradigm is rapidly evolving
into an efficient interconnection network to handle the
strict communication requirements between the
increasing number of cores on a single chip.
Diminishing transistor size is making the NoC
increasingly vulnerable to both hard faults and soft
errors. This paper concentrates on soft errors in NoCs.
A soft error in an NoC router results in significant
consequences such as data corruption, packet
retransmission and deadlock among others. To this end,
we propose the Soft Error Tolerant NoC Router (STNR)
architecture, which is capable of detecting and
recovering from soft errors occurring in different
control stages of the routing pipeline. STNR exploits
the use of idle cycles inherent in NoC packet routing
pipeline to perform time redundant executions necessary
for soft error tolerance. In doing so, STNR is able to
detect and correct all single transient faults in the
control stages of the pipeline. Simulation results
using PARSEC and SPLASH-2 benchmarks show that STNR is
able to accomplish such high level of soft error
protection with a minimal impact on latency (an
increase of 1.7 and 1.6 percent respectively).
Additionally, STNR incurs an area overhead of 7 percent
and power overhead of 13 percent as compared to the
baseline unprotected router.",
acknowledgement = ack-nhfb,
affiliation = "Poluri, P (Reprint Author), Univ Arizona, Dept Elect
\& Comp Engn, Tucson, AZ 85721 USA. Poluri, Pavan;
Louri, Ahmed, Univ Arizona, Dept Elect \& Comp Engn,
Tucson, AZ 85721 USA.",
author-email = "pavanp@email.arizona.edu louri@email.arizona.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "US National Science Foundation (NSF)
[CNS-1318997, ECCS-0725765, ECCS-1342702,
CCF-1420681]",
funding-text = "This research was supported by US National Science
Foundation (NSF) awards CNS-1318997, ECCS-0725765,
ECCS-1342702 and CCF-1420681.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Computer architecture; data corruption; deadlock;
fault tolerance; hard faults; idle cycles; integrated
circuit reliability; interconnection network; Multicore
processing; multicore systems; multiprocessing systems;
network routing; Network-on-chip; network-on-chip;
Network-on-chip; NoC packet routing pipeline; packet
retransmission; PARSEC; performance; Pipelines; Ports
(Computers); radiation hardening (electronics);
reliability; Resource management; single chip; single
transient faults; soft error; soft error protection;
soft error tolerance; soft error tolerant
network-on-chip router pipeline; soft error tolerant
NoC router architecture; SPLASH-2 benchmarks; STNR
architecture; Switches; time redundant executions;
Transient analysis; transistor size",
number-of-cited-references = "13",
research-areas = "Computer Science",
times-cited = "6",
unique-id = "Poluri:2015:SET",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Xiao:2015:SCD,
author = "Canwen Xiao and Yue Yang and Jianwen Zhu",
title = "A Sufficient Condition for Deadlock-Free Adaptive
Routing in Mesh Networks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "111--114",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2363829",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Deadlock remains a central problem in interconnection
network. In this paper, we establish a new theory of
deadlock-free flow control for k-ary, n-cube mesh
network, which enables the use of any minimal-path
adaptive routing algorithms while avoiding deadlock. We
prove that the proposed flow control algorithm is a
sufficient condition for deadlock freedom in any
minimal path, adaptive routing algorithms on k-ary,
n-cube mesh network.",
acknowledgement = ack-nhfb,
affiliation = "Xiao, CW (Reprint Author), Natl Univ Def Technol,
Changsha, Hunan, Peoples R China. Xiao, Canwen, Natl
Univ Def Technol, Changsha, Hunan, Peoples R China.
Yang, Yue; Zhu, Jianwen, Univ Toronto, Dept Elect \&
Comp Engn, Toronto, ON, Canada.",
author-email = "cwxiao@nudt.edu.cn yyang@eecg.toronto.edu
jzhu@eecg.toronto.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "``863'' program of China [2012AA01A301,
2013AA014301]",
funding-text = "This work is supported by ``863'' program of China
(2012AA01A301, 2013AA014301).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Adaptive systems; Aerospace electronics; concurrency
control; deadlock avoidance; Deadlock-Free;
deadlock-free adaptive routing; deadlock-free flow
control; flow control; interconnection network; k-ary;
k-ary mesh network; mesh networks; Mesh networks;
minimal path routing algorithm; minimal-path adaptive
routing algorithms; Multiprocessor interconnection;
multiprocessor interconnection networks; n-cube mesh
network; Routing; sufficient condition; System
recovery; Wireless mesh networks",
number-of-cited-references = "7",
research-areas = "Computer Science",
researcherid-numbers = "Yang, Yue/N-8370-2019",
times-cited = "1",
unique-id = "Xiao:2015:SCD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
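%%% The letter above states its flow-control condition only abstractly; as
%%% background, the sketch below shows the classic channel-dependency-graph
%%% (CDG) acyclicity check from which such sufficient conditions descend.
%%% The graph encoding and channel names are illustrative assumptions, not
%%% the letter's algorithm.
%%%
%%% # Deadlock check sketch: a routing function is deadlock-free if its
%%% # channel dependency graph is acyclic (classic result; not the letter's
%%% # specific flow-control condition).
%%% def has_cycle(cdg):
%%%     """cdg: dict mapping a channel to the set of channels it may wait on."""
%%%     WHITE, GRAY, BLACK = 0, 1, 2
%%%     color = {}
%%%     def dfs(c):
%%%         color[c] = GRAY
%%%         for nxt in cdg.get(c, ()):
%%%             if color.get(nxt, WHITE) == GRAY:
%%%                 return True              # back edge: a wait-for cycle exists
%%%             if color.get(nxt, WHITE) == WHITE and dfs(nxt):
%%%                 return True
%%%         color[c] = BLACK
%%%         return False
%%%     return any(color.get(c, WHITE) == WHITE and dfs(c) for c in list(cdg))
%%%
%%% assert has_cycle({"c0": {"c1"}, "c1": {"c0"}})       # cyclic: deadlock possible
%%% assert not has_cycle({"c0": {"c1"}, "c1": set()})    # acyclic: deadlock-free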
@Article{Mittal:2015:ATE,
author = "Sparsh Mittal and Jeffrey S. Vetter",
title = "{AYUSH}: a Technique for Extending Lifetime of
{SRAM--NVM} Hybrid Caches",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "115--118",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2355193",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Recently, researchers have explored way-based hybrid
SRAM-NVM (non-volatile memory) last level caches (LLCs)
to bring the best of SRAM and NVM together. However,
the limited write endurance of NVMs restricts the
lifetime of these hybrid caches. We present AYUSH, a
technique to enhance the lifetime of hybrid caches,
which works by using data-migration to preferentially
use SRAM for storing frequently-reused data.
Microarchitectural simulations confirm that AYUSH
achieves larger improvement in lifetime than a previous
technique and also maintains performance and energy
efficiency. For single, dual, and quad-core workloads,
the average increase in cache lifetime with AYUSH is
6.90x, 24.06x, and 47.62x, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Mittal, S (Reprint Author), Oak Ridge Natl Lab, Div
Math \& Comp Sci, Oak Ridge, TN 37831 USA. Mittal,
Sparsh; Vetter, Jeffrey S., Oak Ridge Natl Lab, Div
Math \& Comp Sci, Oak Ridge, TN 37831 USA.",
author-email = "mittals@ornl.gov vetter@ornl.gov",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "AYUSH; Benchmark testing; Cache memory; cache storage;
data-migration; device lifetime; energy efficiency;
Energy loss; hybrid cache; last level caches;
microarchitectural simulation; Non-volatile memory
(NVM); nonvolatile memory; Nonvolatile memory;
Radiation detectors; Random access memory; SRAM; SRAM
chips; SRAM-NVM cache; SRAM-NVM hybrid caches; write
endurance",
keywords-plus = "ENERGY; MODEL",
number-of-cited-references = "17",
ORCID-numbers = "Vetter, Jeffrey/0000-0002-2449-6720 Mittal,
Sparsh/0000-0002-2908-993X",
research-areas = "Computer Science",
times-cited = "11",
unique-id = "Mittal:2015:ATE",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
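%%% The abstract above describes AYUSH only as data-migration that prefers
%%% SRAM ways for frequently-reused blocks. The toy single-set model below
%%% is one illustrative reading of such a policy; the reuse threshold,
%%% victim choice, and fill rule are assumptions, not the paper's mechanism.
%%%
%%% from collections import OrderedDict
%%%
%%% SRAM_WAYS, NVM_WAYS, REUSE_THRESHOLD = 2, 6, 2
%%% sram = OrderedDict()   # tag -> reuse count, kept in LRU order
%%% nvm  = OrderedDict()
%%%
%%% def access(tag):
%%%     if tag in sram:
%%%         sram[tag] += 1; sram.move_to_end(tag); return "sram hit"
%%%     if tag in nvm:
%%%         nvm[tag] += 1; nvm.move_to_end(tag)
%%%         if nvm[tag] >= REUSE_THRESHOLD and sram:
%%%             victim, vcount = sram.popitem(last=False)   # coldest SRAM block
%%%             sram[tag] = nvm.pop(tag)                    # hot block moves to SRAM
%%%             nvm[victim] = vcount                        # displaced block goes to NVM
%%%         return "nvm hit"
%%%     target = sram if len(sram) < SRAM_WAYS else nvm     # simple fill rule
%%%     if target is nvm and len(nvm) >= NVM_WAYS:
%%%         nvm.popitem(last=False)                         # evict LRU NVM block
%%%     target[tag] = 1
%%%     return "miss"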
@Article{Manohar:2015:CSD,
author = "Rajit Manohar",
title = "Comparing Stochastic and Deterministic Computing",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "119--122",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2412553",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Technology scaling has raised the specter of myriads
of cheap but unreliable and/or stochastic devices that
must be creatively combined to create a reliable
computing system. This has renewed interest in
computing that exploits stochasticity, embracing rather
than combating the device physics. If a stochastic
representation is used to implement a programmable
general-purpose architecture akin to CPUs, GPUs, or
FPGAs, the preponderance of evidence indicates that
most of the system energy will be expended in
communication and storage as opposed to computation.
This paper presents an analytical treatment of the
benefits and drawbacks of adopting a stochastic
approach by examining the cost of representing a value.
We show both scaling laws and costs for low precision
representations. We also analyze the cost of
multiplication implemented using stochastic versus
deterministic approaches, since multiplication is the
prototypical inexpensive stochastic operation. We show
that the deterministic approach compares favorably to
the stochastic approach when holding precision and
reliability constant.",
acknowledgement = ack-nhfb,
affiliation = "Manohar, R (Reprint Author), Cornell Univ, Cornell
Tech, New York, NY 10011 USA. Manohar, Rajit, Cornell
Univ, Cornell Tech, New York, NY 10011 USA.",
author-email = "rajit@csl.cornell.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Complexity theory; Computer architecture;
deterministic computing; Encoding; field programmable
gate arrays; FPGAs; general-purpose architecture; GPUs;
graphics processing units; Logic gates; Receivers;
reliable computing system; stochastic computing;
Stochastic processes; stochastic processes; stochastic
representation",
number-of-cited-references = "18",
research-areas = "Computer Science",
times-cited = "5",
unique-id = "Manohar:2015:CSD",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
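%%% The letter's comparison is analytic; for readers unfamiliar with the
%%% representation, the standard stochastic multiplier (an AND of two
%%% independent unary bitstreams) is sketched below. The stream length and
%%% RNG are arbitrary choices, and the roughly 1/sqrt(N) precision scaling
%%% is exactly the cost such an analysis weighs against deterministic logic.
%%%
%%% import random
%%%
%%% def encode(p, n, rng):
%%%     # A value p in [0,1] becomes a bitstream whose fraction of 1s is about p.
%%%     return [1 if rng.random() < p else 0 for _ in range(n)]
%%%
%%% def stochastic_multiply(pa, pb, n=4096, seed=0):
%%%     rng = random.Random(seed)
%%%     a, b = encode(pa, n, rng), encode(pb, n, rng)    # independent streams
%%%     return sum(x & y for x, y in zip(a, b)) / n      # AND estimates pa*pb
%%%
%%% print(stochastic_multiply(0.5, 0.5))   # close to 0.25, error ~ 1/sqrt(n)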
@Article{Seo:2015:DDF,
author = "Bon-Keun Seo and Seungryoul Maeng and Joonwon Lee and
Euiseong Seo",
title = "{DRACO}: a Deduplicating {FTL} for Tangible Extra
Capacity",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "123--126",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2350984",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The rapid random access of SSDs enables efficient
searching of redundant data and their deduplication.
However, the space earned from deduplication cannot be
used as permanent storage because it must be reclaimed
when deduplication is cancelled as a result of an
update to the deduplicated data. To overcome this
limitation, we propose a novel FTL scheme that enables
the gained capacity to be used as permanent storage
space for the file system layer. The proposed approach
determines the safe amount of gained capacity that can
be provided to the upper layer based on the compression
rate prediction scheme. It then secures the required
space by compressing cold data when capacity overflow
occurs from cancelled deduplication. Our evaluation
with a kernel source repository showed that the file
system obtained approximately 79 percent additional
capacity with the proposed scheme.",
acknowledgement = ack-nhfb,
affiliation = "Seo, BK (Reprint Author), Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon 305701, South Korea.
Seo, Bon-Keun; Maeng, Seungryoul, Korea Adv Inst Sci \&
Technol, Dept Comp Sci, Taejon 305701, South Korea.
Lee, Joonwon; Seo, Euiseong, Sungkyunkwan Univ, Coll
Informat \& Commun Engn, Suwon 440746, South Korea.",
author-email = "joonwon@skku.edu euiseong@skku.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Basic Science Research Program through the
National Research Foundation of Korea
[2012R1A1A2A10038823]",
funding-text = "This research was supported by Basic Science Research
Program through the National Research Foundation of
Korea (2012R1A1A2A10038823).",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "capacity overflow; cold data compression; compression;
compression rate prediction scheme; data compression;
data deduplication; Data structures; deduplicating FTL;
deduplication; disc drives; DRACO; Entropy; file system
layer; file systems; File systems; file systems; flash
memories; flash memory; Flash memory; flash memory;
flash translation layer; FTL; kernel source repository;
Linux; over-provisioning; permanent storage space;
rapid random access; redundant data searching; SDRAM;
SSD; storage management; storage reclamation; tangible
extra capacity",
number-of-cited-references = "6",
research-areas = "Computer Science",
researcherid-numbers = "Maeng, Seungryoul/C-1882-2011",
times-cited = "2",
unique-id = "Seo:2015:DDF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
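%%% The abstract above summarizes DRACO at a high level; the fragment below
%%% sketches only the generic deduplication bookkeeping any such FTL needs
%%% (content hash to physical page, plus reference counts). The capacity
%%% prediction and cold-data compression that are DRACO's actual
%%% contribution are not modeled, and all names are illustrative.
%%%
%%% import hashlib
%%%
%%% l2p = {}        # logical page -> physical page
%%% by_hash = {}    # content hash -> physical page
%%% refcount = {}   # physical page -> number of logical pages sharing it
%%% next_ppn = 0
%%%
%%% def write(lpn, data):
%%%     global next_ppn
%%%     release(lpn)                              # drop any previous mapping
%%%     h = hashlib.sha256(data).digest()
%%%     ppn = by_hash.get(h)
%%%     if ppn is None:                           # unseen content: new flash page
%%%         ppn, next_ppn = next_ppn, next_ppn + 1
%%%         by_hash[h] = ppn
%%%         refcount[ppn] = 0
%%%     l2p[lpn] = ppn                            # duplicate content shares a page
%%%     refcount[ppn] += 1
%%%
%%% def release(lpn):
%%%     ppn = l2p.pop(lpn, None)
%%%     if ppn is not None:
%%%         refcount[ppn] -= 1                    # page reclaimable when it hits 0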
@Article{Seshadri:2015:FBB,
author = "Vivek Seshadri and Kevin Hsieh and Amirali Boroumand and
Donghyuk Lee and Michael A. Kozuch and Onur Mutlu and
Phillip B. Gibbons and Todd C. Mowry",
title = "Fast Bulk Bitwise {AND} and {OR} in {DRAM}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "127--131",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2434872",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Bitwise operations are an important component of
modern day programming, and are used in a variety of
applications such as databases. In this work, we
propose a new and simple mechanism to implement bulk
bitwise AND and OR operations in DRAM, which is faster
and more efficient than existing mechanisms. Our
mechanism exploits existing DRAM operation to perform a
bitwise AND/OR of two DRAM rows completely within DRAM.
The key idea is to simultaneously connect three cells
to a bitline before the sense-amplification. By
controlling the value of one of the cells, the sense
amplifier forces the bitline to the bitwise AND or
bitwise OR of the values of the other two cells. Our
approach can improve the throughput of bulk bitwise
AND/OR operations by 9.7x and reduce their energy
consumption by 50.5x. Since our approach exploits
existing DRAM operation as much as possible, it
requires negligible changes to DRAM logic. We evaluate
our approach using a real-world implementation of a
bit-vector based index for databases. Our mechanism
improves the performance of commonly-used range queries
by 30 percent on average.",
acknowledgement = ack-nhfb,
affiliation = "Seshadri, V (Reprint Author), Carnegie Mellon Univ,
Pittsburgh, PA 15213 USA. Seshadri, Vivek; Hsieh,
Kevin; Boroum, Amirali; Lee, Donghyuk; Mutlu, Onur;
Mowry, Todd C., Carnegie Mellon Univ, Pittsburgh, PA
15213 USA. Kozuch, Michael A.; Gibbons, Phillip B.,
Intel Pittsburgh, Pittsburgh, PA USA.",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [0953246, 1212962, 1320531]; Intel
Science and Tech. Center; Samsung; Google; Facebook;
SRC",
funding-text = "This work was supported by NSF (awards 0953246,
1212962, and 1320531), and Intel Science and Tech.
Center, Samsung, Google, Facebook, and SRC.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bit-vector based index; bitwise AND/OR; bulk-bitwise
AND operation; bulk-bitwise OR operation; Capacitors;
cell value control; Computer architecture; database
indexing; Decoding; DRAM; DRAM chips; DRAM memory; DRAM
memory, bitwise AND/OR, performance; DRAM operation;
energy consumption reduction; logic gates; performance;
performance improvement; Program processors; Random
access memory; range queries; sense amplifier;
sense-amplification; Throughput; throughput
improvement",
number-of-cited-references = "20",
research-areas = "Computer Science",
times-cited = "21",
unique-id = "Seshadri:2015:FBB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
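%%% Functionally, the triple-row activation described above resolves each
%%% bitline to the bitwise majority of the three connected cells, so a
%%% control row of all 0s yields AND and all 1s yields OR. The sketch below
%%% shows only that logical behavior (the word width is an arbitrary
%%% choice), not the DRAM timing or row-copy steps.
%%%
%%% WIDTH = 32
%%% ALL_ONES = (1 << WIDTH) - 1
%%%
%%% def majority(a, b, c):
%%%     return (a & b) | (b & c) | (a & c)        # per-bit majority of three rows
%%%
%%% def bulk_and(row_a, row_b):
%%%     return majority(row_a, row_b, 0)          # control row = all 0s
%%%
%%% def bulk_or(row_a, row_b):
%%%     return majority(row_a, row_b, ALL_ONES)   # control row = all 1s
%%%
%%% assert bulk_and(0b1100, 0b1010) == 0b1000
%%% assert bulk_or(0b1100, 0b1010) == 0b1110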
@Article{Altaf:2015:LPM,
author = "Muhammad Shoaib Bin Altaf and David A. Wood",
title = "{LogCA}: a Performance Model for Hardware
Accelerators",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "132--135",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2360182",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/cryptography2010.bib;
https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "To address the Dark Silicon problem, architects have
increasingly turned to special-purpose hardware
accelerators to improve the performance and energy
efficiency of common computational kernels, such as
encryption and compression. Unfortunately, the latency
and overhead required to off-load a computation to an
accelerator sometimes outweigh the potential benefits,
resulting in a net decrease in performance or energy
efficiency. To help architects and programmers reason
about these trade-offs, we have developed the LogCA
model, a simple performance model for hardware
accelerators. LogCA provides a simplified abstraction
of a hardware accelerator characterized by five key
parameters. We have validated the model against a
variety of accelerators, ranging from on-chip
cryptographic accelerators in Sun's UltraSparc T2 and
Intel's Sandy Bridge to both discrete and integrated
GPUs.",
acknowledgement = ack-nhfb,
affiliation = "Bin Altaf, MS (Reprint Author), Univ Wisconsin,
Madison, WI 53706 USA. Bin Altaf, Muhammad Shoaib;
Wood, David A., Univ Wisconsin, Madison, WI 53706
USA.",
author-email = "shoaibbinalt@wisc.edu david@cs.wisc.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [CNS-1117280, CCF-1218323,
CNS-1302260]",
funding-text = "We thank Mark Hill, Michael Swift, Rathijit Sen, and
the members of the Wisconsin Multifacet group for their
comments on the paper. This work is supported in part
with NSF grants CNS-1117280, CCF-1218323, and
CNS-1302260. The views expressed herein are not
necessarily those of the NSF. Professor Wood has
significant financial interests in AMD, Google and
Panasas.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Accelerators; compression; computational kernel;
Computational modeling; cryptography; dark silicon
problem; encryption; energy conservation; energy
efficiency; GPU; graphics processing units; Hardware
accelerators; heterogeneous systems; Intel Sandy
Bridge; LogCA model; Modeling; modeling techniques;
on-chip cryptographic
accelerator; Performance evaluation; performance model;
performance of systems; special-purpose hardware
accelerator; UltraSparc T2",
number-of-cited-references = "12",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Altaf:2015:LPM",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
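%%% The abstract above names five parameters but not the functional form.
%%% The hedged sketch below uses one simple way such parameters might
%%% combine (L: interface latency, o: dispatch overhead, g: offload
%%% granularity, C: host compute cost per unit, A: peak acceleration); the
%%% letter's actual model may differ, so treat this as a generic break-even
%%% toy rather than the LogCA equations.
%%%
%%% def speedup(g, L, o, C, A):
%%%     host_time  = C * g                        # run the kernel on the host
%%%     accel_time = o + L * g + (C * g) / A      # offload: overhead + transfer + compute
%%%     return host_time / accel_time
%%%
%%% # Small offloads are dominated by o and L; large ones approach A.
%%% for g in (1, 64, 4096, 1 << 20):
%%%     print(g, round(speedup(g, L=0.01, o=100.0, C=1.0, A=10.0), 2))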
@Article{Diamantopoulos:2015:MMI,
author = "Dionysios Diamantopoulos and Sotirios Xydis and Kostas
Siozios and Dimitrios Soudris",
title = "Mitigating Memory-Induced Dark Silicon in
Many-Accelerator Architectures",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "136--139",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2410791",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Many-Accelerator (MA) systems have been introduced as
a promising architectural paradigm that can boost
performance and improve power of general-purpose
computing platforms. In this paper, we focus on the
problem of resource under-utilization, i.e., Dark
Silicon, in FPGA-based MA platforms. We show that,
beyond the typically expected peak power budget,
on-chip memory resources form a severe
under-utilization factor in MA platforms, leading to up
to 75 percent dark silicon. Recognizing that static
memory allocation, the de facto mechanism supported by
modern design techniques and synthesis tools, forms the
main source of memory-induced Dark Silicon, we
introduce a novel framework that extends conventional
high-level synthesis (HLS) with dynamic memory
management (DMM) features, enabling accelerators to
dynamically adapt their allocated memory to the runtime
memory requirements, thus maximizing the overall
accelerator count through effective sharing of the
FPGA's memory resources. We show that our technique
delivers significant gains in FPGA accelerator density,
i.e., up to 3.8x, and improves application throughput
by up to 3.1x and 21.4x for shared and private memory
accelerators, respectively.",
acknowledgement = ack-nhfb,
affiliation = "Diamantopoulos, D (Reprint Author), Natl Tech Univ
Athens, Sch Elect \& Comp Engn, Athens, Greece.
Diamantopoulos, Dionysios; Xydis, Sotirios; Siozios,
Kostas; Soudris, Dimitrios, Natl Tech Univ Athens, Sch
Elect \& Comp Engn, Athens, Greece.",
author-email = "diamantd@microlab.ntua.gr sxydis@microlab.ntua.gr
ksiop@microlab.ntua.gr dsoudris@microlab.ntua.gr",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "E.C. [644906]",
funding-text = "This research is partially supported by the E.C.
funded program AEGLE under H2020 Grant Agreement No:
644906.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "de-facto mechanism; DMM feature; dynamic memory
management; dynamic memory management feature; Dynamic
scheduling; Field programmable gate arrays; field
programmable gate arrays; FPGA-based MA platform;
high-level synthesis; high-level synthesis tool; HLS
tool; MA system; Many-accelerator architectures;
many-accelerator architectures; Many-accelerator
architectures; Memory management; memory-induced dark
silicon source; modern design technique; Network
architecture; on-chip memory resource; peak power
budget; power aware computing; Resource management;
severe under-utilization factor; silicon; static memory
allocation; storage management; System-on-chip;
Throughput",
number-of-cited-references = "14",
ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847 Siozios,
Kostas/0000-0002-0285-2202",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/O-8843-2019 Siozios,
Kostas/F-9726-2011",
times-cited = "1",
unique-id = "Diamantopoulos:2015:MMI",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
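%%% The contrast the abstract above draws is between static, worst-case
%%% memory allocation per accelerator and run-time sharing of the FPGA's
%%% on-chip memory. The toy occupancy model below illustrates only that
%%% contrast; the sizes are made up and nothing of the actual HLS/DMM
%%% framework is modeled.
%%%
%%% WORST_CASE = {"acc0": 4096, "acc1": 4096, "acc2": 4096}   # words, static HLS buffers
%%% RUNTIME    = {"acc0": 1024, "acc1": 512,  "acc2": 2048}   # words actually needed
%%%
%%% static_words = sum(WORST_CASE.values())                   # 12288 reserved up front
%%%
%%% class Pool:
%%%     def __init__(self, size):
%%%         self.size, self.used = size, {}
%%%     def alloc(self, who, words):
%%%         if sum(self.used.values()) + words > self.size:
%%%             raise MemoryError("pool exhausted")
%%%         self.used[who] = self.used.get(who, 0) + words
%%%     def free(self, who):
%%%         self.used.pop(who, None)
%%%
%%% pool = Pool(size=sum(RUNTIME.values()))                   # 3584 words shared at run time
%%% for acc, words in RUNTIME.items():
%%%     pool.alloc(acc, words)
%%% print(static_words, "vs", pool.size)                      # freed memory can host more accelerators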
@Article{Poremba:2015:NUF,
author = "Matthew Poremba and Tao Zhang and Yuan Xie",
title = "{NVMain 2.0}: a User-Friendly Memory Simulator to
Model (Non-) Volatile Memory Systems",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "140--143",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2402435",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "In this letter, a flexible memory simulator, NVMain
2.0, is introduced to help the community model
not only commodity DRAMs but also emerging memory
technologies, such as die-stacked DRAM caches,
non-volatile memories (e.g., STT-RAM, PCRAM, and ReRAM)
including multi-level cells (MLC), and hybrid
non-volatile plus DRAM memory systems. Compared to
existing memory simulators, NVMain 2.0 features a
flexible user interface with compelling simulation
speed and the capability of providing sub-array-level
parallelism, fine-grained refresh, MLC and data encoder
modeling, and distributed energy profiling.",
acknowledgement = ack-nhfb,
affiliation = "Poremba, M (Reprint Author), Penn State Univ, Dept
Comp Sci \& Engn, University Pk, PA 16802 USA. Poremba,
Matthew; Zhang, Tao; Xie, Yuan, Penn State Univ, Dept
Comp Sci \& Engn, University Pk, PA 16802 USA.",
author-email = "poremba@cse.psu.edu zhangtao@cse.psu.edu
yuanxie@cse.psu.edu",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "NSF [1218867, 1213052, 1409798]; Department
of Energy [DE-SC0005026]",
funding-text = "Poremba, Zhang, and Xie were supported in part by NSF
1218867, 1213052, 1409798. This material was based on
work supported by the Department of Energy under Award
Number DE-SC0005026. Matthew Poremba is the
corresponding author.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache storage; commodity DRAM; Computational modeling;
Computer architecture; die-stacked DRAM cache; DRAM
chips; DRAM memory systems; flexible memory simulator;
flexible user interface; Memory architecture; memory
architecture; Memory architecture, random access
memory, nonvolatile memory, phase change memory, SDRAM;
Memory management; memory technology; multilevel cells;
nonvolatile memory; Nonvolatile memory; nonvolatile
memory system; NVMain 2.0; PCRAM; phase change
memories; phase change memory; Phase change random
access memory; random access memory; ReRAM; SDRAM;
STT-RAM; user interfaces; user-friendly memory
simulator",
number-of-cited-references = "10",
research-areas = "Computer Science",
times-cited = "36",
unique-id = "Poremba:2015:NUF",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Vandierendonck:2015:EEB,
author = "Hans Vandierendonck and Ahmad Hassan and Dimitrios S.
Nikolopoulos",
title = "On the Energy-Efficiency of Byte-Addressable
Non-Volatile Memory",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "144--147",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2355195",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Non-volatile memory (NVM) technology holds promise to
replace SRAM and DRAM at various levels of the memory
hierarchy. The interest in NVM is motivated by the
difficulty faced in scaling DRAM beyond 22 nm and,
long-term, lower cost per bit. While offering higher
density and negligible static power (leakage and
refresh), NVM suffers increased latency and energy per
memory access. This paper develops energy and
performance models of memory systems and applies them
to understand the energy-efficiency of replacing or
complementing DRAM with NVM. Our analysis focusses on
the application of NVM in main memory. We demonstrate
that NVM such as STT-RAM and RRAM is energy-efficient
for memory sizes commonly employed in servers and
high-end workstations, but PCM is not. Furthermore, the
model is well suited to quickly evaluate the impact of
changes to the model parameters, which may be achieved
through optimization of the memory architecture, and to
determine the key parameters that impact system-level
energy and performance.",
acknowledgement = ack-nhfb,
affiliation = "Vandierendonck, H (Reprint Author), Queens Univ
Belfast, Belfast BT7 1NN, Antrim, North Ireland.
Vandierendonck, Hans; Nikolopoulos, Dimitrios S.,
Queens Univ Belfast, Belfast BT7 1NN, Antrim, North
Ireland. Hassan, Ahmad, SAP Belfast, Belfast, Antrim,
North Ireland.",
author-email = "h.vandierendonck@qub.ac.uk ahmad.hassan@sap.com
d.nikolopoulos@qub.ac.uk",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "People Programme (Marie Curie Actions) of
the European Union's Seventh Framework Programme
[327744]",
funding-text = "This work was supported by the People Programme (Marie
Curie Actions) of the European Union's Seventh
Framework Programme (FP7/2007-2013), grant agreement
no. 327744.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "byte-addressable nonvolatile memory technology;
Computational modeling; DRAM; DRAM chips; energy;
energy conservation; energy efficiency; Energy
efficiency; impact system-level energy; Main memory
systems; Main memory systems, non-volatile memory,
energy, modeling; Mathematical model; memory
architecture; memory hierarchy; Memory management;
memory systems; modeling; non-volatile memory;
Nonvolatile memory; NVM technology; PCM; Phase change
materials; Random access memory; RRAM; SRAM; SRAM
chips; static power; STT-RAM",
number-of-cited-references = "15",
oa = "Green Published",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Vandierendonck:2015:EEB",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
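%%% A minimal version of the kind of energy model the abstract above
%%% describes: background (static/refresh) power integrated over time plus
%%% a per-access energy term. The parameter values are placeholders, not
%%% the paper's calibrated numbers, and serve only to show the break-even
%%% behavior between a DRAM-like and an NVM-like technology.
%%%
%%% def memory_energy(static_w, access_nj, accesses_per_s, seconds):
%%%     background = static_w * seconds                       # joules from leakage/refresh
%%%     dynamic    = access_nj * 1e-9 * accesses_per_s * seconds
%%%     return background + dynamic
%%%
%%% seconds, rate = 1.0, 50e6                                 # 50 M accesses/s, 1 s window
%%% dram_like = memory_energy(static_w=1.0,  access_nj=10.0, accesses_per_s=rate, seconds=seconds)
%%% nvm_like  = memory_energy(static_w=0.05, access_nj=30.0, accesses_per_s=rate, seconds=seconds)
%%% print(dram_like, nvm_like)   # NVM wins when static power dominates, loses at high access rates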
@Article{Yavits:2015:RAP,
author = "Leonid Yavits and Shahar Kvatinsky and Amir Morad and
Ran Ginosar",
title = "Resistive Associative Processor",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "148--151",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2374597",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "An Associative Processor (AP) combines data storage and
data processing, and functions simultaneously as a
massively parallel array SIMD processor and memory.
Traditionally, AP is based on CMOS technology, similar
to other classes of massively parallel SIMD processors.
The main component of AP is a Content Addressable
Memory (CAM) array. As CMOS feature scaling slows down,
CAM experiences scalability problems. In this work, we
propose and investigate an AP based on resistive
CAM, the Resistive AP (ReAP). We show that resistive
memory technology potentially allows scaling the AP
from a few millions to a few hundred millions of
processing units on a single silicon die. We compare
the performance and power consumption of a ReAP to a
CMOS AP and a conventional SIMD accelerator (GPU) and
show that ReAP, although exhibiting higher power
density, allows better scalability and higher
performance.",
acknowledgement = ack-nhfb,
affiliation = "Yavits, L (Reprint Author), Technion Israel Inst
Technol, Dept Elect Engn, IL-3200000 Haifa, Israel.
Yavits, Leonid; Kvatinsky, Shahar; Morad, Amir;
Ginosar, Ran, Technion Israel Inst Technol, Dept Elect
Engn, IL-3200000 Haifa, Israel.",
author-email = "yavits@txtechnion.ac.il skva@txtechnion.ac.il
amirm@txtechnion.ac.il ran@ee.technion.ac.il",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Collaborative Research Institute for
Computational Intelligence; Hasso-Plattner-Institut",
funding-text = "The authors would like to thank Uri Weiser for
inspiring this research. This work was partially funded
by the Intel Collaborative Research Institute for
Computational Intelligence and by
Hasso-Plattner-Institut.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Associative processing; associative processor;
Associative Processor; associative processor; CAM
array; CMOS feature scaling; CMOS integrated circuits;
CMOS technology; complementary metal oxide
semiconductor; Computer aided manufacturing; content
addressable memory array; content-addressable storage;
data processing; data storage; GPU; graphics processing
unit; in-memory computing; In-Memory Computing;
in-memory computing; massively parallel array SIMD
processor; memory function; memristor; Memristor;
memristor; Memristors; parallel processing; Random
access memory; ReAP; resistive associative processor;
resistive RAM; Resistive RAM; resistive RAM; SIMD; SIMD
accelerator",
number-of-cited-references = "17",
research-areas = "Computer Science",
times-cited = "22",
unique-id = "Yavits:2015:RAP",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
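%%% The associative-processing primitive underlying an AP (generic, not
%%% ReAP-specific): every row compares selected bits against a key in
%%% parallel, and matching rows receive a masked write; bit-serial
%%% arithmetic is built from sequences of such passes. The row width and
%%% tag bit below are arbitrary.
%%%
%%% def compare_write(rows, mask, key, write_mask, write_val):
%%%     """rows: list of integers standing in for CAM/memory rows."""
%%%     for i, r in enumerate(rows):                           # conceptually parallel
%%%         if (r & mask) == (key & mask):                     # compare phase
%%%             rows[i] = (r & ~write_mask) | (write_val & write_mask)   # write phase
%%%     return rows
%%%
%%% # Tag every row whose low two bits are 0b11 by setting bit 7.
%%% rows = [0b0011, 0b0101, 0b1011]
%%% print(compare_write(rows, mask=0b11, key=0b11, write_mask=1 << 7, write_val=1 << 7))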
@Article{Kang:2015:SRT,
author = "Suk Chan Kang and Chrysostomos Nicopoulos and Ada
Gavrilovska and Jongman Kim",
title = "Subtleties of Run-Time Virtual Address Stacks",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "152--155",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2337299",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "The run-time virtual address (VA) stack has some
unique properties, which have garnered the attention of
researchers. The stack one-dimensionally grows and
shrinks at its top, and contains data that is seemingly
local/private to one thread or process. Most prior
related research has focused on these properties.
However, this article aims to demonstrate how
conventional wisdom pertaining to the run-time VA stack
fails to capture some critical subtleties and
complexities. We first explore two widely established
assumptions surrounding the VA stack area: (1) Data
accesses can be classified as falling either under
VA-stack-area accesses, or non-stack-area accesses,
with no aliasing; (2) The VA stack data is completely
private and invisible to other threads/processes.
Subsequently, we summarize a representative selection
of related work that pursued the micro-architectural
concept of using run-time VA stacks to extend the
general-purpose register file. We then demonstrate why
these assumptions are invalid, by using examples from
prior work to highlight the potential hazards regarding
data consistency, shared memory consistency, and cache
coherence. Finally, we suggest safeguards against these
hazards. Overall, we explore the function-critical
issues that future operating systems and compilers
should address to effectively reap all the benefits of
using run-time VA stacks.",
acknowledgement = ack-nhfb,
affiliation = "Kang, SC (Reprint Author), Georgia Inst Technol,
Atlanta, GA 30332 USA. Kang, Suk Chan; Gavrilovska,
Ada; Kim, Jongman, Georgia Inst Technol, Atlanta, GA
30332 USA. Nicopoulos, Chrysostomos, Univ Cyprus,
CY-1678 Nicosia, Cyprus.",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "cache coherence; cache storage; data consistency; data
decoupling; data integrity; data privacy;
function-critical issue; general-purpose register file;
Instruction sets; memory consistency;
microarchitectural concept; nonstack-area access;
register file; Run time; Run-time stack; run-time VA
stack data access; run-time virtual address stack;
shared memory; shared memory consistency; shared memory
systems; synonym page; VA-stack-area accesses;
Virtualization",
number-of-cited-references = "12",
ORCID-numbers = "Nicopoulos, Chrysostomos/0000-0001-6389-6068",
research-areas = "Computer Science",
times-cited = "0",
unique-id = "Kang:2015:SRT",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Rodopoulos:2015:TPV,
author = "Dimitrios Rodopoulos and Francky Catthoor and
Dimitrios Soudris",
title = "Tackling Performance Variability Due to {RAS}
Mechanisms with {PID}-Controlled {DVFS}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "156--159",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2385713",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "As technology nodes approach deca-nanometer
dimensions, many phenomena threaten the binary
correctness of processor operation. Computer architects
typically enhance their designs with reliability,
availability and serviceability (RAS) schemes to
correct such errors, in many cases at the cost of extra
clock cycles, which, in turn, leads to processor
performance variability. The goal of the current paper
is to absorb this variability using Dynamic Voltage and
Frequency Scaling (DVFS). A closed-loop implementation
is proposed, which configures the clock frequency based
on observed metrics that encapsulate performance
variability due to RAS mechanisms. That way,
performance dependability and predictability are
achieved. We simulate the transient and steady-state
behavior of our approach, reporting responsiveness
within less than 1 ms. We also assess our idea using
the power model of a real processor and report a maximum
energy overhead of roughly 10 percent for dependable
performance in the presence of RAS temporal
overheads.",
acknowledgement = ack-nhfb,
affiliation = "Rodopoulos, D (Reprint Author), Natl Tech Univ Athens,
MicroLab, Sch Elect \& Comp Engn, Athens 15780, Greece.
Rodopoulos, Dimitrios; Soudris, Dimitrios, Natl Tech
Univ Athens, MicroLab, Sch Elect \& Comp Engn, Athens
15780, Greece. Catthoor, Francky, ESAT KU Leuven,
Leuven, Belgium. Catthoor, Francky, SSET IMEC, Leuven,
Belgium.",
author-email = "drodo@microlab.ntua.gr catthoor@imec.be
dsoudris@microlab.ntua.gr",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "HARPA EC project [FP7-612069]",
funding-text = "The authors thank Prof. Y. Sazeides and Prof. C.
Nicopoulos of UCY, Cyprus for the insightful
discussions. They also acknowledge the constructive
feedback of the reviewers. This work was partially
supported by the FP7-612069-HARPA EC project. Dimitrios
Rodopoulos is the corresponding author. Finally, the
authors acknowledge conversations with Dr. Antonis
Papanikolaou.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "availability; availability and serviceability;
Availability and Serviceability; availability and
serviceability; binary correctness; closed loop
systems; closed-loop implementation; computer
architects; computer architecture; deca-nanometer
dimensions; Dynamic voltage and frequency scaling;
dynamic voltage and frequency scaling; Dynamic voltage
and frequency scaling; Dynamic Voltage and Frequency
Scaling; Mathematical model; microcomputers;
Performance evaluation; performance variability;
performance vulnerability factor; Performance
Vulnerability Factor; PID-controlled DVFS; Process
control; processor operation; RAS mechanisms;
reliability; Reliability; reliability; Reliability;
serviceability; three-term control; Voltage control",
number-of-cited-references = "21",
ORCID-numbers = "Soudris, Dimitrios/0000-0002-6930-6847",
research-areas = "Computer Science",
researcherid-numbers = "Soudris, Dimitrios/O-8843-2019",
times-cited = "4",
unique-id = "Rodopoulos:2015:TPV",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
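%%% A generic discrete PID loop of the kind the abstract above describes:
%%% observe a performance metric degraded by RAS stall cycles and trim the
%%% clock frequency to hold a setpoint. Gains, the metric, the actuation
%%% scale, and the frequency limits are all illustrative assumptions.
%%%
%%% def pid_dvfs(samples, setpoint, kp=0.8, ki=0.2, kd=0.1,
%%%              f=1.0e9, f_min=0.5e9, f_max=2.0e9):
%%%     integ, prev_err = 0.0, 0.0
%%%     for observed in samples:                  # e.g., useful work per control interval
%%%         err = setpoint - observed
%%%         integ += err
%%%         deriv = err - prev_err
%%%         prev_err = err
%%%         f += (kp * err + ki * integ + kd * deriv) * 1e8   # Hz per unit of error
%%%         f = max(f_min, min(f_max, f))         # respect the platform's DVFS range
%%%         yield f
%%%
%%% # Performance dips while RAS corrections burn cycles; the loop raises f to absorb it.
%%% print([round(x / 1e9, 2) for x in pid_dvfs([1.0, 0.9, 0.8, 0.9, 1.0], setpoint=1.0)])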
@Article{Markovic:2015:TLS,
author = "Nikola Markovic and Daniel Nemirovsky and Osman Unsal
and Mateo Valero and Adrian Cristal",
title = "Thread Lock Section-Aware Scheduling on Asymmetric
Single-{ISA} Multi-Core",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "160--163",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2014.2357805",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib;
https://www.math.utah.edu/pub/tex/bib/multithreading.bib",
abstract = "As thread level parallelism in applications has
continued to expand, so has research in chip multi-core
processors. As more and more applications become
multi-threaded we expect to find a growing number of
threads executing on a machine. As a consequence, the
operating system will require increasingly larger
amounts of CPU time to schedule these threads
efficiently. Instead of perpetuating the trend of
performing more complex thread scheduling in the
operating system, we propose a scheduling mechanism
that can be efficiently implemented in hardware as
well. Our approach of identifying multi-threaded
application bottlenecks such as thread synchronization
sections complements the Fairness-aware Scheduler
method. It achieves an average speedup of 11.5 percent
(geometric mean) compared to the state-of-the-art
Fairness-aware Scheduler.",
acknowledgement = ack-nhfb,
affiliation = "Markovic, N (Reprint Author), Barcelona Supercomputing
Ctr, Barcelona, Spain. Markovic, Nikola; Nemirovsky,
Daniel; Unsal, Osman; Valero, Mateo, Barcelona
Supercomputing Ctr, Barcelona, Spain. Markovic, Nikola;
Nemirovsky, Daniel; Valero, Mateo, Univ Politecn
Cataluna, Barcelona, Spain. Cristal, Adrian, Univ
Politecn Cataluna, Barcelona Supercomputing Ctr,
E-08028 Barcelona, Spain. Cristal, Adrian, Artificial
Intelligence Res Inst Spanish Natl Res, Barcelona,
Spain.",
author-email = "nikola.markovic@bsc.es daniel.nemirovsky@bsc.es
osman.unsal@bsc.es mateo.valero@bsc.es
adrian.cristal@bsc.es",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "Asymmetric chip multiprocessor (ACMP); asymmetric
single-ISA multicore processor; chip multicore
processors; Context modeling; fairness-aware scheduler
method; HW/SW thread scheduling; Instruction sets;
microprocessor chips; multi-threaded applications;
multi-threading; Multicore processing; multiprocessing
systems; multithreaded application; operating system;
Operating systems; operating systems (computers);
scheduling; Scheduling; Synchronization; thread lock
section-aware scheduling mechanism; thread
synchronization",
number-of-cited-references = "17",
ORCID-numbers = "UNSAL, OSMAN/0000-0002-0544-9697 Valero,
Mateo/0000-0003-2917-2482",
research-areas = "Computer Science",
researcherid-numbers = "UNSAL, OSMAN/B-9161-2016 Valero,
Mateo/L-5709-2014",
times-cited = "7",
unique-id = "Markovic:2015:TLS",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
@Article{Pekhimenko:2015:TAC,
author = "Gennady Pekhimenko and Evgeny Bolotin and Mike
O'Connor and Onur Mutlu and Todd C. Mowry and Stephen
W. Keckler",
title = "Toggle-Aware Compression for {GPUs}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "164--168",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2430853",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/datacompression.bib;
https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
abstract = "Memory bandwidth compression can be an effective way
to achieve higher system performance and energy
efficiency in modern data-intensive applications by
exploiting redundancy in data. Prior works studied
various data compression techniques to improve both
capacity (e.g., of caches and main memory) and
bandwidth utilization (e.g., of the on-chip and
off-chip interconnects). These works addressed two
common shortcomings of compression: (i)
compression/decompression overhead in terms of latency,
energy, and area, and (ii) hardware complexity to
support variable data size. In this paper, we make the
new observation that there is another important problem
related to data compression in the context of the
communication energy efficiency: transferring
compressed data leads to a substantial increase in the
number of bit toggles (communication channel switchings
from 0 to 1 or from 1 to 0). This, in turn, increases
the dynamic energy consumed by on-chip and off-chip
buses due to more frequent charging and discharging of
the wires. Our results, for example, show that the bit
toggle count increases by an average of 2.2x with some
compression algorithms across 54 mobile GPU
applications. We characterize and demonstrate this new
problem across a wide variety of 221 GPU applications
and six different compression algorithms. To mitigate
the problem, we propose two new toggle-aware
compression techniques: energy control and Metadata
Consolidation. These techniques greatly reduce the bit
toggle count impact of the six data compression
algorithms we examine, while keeping most of their
bandwidth reduction benefits.",
acknowledgement = ack-nhfb,
affiliation = "Pekhimenko, G (Reprint Author), Carnegie Mellon Univ,
Dept Comp Sci, Pittsburgh, PA 15206 USA. Pekhimenko,
Gennady; Mutlu, Onur; Mowry, Todd C., Carnegie Mellon
Univ, Dept Comp Sci, Pittsburgh, PA 15206 USA. Bolotin,
Evgeny; O'Connor, Mike; Keckler, Stephen W., NVIDA,
Santa Clara, CA USA. O'Connor, Mike; Keckler, Stephen
W., Univ Texas Austin, Austin, TX 78712 USA.",
author-email = "gpekhimento@gmail.com ebolotin@nvidia.com
moconnor@nvidia.com omutlu@gmail.com tcm@cs.cmu.edu
skeckler@nvidia.com",
da = "2019-06-20",
doc-delivery-number = "CZ7DC",
eissn = "1556-6064",
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
funding-acknowledgement = "Intel Science and Technology Center for
Cloud Computing; US National Science Foundation
[1212962, 1409723, 1423172]; US Department of Energy",
funding-text = "The authors acknowledge the support of Intel Science
and Technology Center for Cloud Computing; US National
Science Foundation grants 1212962, 1409723, and
1423172; and the US Department of Energy.",
journal-iso = "IEEE Comput. Archit. Lett.",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
keywords = "bandwidth utilization; bit toggle count impact; bit
toggles; Communication channels; communication energy
efficiency; Compression algorithms;
compression/decompression overhead; Data compression;
data compression; data compression algorithms; data
compression techniques; Data compression,
interconnected systems, memory; data redundancy;
dynamic energy; energy control; graphics processing
units; Graphics processing units; hardware complexity;
interconnected systems; memory; memory bandwidth
compression; metadata consolidation; Mobile
communication; mobile GPU applications; modern
data-intensive applications; off-chip buses; on-chip
buses; power aware computing; System-on-chip;
toggle-aware compression; variable data size",
number-of-cited-references = "29",
research-areas = "Computer Science",
times-cited = "2",
unique-id = "Pekhimenko:2015:TAC",
web-of-science-categories = "Computer Science, Hardware \&
Architecture",
}
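%%% Bit toggles as defined in the abstract above are 0-to-1 or 1-to-0
%%% switchings between consecutive words on a link. The counter below makes
%%% the metric concrete; the 32-bit width, the all-zero initial bus state,
%%% and the example payloads are assumptions, and no compression algorithm
%%% is modeled.
%%%
%%% def bit_toggles(words, width=32):
%%%     toggles, prev = 0, 0                      # assume the bus starts at all zeros
%%%     for w in words:
%%%         toggles += bin((w ^ prev) & ((1 << width) - 1)).count("1")
%%%         prev = w
%%%     return toggles
%%%
%%% sparse = [0x00000001, 0x00000002, 0x00000003, 0x00000004]   # uncompressed, low entropy
%%% packed = [0xDEADBEEF, 0x01234567, 0x89ABCDEF, 0xFEDCBA98]   # compressed-style, high entropy
%%% print(bit_toggles(sparse), bit_toggles(packed))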
@Article{Anonymous:2015:TCb,
author = "Anonymous",
title = "Table of Contents",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C1--C1",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510172",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAc,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}
Editorial Board}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C2--C2",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510173",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICAd,
author = "Anonymous",
title = "{{\booktitle{IEEE Computer Architecture Letters}}}
Information for Authors",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C3--C3",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510174",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/RecentIssue.jsp?punumber=10208",
}
@Article{Anonymous:2015:ICSb,
author = "Anonymous",
title = "{IEEE Computer Society}",
journal = j-IEEE-COMPUT-ARCHIT-LETT,
volume = "14",
number = "2",
pages = "C4--C4",
month = jul # "\slash " # dec,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1109/LCA.2015.2510176",
ISSN = "1556-6056 (print), 1556-6064 (electronic)",
ISSN-L = "1556-6056",
bibdate = "Tue Jun 25 07:41:05 2019",
bibsource = "https://www.math.utah.edu/pub/tex/bib/ieeecomputarchitlett.bib",
acknowledgement = ack-nhfb,
ajournal = "IEEE Comput. Archit. Lett.",
fjournal = "IEEE Computer Architecture Letters",
journal-URL = "http://ieeexplore.ieee.org/xpl/