%%% -*-BibTeX-*-
%%% ====================================================================
%%% BibTeX-file{
%%% author = "Nelson H. F. Beebe",
%%% version = "1.19",
%%% date = "01 October 2009",
%%% time = "09:23:47 MDT",
%%% filename = "taco.bib",
%%% address = "University of Utah
%%% Department of Mathematics, 110 LCB
%%% 155 S 1400 E RM 233
%%% Salt Lake City, UT 84112-0090
%%% USA",
%%% telephone = "+1 801 581 5254",
%%% FAX = "+1 801 581 4148",
%%% URL = "http://www.math.utah.edu/~beebe",
%%% checksum = "35797 3646 18697 171428",
%%% email = "beebe at math.utah.edu, beebe at acm.org,
%%% beebe at computer.org (Internet)",
%%% codetable = "ISO/ASCII",
%%% keywords = "ACM Transactions on Architecture and Code
%%% Optimization; bibliography; TACO",
%%% license = "public domain",
%%% supported = "yes",
%%% docstring = "This is a COMPLETE BibTeX bibliography for
%%% ACM Transactions on Architecture and Code
%%% Optimization (CODEN ????, ISSN 1544-3566),
%%% covering all journal issues from 2004 --
%%% date.
%%%
%%% At version 1.19, the COMPLETE journal
%%% coverage looked like this:
%%%
%%% 2004 ( 17) 2006 ( 19) 2008 ( 21)
%%% 2005 ( 17) 2007 ( 19) 2009 ( 16)
%%%
%%% Article: 109
%%%
%%% Total entries: 109
%%%
%%% The journal Web page can be found at:
%%%
%%% http://www.acm.org/pubs/taco.html
%%%
%%% The journal table of contents page is at:
%%%
%%% http://www.acm.org/taco/
%%% http://portal.acm.org/browse_dl.cfm?idx=J924
%%%
%%% Qualified subscribers can retrieve the full
%%% text of recent articles in PDF form.
%%%
%%% The initial draft was extracted from the ACM
%%% Web pages.
%%%
%%% ACM copyrights explicitly permit abstracting
%%% with credit, so article abstracts, keywords,
%%% and subject classifications have been
%%% included in this bibliography wherever
%%% available. Article reviews have been
%%% omitted, until their copyright status has
%%% been clarified.
%%%
%%% bibsource keys in the bibliography entries
%%% below indicate the entry originally came
%%% from the computer science bibliography
%%% archive, even though it has likely since
%%% been corrected and updated.
%%%
%%% URL keys in the bibliography point to
%%% World Wide Web locations of additional
%%% information about the entry.
%%%
%%% BibTeX citation tags are uniformly chosen
%%% as name:year:abbrev, where name is the
%%% family name of the first author or editor,
%%% year is a 4-digit number, and abbrev is a
%%% 3-letter condensation of important title
%%% words. Citation tags were automatically
%%% generated by software developed for the
%%% BibNet Project.
%%%
%%% In this bibliography, entries are sorted in
%%% publication order, using ``bibsort -byvolume.''
%%%
%%% The checksum field above contains a CRC-16
%%% checksum as the first value, followed by the
%%% equivalent of the standard UNIX wc (word
%%% count) utility output of lines, words, and
%%% characters. This is produced by Robert
%%% Solovay's checksum utility."
%%% }
%%% ====================================================================
@Preamble{"\input bibnames.sty" #
"\def \TM {${}^{\sc TM}$}"
}
%%% ====================================================================
%%% Acknowledgement abbreviations:
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|http://www.math.utah.edu/~beebe/|"}
%%% ====================================================================
%%% Journal abbreviations:
@String{j-TACO = "ACM Transactions on Architecture and
Code Optimization"}
%%% ====================================================================
%%% Bibliography entries:
@Article{Calder:2004:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "1",
number = "1",
pages = "1--2",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhang:2004:RIC,
author = "W. Zhang and J. S. Hu and V. Degalahal and M. Kandemir
and N. Vijaykrishnan and M. J. Irwin",
title = "Reducing instruction cache energy consumption using a
compiler-based strategy",
journal = j-TACO,
volume = "1",
number = "1",
pages = "3--33",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Isailovic:2004:DCQ,
author = "Nemanja Isailovic and Mark Whitney and Yatish Patel
and John Kubiatowicz and Dean Copsey and Frederic T.
Chong and Isaac L. Chuang and Mark Oskin",
title = "Datapath and control for quantum wires",
journal = j-TACO,
volume = "1",
number = "1",
pages = "34--61",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Sankaralingam:2004:TPA,
author = "Karthikeyan Sankaralingam and Ramadass Nagarajan and
Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya
Ranganathan and Doug Burger and Stephen W. Keckler and
Robert G. McDonald and Charles R. Moore",
title = "{TRIPS}: {A} polymorphous architecture for exploiting
{ILP}, {TLP}, and {DLP}",
journal = j-TACO,
volume = "1",
number = "1",
pages = "62--93",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Skadron:2004:TAM,
author = "Kevin Skadron and Mircea R. Stan and Karthik
Sankaranarayanan and Wei Huang and Sivakumar Velusamy
and David Tarjan",
title = "Temperature-aware microarchitecture: {Modeling} and
implementation",
journal = j-TACO,
volume = "1",
number = "1",
pages = "94--125",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Aleta:2004:RCC,
author = "Alex Alet{\`a} and Josep M. Codina and Antonio
Gonz{\'a}lez and David Kaeli",
title = "Removing communications in clustered
microarchitectures through instruction replication",
journal = j-TACO,
volume = "1",
number = "2",
pages = "127--151",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Bai:2004:LPO,
author = "Yu Bai and R. Iris Bahar",
title = "A low-power in-order\slash out-of-order issue queue",
journal = j-TACO,
volume = "1",
number = "2",
pages = "152--179",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Juang:2004:IBP,
author = "Philo Juang and Kevin Skadron and Margaret Martonosi
and Zhigang Hu and Douglas W. Clark and Philip W.
Diodato and Stefanos Kaxiras",
title = "Implementing branch-predictor decay using quasi-static
memory cells",
journal = j-TACO,
volume = "1",
number = "2",
pages = "180--219",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Santana:2004:LCF,
author = "Oliverio J. Santana and Alex Ramirez and Josep L.
Larriba-Pey and Mateo Valero",
title = "A low-complexity fetch architecture for
high-performance superscalar processors",
journal = j-TACO,
volume = "1",
number = "2",
pages = "220--245",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Lin:2004:CFS,
author = "Jin Lin and Tong Chen and Wei-Chung Hsu and Pen-Chung
Yew and Roy Dz-Ching Ju and Tin-Fook Ngai and Sun
Chan",
title = "A compiler framework for speculative optimizations",
journal = j-TACO,
volume = "1",
number = "3",
pages = "247--271",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Fields:2004:ICS,
author = "Brian A. Fields and Rastislav Bodik and Mark D. Hill
and Chris J. Newburn",
title = "Interaction cost and shotgun profiling",
journal = j-TACO,
volume = "1",
number = "3",
pages = "272--304",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Sankaranarayanan:2004:PBA,
author = "Karthik Sankaranarayanan and Kevin Skadron",
title = "Profile-based adaptation for cache decay",
journal = j-TACO,
volume = "1",
number = "3",
pages = "305--322",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Xie:2004:IDV,
author = "Fen Xie and Margaret Martonosi and Sharad Malik",
title = "Intraprogram dynamic voltage scaling: {Bounding}
opportunities with analytic modeling",
journal = j-TACO,
volume = "1",
number = "3",
pages = "323--367",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Hartstein:2004:OPD,
author = "A. Hartstein and Thomas R. Puzak",
title = "The optimum pipeline depth considering both power and
performance",
journal = j-TACO,
volume = "1",
number = "4",
pages = "369--388",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Cristal:2004:TKI,
author = "Adri{\'a}n Cristal and Oliverio J. Santana and Mateo
Valero and Jos{\'e} F. Mart{\'\i}nez",
title = "Toward kilo-instruction processors",
journal = j-TACO,
volume = "1",
number = "4",
pages = "389--417",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Akkary:2004:ARE,
author = "Haitham Akkary and Ravi Rajwar and Srikanth T.
Srinivasan",
title = "An analysis of a resource efficient checkpoint
architecture",
journal = j-TACO,
volume = "1",
number = "4",
pages = "418--444",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Yang:2004:TML,
author = "Chia-Lin Yang and Alvin R. Lebeck and Hung-Wei Tseng
and Chien-Hao Lee",
title = "Tolerating memory latency through push prefetching for
pointer-intensive applications",
journal = j-TACO,
volume = "1",
number = "4",
pages = "445--475",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Calder:2005:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "2",
number = "1",
pages = "1--2",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhou:2005:EFA,
author = "Yuanyuan Zhou and Pin Zhou and Feng Qin and Wei Liu
and Josep Torrellas",
title = "Efficient and flexible architectural support for
dynamic monitoring",
journal = j-TACO,
volume = "2",
number = "1",
pages = "3--33",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhang:2005:WHC,
author = "Chuanjun Zhang and Frank Vahid and Jun Yang and Walid
Najjar",
title = "A way-halting cache for low-energy high-performance
systems",
journal = j-TACO,
volume = "2",
number = "1",
pages = "34--54",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Abella:2005:ISP,
author = "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera
and Michael F. P. O'Boyle",
title = "{IATAC}: a smart predictor to turn-off {L2} cache
lines",
journal = j-TACO,
volume = "2",
number = "1",
pages = "55--77",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Haskins:2005:AWS,
author = "John W. {Haskins, Jr.} and Kevin Skadron",
title = "Accelerated warmup for sampled microarchitecture
simulation",
journal = j-TACO,
volume = "2",
number = "1",
pages = "78--108",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Li:2005:ABT,
author = "Tao Li and Ravi Bhargava and Lizy Kurian John",
title = "Adapting branch-target buffer to improve the target
predictability of {Java} code",
journal = j-TACO,
volume = "2",
number = "2",
pages = "109--130",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhang:2005:DIE,
author = "Lingli Zhang and Chandra Krintz",
title = "The design, implementation, and evaluation of adaptive
code unloading for resource-constrained devices",
journal = j-TACO,
volume = "2",
number = "2",
pages = "131--164",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Kulkarni:2005:FES,
author = "Prasad A. Kulkarni and Stephen R. Hines and David B.
Whalley and Jason D. Hiser and Jack W. Davidson and
Douglas L. Jones",
title = "Fast and efficient searches for effective
optimization-phase sequences",
journal = j-TACO,
volume = "2",
number = "2",
pages = "165--198",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Salami:2005:DMI,
author = "Esther Salam{\'\i} and Mateo Valero",
title = "Dynamic memory interval test vs. interprocedural
pointer analysis in multimedia applications",
journal = j-TACO,
volume = "2",
number = "2",
pages = "199--219",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Meng:2005:ELL,
author = "Yan Meng and Timothy Sherwood and Ryan Kastner",
title = "Exploring the limits of leakage power reduction in
caches",
journal = j-TACO,
volume = "2",
number = "3",
pages = "221--246",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Garzaran:2005:TBS,
author = "Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Milos Prvulovic
and Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and V{\'\i}ctor
Vi{\~n}als and Lawrence Rauchwerger and Josep
Torrellas",
title = "Tradeoffs in buffering speculative memory state for
thread-level speculation in multiprocessors",
journal = j-TACO,
volume = "2",
number = "3",
pages = "247--279",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Tarjan:2005:MPG,
author = "David Tarjan and Kevin Skadron",
title = "Merging path and gshare indexing in perceptron branch
prediction",
journal = j-TACO,
volume = "2",
number = "3",
pages = "280--300",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhang:2005:WET,
author = "Xiangyu Zhang and Rajiv Gupta",
title = "Whole execution traces and their applications",
journal = j-TACO,
volume = "2",
number = "3",
pages = "301--334",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhao:2005:IWA,
author = "Wankang Zhao and David Whalley and Christopher Healy
and Frank Mueller",
title = "Improving {WCET} by applying a {WC} code-positioning
optimization",
journal = j-TACO,
volume = "2",
number = "4",
pages = "335--365",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
keywords = "WCET (worst case execution time); WC (worst case)",
}
@Article{Reis:2005:SCF,
author = "George A. Reis and Jonathan Chang and Neil
Vachharajani and Ram Rangan and David I. August and
Shubhendu S. Mukherjee",
title = "Software-controlled fault tolerance",
journal = j-TACO,
volume = "2",
number = "4",
pages = "366--396",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Li:2005:PPC,
author = "Jian Li and Jos{\'e} F. Mart{\'\i}nez",
title = "Power-performance considerations of parallel computing
on chip multiprocessors",
journal = j-TACO,
volume = "2",
number = "4",
pages = "397--422",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Sharma:2005:SPE,
author = "Saurabh Sharma and Jesse G. Beu and Thomas M. Conte",
title = "Spectral prefetcher: {An} effective mechanism for {L2}
cache prefetching",
journal = j-TACO,
volume = "2",
number = "4",
pages = "423--450",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Calder:2006:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "3",
number = "1",
pages = "1--2",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Tan:2006:BSS,
author = "Lin Tan and Brett Brotherton and Timothy Sherwood",
title = "Bit-split string-matching engines for intrusion
detection and prevention",
journal = j-TACO,
volume = "3",
number = "1",
pages = "3--34",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Nagpurkar:2006:ERP,
author = "Priya Nagpurkar and Hussam Mousa and Chandra Krintz
and Timothy Sherwood",
title = "Efficient remote profiling for resource-constrained
devices",
journal = j-TACO,
volume = "3",
number = "1",
pages = "35--66",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Lin:2006:RCG,
author = "Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and Roy
Dz-Ching Ju and Tin-Fook Ngai",
title = "Recovery code generation for general speculative
optimizations",
journal = j-TACO,
volume = "3",
number = "1",
pages = "67--89",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Choi:2006:ORR,
author = "Yoonseo Choi and Hwansoo Han",
title = "Optimal register reassignment for register stack
overflow minimization",
journal = j-TACO,
volume = "3",
number = "1",
pages = "90--114",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Xue:2006:LOA,
author = "Jingling Xue and Qiong Cai",
title = "A lifetime optimal algorithm for speculative {PRE}",
journal = j-TACO,
volume = "3",
number = "2",
pages = "115--155",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Sharkey:2006:IPT,
author = "Joseph J. Sharkey and Dmitry V. Ponomarev and Kanad
Ghose and Oguz Ergin",
title = "Instruction packing: {Toward} fast and
energy-efficient instruction scheduling",
journal = j-TACO,
volume = "3",
number = "2",
pages = "156--181",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Ceze:2006:CUC,
author = "Luis Ceze and Karin Strauss and James Tuck and Josep
Torrellas and Jose Renau",
title = "{CAVA}: {Using} checkpoint-assisted value prediction
to hide {L2} misses",
journal = j-TACO,
volume = "3",
number = "2",
pages = "182--208",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhang:2006:EAR,
author = "Lixin Zhang and Mike Parker and John Carter",
title = "Efficient address remapping in distributed
shared-memory systems",
journal = j-TACO,
volume = "3",
number = "2",
pages = "209--229",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Zhao:2006:ATP,
author = "Min Zhao and Bruce R. Childers and Mary Lou Soffa",
title = "An approach toward profit-driven optimization",
journal = j-TACO,
volume = "3",
number = "3",
pages = "231--262",
month = sep,
year = "2006",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1162690.1162691",
ISSN = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "http://portal.acm.org/",
abstract = "Although optimizations have been applied for a number
of years to improve the performance of software,
problems with respect to the application of
optimizations have not been adequately addressed. For
example, in certain circumstances, optimizations may
degrade performance. However, there is no efficient way
to know when a degradation will occur. In this
research, we investigate the profitability of
optimizations, which is useful for determining the
benefit of applying optimizations. We develop a
framework that enables us to predict profitability
using analytic models. The profitability of an
optimization depends on code context, the particular
optimization, and machine resources. Thus, our
framework has analytic models for each of these
components. As part of the framework, there is also a
profitability engine that uses models to predict the
profit. In this paper, we target scalar optimizations
and, in particular, describe the models for partial
redundancy elimination (PRE), loop invariant code
motion (LICM), and value numbering (VN). We implemented
the framework for predicting the profitability of these
optimizations. Based on the predictions, we can
selectively apply profitable optimizations. We compared
the profit-driven approach with an approach that uses a
heuristic in deciding when optimizations should be
applied. Our experiments demonstrate that the
profitability of scalar optimizations can be accurately
predicted by using models. That is, without actually
applying a scalar optimization, we can determine if an
optimization is beneficial and should be applied.",
acknowledgement = ack-nhfb,
}
@Article{Hazelwood:2006:MBC,
author = "Kim Hazelwood and Michael D. Smith",
title = "Managing bounded code caches in dynamic binary
optimization systems",
journal = j-TACO,
volume = "3",
number = "3",
pages = "263--294",
month = sep,
year = "2006",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1162690.1162692",
ISSN = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "http://portal.acm.org/",
abstract = "Dynamic binary optimizers store altered copies of
original program instructions in software-managed code
caches in order to maximize reuse of transformed code.
Code caches store code blocks that may vary in size,
reference other code blocks, and carry a high
replacement overhead. These unique constraints reduce
the effectiveness of conventional cache management
policies. Our work directly addresses these unique
constraints and presents several contributions to the
code-cache management problem. First, we show that
evicting more than the minimum number of code blocks
from the code cache results in less run-time overhead
than the existing alternatives. Such granular evictions
reduce overall execution time, as the fixed costs of
invoking the eviction mechanism are amortized across
multiple cache insertions. Second, a study of the ideal
lifetimes of dynamically generated code blocks
illustrates the benefit of a replacement algorithm
based on a generational heuristic. We describe and
evaluate a generational approach to code cache
management that makes it easy to identify long-lived
code blocks and simultaneously avoid any fragmentation
because of the eviction of short-lived blocks. Finally,
we present results from an implementation of our
generational approach in the DynamoRIO framework and
illustrate that, as dynamic optimization systems become
more prevalent, effective code cache-management
policies will be essential for reliable, scalable
performance of modern applications.",
acknowledgement = ack-nhfb,
}
@Article{Rochecouste:2006:CCE,
author = "Olivier Rochecouste and Gilles Pokam and Andr{\'e}
Seznec",
title = "A case for a complexity-effective, width-partitioned
microarchitecture",
journal = j-TACO,
volume = "3",
number = "3",
pages = "295--326",
month = sep,
year = "2006",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1162690.1162693",
ISSN = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "http://portal.acm.org/",
abstract = "The analysis of program executions reveals that most
integer and multimedia applications make heavy use of
narrow-width operations, i.e., instructions exclusively
using narrow-width operands and producing a
narrow-width result. Moreover, this usage is relatively
well distributed over the application. We observed this
program property on the MediaBench and SPEC2000
benchmarks with about 40\% of the instructions being
narrow-width operations. Current superscalar processors
use 64-bit datapaths to execute all the instructions of
the applications. In this paper, we suggest the use of
a width-partitioned microarchitecture (WPM) to master
the hardware complexity of a superscalar processor. For
a four-way issue machine, we split the processor in two
two-way clusters: the main cluster executing 64-bit
operations, load/store, and complex operations and a
narrow cluster executing the 16-bit operations. We
resort to partitioning to decouple the treatment of the
narrow-width operations from that of the other program
instructions. This provides the benefit of greatly
simplifying the design of the critical processor
components in each cluster (e.g., the register file and
the bypass network). The dynamic interleaving of the
two instruction types allows maintaining the workload
balanced among clusters. WPM also helps to reduce the
complexity of the interconnection fabric and of the
issue logic. In fact, since the 16-bit cluster can only
communicate narrow-width data, the datapath-width of
the interconnect fabric can be significantly reduced,
yielding a corresponding saving of the interconnect
power and area. We explore different possible
configurations of WPM, discussing the various
implementation tradeoffs. We also examine a speculative
steering heuristic to distribute the narrow-width
operations among clusters. A detailed analysis of the
complexity factors shows using WPM instead of a
classical 64-bit two-cluster microarchitecture can save
power and silicon area with a minimal impact on the
overall performance.",
acknowledgement = ack-nhfb,
}
@Article{Zmily:2006:BAI,
author = "Ahmad Zmily and Christos Kozyrakis",
title = "Block-aware instruction set architecture",
journal = j-TACO,
volume = "3",
number = "3",
pages = "327--357",
month = sep,
year = "2006",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1162690.1162694",
ISSN = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "http://portal.acm.org/",
abstract = "Instruction delivery is a critical component for
wide-issue, high-frequency processors since its
bandwidth and accuracy place an upper limit on
performance. The processor front-end accuracy and
bandwidth are limited by instruction-cache misses,
multicycle instruction-cache accesses, and target or
direction mispredictions for control-flow operations.
This paper presents a block-aware instruction set
(BLISS) that allows software to assist with front-end
challenges. BLISS defines basic block descriptors that
are stored separately from the actual instructions in a
program. We show that BLISS allows for a decoupled
front-end that tolerates instruction-cache latency,
facilitates instruction prefetching, and leads to
higher prediction accuracy.",
acknowledgement = ack-nhfb,
}
@Article{Crandall:2006:MAS,
author = "Jedidiah R. Crandall and S. Felix Wu and Frederic T.
Chong",
title = "{Minos}: {Architectural} support for protecting
control data",
journal = j-TACO,
volume = "3",
number = "4",
pages = "359--389",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Marathe:2006:ACC,
author = "Jaydeep Marathe and Frank Mueller and Bronis R. de
Supinski",
title = "Analysis of cache-coherence bottlenecks with hybrid
hardware\slash software techniques",
journal = j-TACO,
volume = "3",
number = "4",
pages = "390--423",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Ganusov:2006:FEP,
author = "Ilya Ganusov and Martin Burtscher",
title = "Future execution: {A} prefetching mechanism that uses
multiple cores to speed up single threads",
journal = j-TACO,
volume = "3",
number = "4",
pages = "424--449",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Co:2006:ETC,
author = "Michele Co and Dee A. B. Weikle and Kevin Skadron",
title = "Evaluating trace cache energy efficiency",
journal = j-TACO,
volume = "3",
number = "4",
pages = "450--476",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Hu:2006:EMM,
author = "Shiwen Hu and Madhavi Valluri and Lizy Kurian John",
title = "Effective management of multiple configurable units
using dynamic optimization",
journal = j-TACO,
volume = "3",
number = "4",
pages = "477--501",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
}
@Article{Bentley:2006:IAB,
author = "Chris Bentley and Scott A. Watterson and David K.
Lowenthal and Barry Rountree",
title = "Implicit array bounds checking on 64-bit
architectures",
journal = j-TACO,
volume = "3",
number = "4",
pages = "502--527",
month = dec,
year = "2006",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1187976.1187982",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
abstract = "Several programming languages guarantee that array
subscripts are checked to ensure they are within the
bounds of the array. While this guarantee improves the
correctness and security of array-based code, it adds
overhead to array references. This has been an obstacle
to using higher-level languages, such as Java, for
high-performance parallel computing, where the language
specification requires that all array accesses must be
checked to ensure they are within bounds. This is
because, in practice, array-bounds checking in
scientific applications may increase execution time by
more than a factor of 2. Previous research has explored
optimizations to statically eliminate bounds checks,
but the dynamic nature of many scientific codes makes
this difficult or impossible. Our approach is, instead,
to create a compiler and operating system
infrastructure that does not generate explicit bounds
checks. It instead places arrays inside of Index
Confinement Regions (ICRs), which are large, isolated,
mostly unmapped virtual memory regions. Any array
reference outside of its bounds will cause a protection
violation; this provides implicit bounds checking. Our
results show that when applying this infrastructure to
high-performance computing programs written in Java,
the overhead of bounds checking relative to a program
with no bounds checks is reduced from an average of
63\% to an average of 9\%.",
acknowledgement = ack-nhfb,
}
@Article{Calder:2007:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "4",
number = "1",
pages = "??--??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "1",
}
@Article{Constantinides:2007:ARC,
author = "Kypros Constantinides and Stephen Plaza and Jason
Blome and Valeria Bertacco and Scott Mahlke and Todd
Austin and Bin Zhang and Michael Orshansky",
title = "Architecting a reliable {CMP} switch architecture",
journal = j-TACO,
volume = "4",
number = "1",
pages = "??--??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "2",
}
@Article{Hwang:2007:SSA,
author = "Yuan-Shin Hwang and Jia-Jhe Li",
title = "Snug set-associative caches: {Reducing} leakage power
of instruction and data caches with no performance
penalties",
journal = j-TACO,
volume = "4",
number = "1",
pages = "??--??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "6",
}
@Article{Luo:2007:CNP,
author = "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan",
title = "Conserving network processor power consumption by
exploiting traffic variability",
journal = j-TACO,
volume = "4",
number = "1",
pages = "??--??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "4",
}
@Article{Rong:2007:SDS,
author = "Hongbo Rong and Zhizhong Tang and R. Govindarajan and
Alban Douillet and Guang R. Gao",
title = "Single-dimension software pipelining for
multidimensional loops",
journal = j-TACO,
volume = "4",
number = "1",
pages = "??--??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "7",
}
@Article{Sasanka:2007:AES,
author = "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
Yen-Kuang Chen and Eric Debes",
title = "{ALP}: {Efficient} support for all levels of
parallelism for complex media applications",
journal = j-TACO,
volume = "4",
number = "1",
pages = "??--??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "3",
}
@Article{Soteriou:2007:SDP,
author = "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh",
title = "Software-directed power-aware interconnection
networks",
journal = j-TACO,
volume = "4",
number = "1",
pages = "??--??",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "5",
}
@Article{Bower:2007:ODH,
author = "Fred A. Bower and Daniel J. Sorin and Sule Ozev",
title = "Online diagnosis of hard faults in microprocessors",
journal = j-TACO,
volume = "4",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1250727.1250728",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "We develop a microprocessor design that tolerates hard
faults, including fabrication defects and in-field
faults, by leveraging existing microprocessor
redundancy. To do this, we must: detect and correct
errors, diagnose hard faults at the field
deconfigurable unit (FDU) granularity, and deconfigure
FDUs with hard faults. In our reliable microprocessor
design, we use DIVA dynamic verification to detect and
correct errors. Our new scheme for diagnosing hard
faults tracks instructions' core structure occupancy
from decode until commit. If a DIVA checker detects an
error in an instruction, it increments a small
saturating error counter for every FDU used by that
instruction, including that DIVA checker. A hard fault
in an FDU quickly leads to an above-threshold error
counter for that FDU and thus diagnoses the fault. For
deconfiguration, we use previously developed schemes
for functional units and buffers and present a scheme
for deconfiguring DIVA checkers. Experimental results
show that our reliable microprocessor quickly and
accurately diagnoses each hard fault that is injected
and continues to function, albeit with somewhat
degraded performance.",
acknowledgement = ack-nhfb,
articleno = "8",
keywords = "fine-grained diagnosis; hard fault tolerance;
processor microarchitecture",
}
@Article{Michaud:2007:STM,
author = "Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
and Yiannakis Sazeides and Theofanis Constantinou",
title = "A study of thread migration in temperature-constrained
multicores",
journal = j-TACO,
volume = "4",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1250727.1250729",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Temperature has become an important constraint in
high-performance processors, especially multicores.
Thread migration will be essential to exploit the full
potential of future thermally constrained multicores.
We propose and study a thread migration method that
maximizes performance under a temperature constraint,
while minimizing the number of migrations and ensuring
fairness between threads. We show that thread migration
brings important performance gains and that it is most
effective during the first tens of seconds following a
decrease of the number of running threads.",
acknowledgement = ack-nhfb,
articleno = "9",
keywords = "multicore processor; power density; temperature;
thermal management; thread migration",
}
@Article{Chen:2007:CRL,
author = "Yu Chen and Fuxin Zhang",
title = "Code reordering on limited branch offset",
journal = j-TACO,
volume = "4",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1250727.1250730",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Since the 1980's code reordering has gained popularity
as an important way to improve the spatial locality of
programs. While the effect of the processor's
microarchitecture and memory hierarchy on this
optimization technique has been investigated, little
research has focused on the impact of the instruction
set. In this paper, we analyze the effect of limited
branch offset of the MIPS-like instruction set [Hwu et
al. 2004, 2005] on code reordering, explore two simple
methods to handle the exceeded branches, and propose
the bidirectional code layout (BCL) algorithm to reduce
the number of branches exceeding the offset limit. The
BCL algorithm sorts the chains according to the
position of related chains, avoids cache conflict
misses deliberately and lays out the code
bidirectionally. It strikes a balance among the
distance of related blocks, the instruction cache miss
rate, the memory size required, and the control flow
transfer. Experimental results show that BCL can
effectively reduce exceeded branches by 50.1\%, on
average, with up to 100\% for some programs. Except for
some programs with little spatial locality, the BCL
algorithm can achieve the performance, as the case with
no branch offset limitation.",
acknowledgement = ack-nhfb,
articleno = "10",
keywords = "code reordering; Godson Processor; link-time
optimization",
}
@Article{Terechko:2007:ICC,
author = "A. S. Terechko and H. Corporaal",
title = "Inter-cluster communication in {VLIW} architectures",
journal = j-TACO,
volume = "4",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1250727.1250731",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "The traditional VLIW (very long instruction word)
architecture with a single register file does not scale
up well to address growing performance demands on
embedded media processors. However, splitting a VLIW
processor in smaller clusters, which are comprised of
function units fully connected to local register files,
can significantly improve VLSI implementation
characteristics of the processor, such as speed, energy
consumption, and area. In our paper we reveal that
achieving the best characteristics of a clustered VLIW
requires a thorough selection of an Inter-cluster
Communication (ICC) model, which is the way clustering
is exposed in the Instruction Set Architecture. For our
study we, first, define a taxonomy of ICC models
including copy operations, dedicated issue slots,
extended operands, extended results, and multicast.
Evaluation of the execution time of the models requires
both the dynamic cycle count and clock period. We
developed an advanced instruction scheduler for all the
five ICC models in order to quantify the dynamic cycle
counts of our multimedia C benchmarks. To assess the
clock period of the ICC models we designed and laid out
VLIW datapaths using the RTL hardware descriptions
derived from a deeply pipelined commercial TriMedia
processor. In contrast to prior art, our research shows
that fully distributed register file architectures
(with eight clusters in our study) often underperform
compared to moderately clustered machines with two or
four clusters because of explosion of the cycle count
overhead in the former. Among the evaluated ICC models,
performance of the copy operation model, popular both
in academia and industry, is severely limited by the
copy operations hampering scheduling of regular
operations in high ILP (instruction-level parallelism)
code. The dedicated issue slots model combats this
limitation by dedicating extra VLIW issue slots purely
for ICC, reaching the highest 1.74 execution time
speedup relative to the unicluster. Furthermore, our
VLSI experiments show that the lowest area and energy
consumption of 42 and 57\% relative to the unicluster,
respectively, are achieved by the extended operands
model, which, nevertheless, provides higher performance
than the copy operation model.",
acknowledgement = ack-nhfb,
articleno = "11",
keywords = "clock frequency; cluster assignment; instruction-level
parallelism; instruction scheduler; intercluster
communication; optimizing compiler; pipelining;
register allocation; VLIW",
}
@Article{Dou:2007:CCM,
author = "Jialin Dou and Marcelo Cintra",
title = "A compiler cost model for speculative
parallelization",
journal = j-TACO,
volume = "4",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1250727.1250732",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Speculative parallelization is a technique that allows
code sections that cannot be fully analyzed by the
compiler to be aggressively executed in parallel.
However, while speculative parallelization can
potentially deliver significant speedups, several
overheads associated with this technique can limit
these speedups in practice. This paper proposes a novel
compiler static cost model of speculative multithreaded
execution that can be used to predict the resulting
performance. This model attempts to predict the
expected speedups, or slowdowns, of the candidate
speculative sections based on the estimation of the
combined runtime effects of various overheads, and
taking into account the scheduling restrictions of most
speculative execution environments. The model is based
on estimating the likely execution duration of threads
and considers all the possible permutations of these
threads. This model also produces a quantitative
estimate of the speedup, which is different from prior
heuristics that only qualitatively estimate the
benefits of speculative multithreaded execution. In
previous work, a limited version of the framework was
evaluated on a number of loops from a collection of
SPEC benchmarks that suffer mainly from load imbalance
and thread dispatch and commit overheads. In this work,
an extended framework is also evaluated on loops that
may suffer from data-dependence violations.
Experimental results show that prediction accuracy is
lower when loops with violations are included.
Nevertheless, accuracy is still very high for a static
model: the framework can identify, on average, 45\% of
the loops that cause slowdowns and, on average, 96\% of
the loops that lead to speedups; it predicts the
speedups or slowdowns with an error of less than 20\%
for an average of 28\% of the loops across the
benchmarks and with an error of less than 50\% for an
average of 80\% of the loops. Overall, the framework
often outperforms, by as much as 25\%, a naive approach
that attempts to speculatively parallelize all the
loops considered, and is able to curb the large
slowdowns caused in many cases by this naive
approach.",
acknowledgement = ack-nhfb,
articleno = "12",
keywords = "speculative multithreading; speculative
parallelization; thread-level speculation",
}
@Article{Amme:2007:SBM,
author = "Wolfram Amme and Jeffery von Ronne and Michael Franz",
title = "{SSA}-based mobile code: {Implementation} and
empirical evaluation",
journal = j-TACO,
volume = "4",
number = "2",
pages = "13:1--13:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1250727.1250733",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Although one might expect transportation formats based
on static single-assignment form (SSA) to yield faster
just-in-time compilation times than those based on
stack-based virtual machines, this claim has not
previously been validated, in practice. We attempt to
quantify the effect of using an SSA-based mobile code
representation by integrating support for a verifiable
SSA-based IR into Jikes RVM. Performance results,
measured with various optimizations and on both the
IA32 and PowerPC, show improvements in both compilation
time and code quality.",
acknowledgement = ack-nhfb,
articleno = "13",
keywords = "SafeTSA; static single-assignment form; virtual
machines",
}
@Article{Li:2007:CCE,
author = "Xiaodong Li and Ritu Gupta and Sarita V. Adve and
Yuanyuan Zhou",
title = "Cross-component energy management: {Joint} adaptation
of processor and memory",
journal = j-TACO,
volume = "4",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275938",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Researchers have proposed the use of adaptation to
reduce the energy consumption of different hardware
components, such as the processor, memory, disk, and
display for general-purpose applications. Previous
algorithms to control these adaptations, however, have
focused on a single component. This work takes the
first step toward developing algorithms that can
jointly control adaptations in multiple interacting
components for general-purpose applications, with the
goal of minimizing the total energy consumed within a
specified performance loss. Specifically, we develop a
joint-adaptation algorithm for processor and memory
adaptations. We identify two properties that enable
per-component algorithms to be easily used in a
cross-component context---the algorithms' performance
impact must be guaranteed and composable. We then
modify a current processor and a memory algorithm to
obey these properties. This allows the cross-component
problem to be reduced to determine an appropriate
(energy-optimal) allocation of the target performance
loss (slack) between the two components. We develop
such an optimal slack allocation algorithm that
exploits the above properties. The result is an
efficient cross-component adaptation framework that
minimizes the total energy of the processor and memory
without exceeding the target performance loss, while
substantially leveraging current per-component
algorithms. Our experiments show that joint processor
and memory adaptation provides significantly more
energy savings than adapting either component alone;
intelligent slack distribution is specifically
effective for highly compute- or memory-intensive
applications; and the performance slowdown never
exceeds the specification.",
acknowledgement = ack-nhfb,
articleno = "14",
keywords = "adaptive systems; control algorithms; energy
management; low-power design; memory; performance
guarantee; processor",
}
@Article{Gabor:2007:FES,
author = "Ron Gabor and Shlomo Weiss and Avi Mendelson",
title = "Fairness enforcement in switch on event
multithreading",
journal = j-TACO,
volume = "4",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275939",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "The need to reduce power and complexity will increase
the interest in Switch On Event multithreading
(coarse-grained multithreading). Switch On Event
multithreading is a low-power and low-complexity
mechanism to improve processor throughput by switching
threads on execution stalls. Fairness may, however,
become a problem in a multithreaded processor. Unless
fairness is properly handled, some threads may starve
while others consume all of the processor cycles.
Heuristics that were devised in order to improve
fairness in simultaneous multithreading are not
applicable to Switch On Event multithreading. This
paper defines the fairness metric using the ratio of
the individual threads' speedups and shows how it can
be enforced in Switch On Event multithreading. Fairness
is controlled by forcing additional thread switch
points. These switch points are determined dynamically
by runtime estimation of the single threaded
performance of each of the individual threads. We
analyze the impact of the fairness enforcement
mechanism on aggregate IPC and weighted speedup. We
present simulation results of the performance of Switch
On Event multithreading. Switch On Event multithreading
achieves an average aggregate IPC increase of 26\% over
single thread and 12\% weighted speedup when no
fairness is enforced. In this case, a sixth of our runs
resulted in poor fairness in which one thread ran
extremely slowly (10 to 100 times slower than its
single-thread performance), while the other thread's
performance was hardly affected. By using the proposed
mechanism, we can guarantee fairness at different
levels of strictness and, in most cases, even improve
the weighted speedup.",
acknowledgement = ack-nhfb,
articleno = "15",
keywords = "coarse-grained multithreading; fairness;
multithreading; performance; SOE; Switch on Event
multithreading; throughput; weighted speedup",
}
@Article{Andrade:2007:PAA,
author = "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
Doallo",
title = "Precise automatable analytical modeling of the cache
behavior of codes with indirections",
journal = j-TACO,
volume = "4",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275940",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "The performance of memory hierarchies, in which caches
play an essential role, is critical in nowadays
general-purpose and embedded computing systems because
of the growing memory bottleneck problem.
Unfortunately, cache behavior is very unstable and
difficult to predict. This is particularly true in the
presence of irregular access patterns, which exhibit
little locality. Such patterns are very common, for
example, in applications in which pointers or
compressed sparse matrices give place to indirections.
Nevertheless, cache behavior in the presence of
irregular access patterns has not been widely studied.
In this paper we present an extension of a systematic
analytical modeling technique based on PMEs
(probabilistic miss equations), previously developed by
the authors, that allows the automated analysis of the
cache behavior for codes with irregular access patterns
resulting from indirections. The model generates very
accurate predictions despite the irregularities and has
very low computing requirements, being the first model
that gathers these desirable characteristics that can
automatically analyze this kind of codes. These
properties enable this model to help drive compiler
optimizations, as we show with an example.",
acknowledgement = ack-nhfb,
articleno = "16",
keywords = "analytical modeling; irregular access patterns; memory
hierarchy; performance prediction",
}
@Article{Venstermans:2007:JOH,
author = "Kris Venstermans and Lieven Eeckhout and Koen De
Bosschere",
title = "{Java} object header elimination for reduced memory
consumption in 64-bit virtual machines",
journal = j-TACO,
volume = "4",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275941",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Memory performance is an important design issue for
contemporary computer systems given the huge
processor/memory speed gap. This paper proposes a
space-efficient Java object model for reducing the
memory consumption of 64-bit Java virtual machines. We
completely eliminate the object header through typed
virtual addressing (TVA) or implicit typing. TVA
encodes the object type in the object's virtual address
by allocating all objects of a given type in a
contiguous memory segment. This allows for removing the
type information as well as the status field from the
object header. Whenever type and status information is
needed, masking is applied to the object's virtual
address for obtaining an offset into type and status
information structures. Unlike previous work on
implicit typing, we apply TVA to a selected number of
frequently allocated object types, hence, the name
selective TVA (STVA); this limits the amount of memory
fragmentation. In addition to applying STVA, we also
compress the type information block (TIB) pointers for
all objects that do not fall under TVA. We implement
the space-efficient Java object model in the 64-bit
version of the Jikes RVM on an AIX IBM platform and
compare its performance against the traditionally used
Java object model using a multitude of Java benchmarks.
We conclude that the space-efficient Java object model
reduces memory consumption by on average 15\% (and up
to 45\% for some benchmarks). About one-half the
reduction comes from TIB pointer compression; the other
one-half comes from STVA. In terms of performance, the
space-efficient object model generally does not affect
performance; however, for some benchmarks we observe
statistically significant performance speedups, up to
20\%.",
acknowledgement = ack-nhfb,
articleno = "17",
keywords = "64-bit implementation; implicit typing; Java object
model; typed virtual addressing; Virtual machine",
}
@Article{Xiao:2007:VIS,
author = "Shu Xiao and Edmund M.-K. Lai",
title = "{VLIW} instruction scheduling for minimal power
variation",
journal = j-TACO,
volume = "4",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275942",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "The focus of this paper is on the minimization of the
variation in power consumed by a VLIW processor during
the execution of a target program through instruction
scheduling. The problem is formulated as a
mixed-integer program (MIP) and a problem-specific
branch-and-bound algorithm has been developed to solve
it more efficiently than generic MIP solvers.
Simulation results based on the TMS320C6711 VLIW
digital signal processor using benchmarks from
Mediabench and Trimaran showed that over 40\% average
reduction in power variation can be achieved without
sacrificing execution speed of these benchmarks.
Computational requirements and convergence rates of our
algorithm are also analyzed.",
acknowledgement = ack-nhfb,
articleno = "18",
keywords = "instruction scheduling; power variation reduction;
VLIW processors",
}
@Article{Tallam:2007:UCF,
author = "Sriraman Tallam and Rajiv Gupta",
title = "Unified control flow and data dependence traces",
journal = j-TACO,
volume = "4",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1275937.1275943",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "We describe the design, generation, and compression of
the extended whole program path (eWPP), representation
that not only captures the control flow history of a
program execution but also its data dependence history.
This representation is motivated by the observation
that, typically, a significant fraction of data
dependence history can be recovered from the control
flow trace. To capture the remainder of the data
dependence history, we introduce disambiguation checks
in the program whose control flow signatures capture
the results of the checks. The resulting extended
control flow trace enables the recovery of otherwise
irrecoverable data dependences. The code for the checks
is designed to minimize the increase in program
execution time and the extended control flow trace size
when compared to directly collecting control flow and
address traces. Our experiments show that compressed
eWPPs are only one-quarter of the size of combined
compressed control flow and address traces. However,
their collection incurs a 5{\times} increase in runtime
overhead relative to the overhead required for directly
collecting the control flow and address traces,
respectively.",
acknowledgement = ack-nhfb,
articleno = "19",
keywords = "address trace; control flow trace; dynamic data
dependence trace; profiling",
}
@Article{Ipek:2008:EAD,
author = "Engin Ipek and Sally A. McKee and Karan Singh and Rich
Caruana and Bronis R. de Supinski and Martin Schulz",
title = "Efficient architectural design space exploration via
predictive modeling",
journal = j-TACO,
volume = "4",
number = "4",
pages = "1:1--1:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1328195.1328196",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Efficiently exploring exponential-size architectural
design spaces with many interacting parameters remains
an open problem: the sheer number of experiments
required renders detailed simulation intractable. We
attack this via an automated approach that builds
accurate predictive models. We simulate sampled points,
using results to teach our models the function
describing relationships among design parameters. The
models can be queried and are very fast, enabling
efficient design tradeoff discovery. We validate our
approach via two uniprocessor sensitivity studies,
predicting IPC with only 1--2\% error. In an
experimental study using the approach, training on 1\%
of a 250-K-point CMP design space allows our models to
predict performance with only 4--5\% error. Our
predictive modeling combines well with techniques that
reduce the time taken by each simulation experiment,
achieving net time savings of three-four orders of
magnitude.",
acknowledgement = ack-nhfb,
articleno = "1",
keywords = "artificial neural networks; design space exploration;
performance prediction; sensitivity studies",
}
@Article{Shi:2008:VMS,
author = "Yunhe Shi and Kevin Casey and M. Anton Ertl and David
Gregg",
title = "Virtual machine showdown: {Stack} versus registers",
journal = j-TACO,
volume = "4",
number = "4",
pages = "2:1--2:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1328195.1328197",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Virtual machines (VMs) enable the distribution of
programs in an architecture-neutral format, which can
easily be interpreted or compiled. A long-running
question in the design of VMs is whether a stack
architecture or register architecture can be
implemented more efficiently with an interpreter. We
extend existing work on comparing virtual stack and
virtual register architectures in three ways. First,
our translation from stack to register code and
optimization are much more sophisticated. The result is
that we eliminate an average of more than 46\% of
executed VM instructions, with the bytecode size of the
register machine being only 26\% larger than that of
the corresponding stack one. Second, we present a fully
functional virtual-register implementation of the Java
virtual machine (JVM), which supports Intel, AMD64,
PowerPC and Alpha processors. This register VM supports
inline-threaded, direct-threaded, token-threaded, and
switch dispatch. Third, we present experimental results
on a range of additional optimizations such as register
allocation and elimination of redundant heap loads. On
the AMD64 architecture the register machine using
switch dispatch achieves an average speedup of 1.48
over the corresponding stack machine. Even using the
more efficient inline-threaded dispatch, the register
VM achieves a speedup of 1.15 over the equivalent
stack-based VM.",
acknowledgement = ack-nhfb,
articleno = "2",
keywords = "interpreter; register architecture; stack
architecture; virtual machine",
}
@Article{Yan:2008:EVR,
author = "Jun Yan and Wei Zhang",
title = "Exploiting virtual registers to reduce pressure on
real registers",
journal = j-TACO,
volume = "4",
number = "4",
pages = "3:1--3:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1328195.1328198",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "It is well known that a large fraction of variables
are short-lived. This paper proposes a novel approach
to exploiting this fact to reduce the register pressure
for pipelined processors with data-forwarding network.
The idea is that the compiler can allocate virtual
registers (i.e., place holders to identify dependences
among instructions) to short-lived variables, which do
not need to be stored to physical storage locations. As
a result, real registers (i.e., physically existed
registers) can be reserved for long-lived variables for
mitigating the register pressure and decreasing the
register spills, leading to performance improvement. In
this paper, we develop the architectural and compiler
support for exploiting virtual registers for statically
scheduled processors. Our experimental results show
that virtual registers are very effective at reducing
the register spills, which, in many cases, can achieve
the performance close to the processor with twice
number of real registers. Our results also indicate
that, for some applications, using 24 virtual, in
addition to 8 real registers, can attain even higher
performance than that of 16 real without any virtual
registers.",
acknowledgement = ack-nhfb,
articleno = "3",
keywords = "data forwarding; register allocation; register file;
short-lived variables; virtual register",
}
@Article{Yu:2008:OCL,
author = "Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang",
title = "Object co-location and memory reuse for {Java}
programs",
journal = j-TACO,
volume = "4",
number = "4",
pages = "4:1--4:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1328195.1328199",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "We introduce a new memory management system, STEMA,
which can improve the execution time of Java programs.
STEMA detects prolific types on-the-fly and co-locates
their objects in a special memory space which supports
reuse of memory. We argue and show that memory reuse
and co-location of prolific objects can result in
improved cache locality, reduced memory fragmentation,
reduced GC time, and faster object allocation. We
evaluate STEMA using 16 benchmarks. Experimental
results show that STEMA performs 2.7\%, 4.0\%, and
8.2\% on average better than MarkSweep, CopyMS, and
SemiSpace.",
acknowledgement = ack-nhfb,
articleno = "4",
keywords = "garbage collector; Java; memory allocator; memory
reuse; mutator; object co-location",
}
@Article{Zhang:2008:RCM,
author = "Chuanjun Zhang",
title = "Reducing cache misses through programmable decoders",
journal = j-TACO,
volume = "4",
number = "4",
pages = "5:1--5:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1328195.1328200",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Level-one caches normally reside on a processor's
critical path, which determines clock frequency.
Therefore, fast access to level-one cache is important.
Direct-mapped caches exhibit faster access time, but
poor hit rates, compared with same sized
set-associative caches because of nonuniform accesses
to the cache sets. The nonuniform accesses generate
more cache misses in some sets, while other sets are
underutilized. We propose to increase the decoder
length and, hence, reduce the accesses to heavily used
sets without dynamically detecting the cache set usage
information. We increase the access to the
underutilized cache sets by incorporating a replacement
policy into the cache design using programmable
decoders. On average, the proposed techniques achieve
as low a miss rate as a traditional 4-way cache on all
26 SPEC2K benchmarks for the instruction and data
caches, respectively. This translates into an average
IPC improvement of 21.5 and 42.4\% for SPEC2K integer
and floating-point benchmarks, respectively. The
B-Cache consumes 10.5\% more power per access, but
exhibits a 12\% total memory access-related energy
savings as a result of the miss rate reductions, and,
hence, the reduction to applications' execution time.
Compared with previous techniques that aim at reducing
the miss rate of direct-mapped caches, our technique
requires only one cycle to access all cache hits and
has the same access time of a direct-mapped cache.",
acknowledgement = ack-nhfb,
articleno = "5",
keywords = "cache; dynamic optimization; low power",
}
@Article{Golander:2008:HMP,
author = "Amit Golander and Shlomo Weiss",
title = "Hiding the misprediction penalty of a
resource-efficient high-performance processor",
journal = j-TACO,
volume = "4",
number = "4",
pages = "6:1--6:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1328195.1328201",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Misprediction is a major obstacle for increasing
speculative out-of-order processors performance.
Performance degradation depends on both the number of
misprediction events and the recovery time associated
with each one of them. In recent years a few checkpoint
based microarchitectures have been proposed. In
comparison with ROB-based processors, checkpoint
processors are scalable and highly resource efficient.
Unfortunately, in these proposals the misprediction
recovery time is proportional to the instruction queue
size.\par
In this paper we analyze methods to reduce the
misprediction recovery time. We propose a new register
file management scheme and techniques to selectively
flush the instruction queue and the load store queue,
and to isolate deeply pipelined execution units. The
result is a novel checkpoint processor with Constant
misprediction RollBack time (CRB). We further present a
streamlined, cost-efficient solution, which saves
complexity at the price of slightly lower
performance.",
acknowledgement = ack-nhfb,
articleno = "6",
keywords = "checkpoints; misprediction; out-of-order execution;
rollback; scalable architecture",
}
@Article{Calder:2008:E,
author = "Brad Calder and Dean Tullsen",
title = "Editorial",
journal = j-TACO,
volume = "5",
number = "1",
pages = "1:1--1:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1369396.1369397",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "http://portal.acm.org/",
acknowledgement = ack-nhfb,
articleno = "1",
}
@Article{Mysore:2008:FIP,
author = "Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber
and Timothy Sherwood and Nisheeth Shrivastava and
Subhash Suri",
title = "Formulating and implementing profiling over adaptive
ranges",
journal = j-TACO,
volume = "5",
number = "1",
pages = "2:1--2:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1369396.1369398",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Modern computer systems are called on to deal with
billions of events every second, whether they are
executed instructions, accessed memory locations, or
forwarded packets. This presents a serious challenge to
those who seek to quantify, analyze, or optimize such
systems, because important trends and behaviors may
easily be lost in a sea of data. We present
range-adaptive profiling (RAP) as a new and
general-purpose profiling method capable of
hierarchically efficiently classifying streams of data
in hardware. Through the use of RAP, events in an input
stream are dynamically classified into increasingly
precise categories, based on the frequency with which
they occur. The more important a class, or range of
events, the more precisely it is quantified. Despite
the dynamic nature of our technique, we build upon
tight theoretic bounds covering both worst-case error,
as well as the required memory. In the limit, it is
known that error and the memory bounds can be
independent of the stream size and grow only linearly
with the level of precision desired. Significantly, we
expose the critical constants in these algorithms and
through careful engineering, algorithm redesign, and
use of heuristics, we show how a high-performance
profile system can be implemented for range-adaptive
profiling. RAP can be used on various profiles, such as
PCs, load values, and memory addresses, and has a broad
range of uses, from hot-region profiling to quantifying
cache miss value locality. We propose two methods of
implementation of RAP, one in software and the other
with specialized hardware, for which we also describe
our prototype FPGA implementation. We show that with
just 8KB of memory, range profiles can be gathered with
an average accuracy of 98\%.",
acknowledgement = ack-nhfb,
articleno = "2",
keywords = "profiling hardware; range adaptive; value locality",
}
@Article{Zhai:2008:CHS,
author = "Antonia Zhai and J. Gregory Steffan and Christopher B.
Colohan and Todd C. Mowry",
title = "Compiler and hardware support for reducing the
synchronization of speculative threads",
journal = j-TACO,
volume = "5",
number = "1",
pages = "3:1--3:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1369396.1369399",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Thread-level speculation (TLS) allows us to
automatically parallelize general-purpose programs by
supporting parallel execution of threads that might not
actually be independent. In this article, we focus on
one important limitation of program performance under
TLS, which stalls as a result of synchronizing and
forwarding scalar values between speculative threads
that would otherwise cause frequent data dependences
and, hence, failed speculation. Using SPECint
benchmarks that have been automatically transformed by
our compiler to exploit TLS, we present, evaluate in
detail, and compare both compiler and hardware
techniques for improving the communication of scalar
values. We find that through our dataflow algorithms
for three increasingly aggressive instruction
scheduling techniques, the compiler can drastically
reduce the critical forwarding path introduced by the
synchronization and forwarding of scalar values. We
also show that hardware techniques for reducing
synchronization can be complementary to compiler
scheduling, but that the additional performance
benefits are minimal and are generally not worth the
cost.",
acknowledgement = ack-nhfb,
articleno = "3",
keywords = "automatic parallelization; chip-multiprocessing;
instruction scheduling; thread-level speculation",
}
@Article{Winter:2008:ATN,
author = "Jonathan A. Winter and David H. Albonesi",
title = "Addressing thermal nonuniformity in {SMT} workloads",
journal = j-TACO,
volume = "5",
number = "1",
pages = "4:1--4:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1369396.1369400",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "We explore DTM techniques within the context of
uniform and nonuniform SMT workloads. While DVS is
suitable for addressing workloads with uniformly high
temperatures, for nonuniform workloads, performance
loss occurs because of the slowdown of the cooler
thread. To address this, we propose and evaluate DTM
mechanisms that exploit the steering-based thread
management mechanisms inherent in a clustered SMT
architecture. We show that in contrast to DVS, which
operates globally, our techniques are more effective at
controlling temperature for nonuniform workloads.
Furthermore, we devise a DTM technique that combines
steering and DVS to achieve consistently good
performance across all workloads.",
acknowledgement = ack-nhfb,
articleno = "4",
keywords = "adaptive microarchitectures; clustered
microarchitectures; dynamic thermal management; dynamic
voltage scaling; simultaneous multithreading",
}
@Article{Shahbahrami:2008:VES,
author = "Asadollah Shahbahrami and Ben Juurlink and Stamatis
Vassiliadis",
title = "Versatility of extended subwords and the matrix
register file",
journal = j-TACO,
volume = "5",
number = "1",
pages = "5:1--5:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1369396.1369401",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Extended subwords and the matrix register file (MRF)
are two micro architectural techniques that address
some of the limitations of existing SIMD architectures.
Extended subwords are wider than the data stored in
memory. Specifically, for every byte of data stored in
memory, there are four extra bits in the media register
file. This avoids the need for data-type conversion
instructions. The MRF is a register file organization
that provides both conventional row-wise, as well as
column-wise, access to the register file. In other
words, it allows to view the register file as a matrix
in which corresponding subwords in different registers
corresponds to a column of the matrix. It was
introduced to accelerate matrix transposition which is
a very common operation in multimedia applications. In
this paper, we show that the MRF is very versatile,
since it can also be used for other permutations than
matrix transposition. Specifically, it is shown how it
can be used to provide efficient access to strided
data, as is needed in, e.g., color space conversion.
Furthermore, it is shown that special-purpose
instructions (SPIs), such as the sum-of-absolute
differences (SAD) instruction, have limited usefulness
when extended subwords and a few general SIMD
instructions that we propose are supported, for the
following reasons. First, when extended subwords are
supported, the SAD instruction provides only a
relatively small performance improvement. Second, the
SAD instruction processes 8-bit subwords only, which is
not sufficient for quarter-pixel resolution nor for
cost functions used in image and video retrieval.
Results obtained by extending the SimpleScalar toolset
show that the proposed techniques provide a speedup of
up to 3.00 over the MMX architecture. The results also
show that using, at most, 13 extra media registers
yields an additional performance improvement ranging
from 1.38 to 1.57.",
acknowledgement = ack-nhfb,
articleno = "5",
keywords = "multimedia standards; SIMD architectures; SIMD
programming",
}
@Article{Guo:2008:EHC,
author = "Zhi Guo and Walid Najjar and Betul Buyukkurt",
title = "Efficient hardware code generation for {FPGAs}",
journal = j-TACO,
volume = "5",
number = "1",
pages = "6:1--6:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1369396.1369402",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "The wider acceptance of FPGAs as a computing device
requires a higher level of programming abstraction.
ROCCC is an optimizing C to HDL compiler. We describe
the code generation approach in ROCCC. The smart buffer
is a component that reuses input data between adjacent
iterations. It significantly improves the performance
of the circuit and simplifies loop control. The
ROCCC-generated datapath can execute one loop iteration
per clock cycle when there is no loop dependency or
there is only scalar recurrence variable dependency.
ROCCC's approach to supporting while-loops operating on
scalars makes the compiler able to move scalar
iterative computation into hardware.",
acknowledgement = ack-nhfb,
articleno = "6",
keywords = "data reuse; FPGA; high-level synthesis; reconfigurable
computing; VHDL",
}
@Article{Kotzmann:2008:DJH,
author = "Thomas Kotzmann and Christian Wimmer and Hanspeter
M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth
Russell and David Cox",
title = "Design of the {Java HotSpot\TM} client compiler for
{Java 6}",
journal = j-TACO,
volume = "5",
number = "1",
pages = "7:1--7:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1369396.1370017",
ISSN = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Version 6 of Sun Microsystems' Java HotSpot{\TM} VM
ships with a redesigned version of the client
just-in-time compiler that includes several research
results of the last years. The client compiler is at
the heart of the VM configuration used by default for
interactive desktop applications. For such
applications, low startup and pause times are more
important than peak performance. This paper outlines
the new architecture of the client compiler and shows
how it interacts with the VM. It presents the
intermediate representation that now uses static
single-assignment (SSA) form and the linear scan
algorithm for global register allocation. Efficient
support for exception handling and deoptimization
fulfills the demands that are imposed by the dynamic
features of the Java programming language. The
evaluation shows that the new client compiler generates
better code in less time. The popular SPECjvm98
benchmark suite is executed 45\% faster, while the
compilation speed is also up to 40\% better. This
indicates that a carefully selected set of global
optimizations can also be integrated in just-in-time
compilers that focus on compilation speed and not on
peak performance. In addition, the paper presents the
impact of several optimizations on execution and
compilation speed. As the source code is freely
available, the Java HotSpot{\TM} VM and the client
compiler are the ideal basis for experiments with new
feedback-directed optimizations in a production-level
Java just-in-time compiler. The paper outlines research
projects that add fast algorithms for escape analysis,
automatic object inlining, and array bounds check
elimination.",
acknowledgement = ack-nhfb,
articleno = "7",
keywords = "compiler; deoptimization; intermediate representation;
Java; just-in-time compilation; optimization; register
allocation",
}
@Article{Rangan:2008:PSD,
author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni
and David I. August",
title = "Performance scalability of decoupled software
pipelining",
journal = j-TACO,
volume = "5",
number = "2",
pages = "8:1--8:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1400112.1400113",
ISSN = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Any successful solution to using multicore processors
to scale general-purpose program performance will have
to contend with rising intercore communication costs
while exposing coarse-grained parallelism. Recently
proposed pipelined multithreading (PMT) techniques have
been demonstrated to have general-purpose applicability
and are also able to effectively tolerate inter-core
latencies through pipelined interthread communication.
These desirable properties make PMT techniques strong
candidates for program parallelization on current and
future multicore processors and understanding their
performance characteristics is critical to their
deployment. To that end, this paper evaluates the
performance scalability of a general-purpose PMT
technique called decoupled software pipelining (DSWP)
and presents a thorough analysis of the communication
bottlenecks that must be overcome for optimal DSWP
scalability.",
acknowledgement = ack-nhfb,
articleno = "8",
keywords = "decoupled software pipelining; performance analysis",
}
@Article{Long:2008:TMM,
author = "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and
Rajarshi Mukherjee",
title = "Thermal monitoring mechanisms for chip
multiprocessors",
journal = j-TACO,
volume = "5",
number = "2",
pages = "9:1--9:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1400112.1400114",
ISSN = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "With large-scale integration and increasing power
densities, thermal management has become an important
tool to maintain performance and reliability in modern
process technologies. In the core of dynamic thermal
management schemes lies accurate reading of on-die
temperatures. Therefore, careful planning and embedding
of thermal monitoring mechanisms into high-performance
systems becomes crucial. In this paper, we propose
three techniques to create sensor infrastructures for
monitoring the maximum temperature on a multicore
system. Initially, we extend a nonuniform sensor
placement methodology proposed in the literature to
handle chip multiprocessors (CMPs) and show its
limitations. We then analyze a grid-based approach
where the sensors are placed on a static grid covering
each core and show that the sensor readings can differ
from the actual maximum core temperature by as much as
12.6^\circ C when using 16 sensors per core. Also, as
large as 10.6\% of the thermal emergencies are not
captured using the same number of sensors. Based on
this observation, we first develop an interpolation
scheme, which estimates the maximum core temperature
through interpolation of the readings collected at the
static grid points. We show that the interpolation
scheme improves the measurement accuracy and emergency
coverage compared to grid-based placement when using
the same number of sensors. Second, we present a
dynamic scheme where only a subset of the sensor
readings is collected to predict the maximum
temperature of each core. Our results indicate that, we
can reduce the number of active sensors by as much as
50\%, while maintaining similar measurement accuracy
and emergency coverage compared to the case where the
entire sensor set on the grid is sampled at all
times.",
acknowledgement = ack-nhfb,
articleno = "9",
keywords = "nonuniform and uniform sensor placement; thermal
sensor allocation",
}
@Article{Joshi:2008:DEP,
author = "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell,
Jr.} and Lizy K. John",
title = "Distilling the essence of proprietary workloads into
miniature benchmarks",
journal = j-TACO,
volume = "5",
number = "2",
pages = "10:1--10:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1400112.1400115",
ISSN = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Benchmarks set standards for innovation in computer
architecture research and industry product development.
Consequently, it is of paramount importance that these
workloads are representative of real-world
applications. However, composing such representative
workloads poses practical challenges to application
analysis teams and benchmark developers (1) real-world
workloads are intellectual property and vendors
hesitate to share these proprietary applications; and
(2) porting and reducing these applications to
benchmarks that can be simulated in a tractable amount
of time is a nontrivial task. In this paper, we address
this problem by proposing a technique that
automatically distills key inherent behavioral
attributes of a proprietary workload and captures them
into a miniature synthetic benchmark clone. The
advantage of the benchmark clone is that it hides the
functional meaning of the code but exhibits similar
performance characteristics as the target application.
Moreover, the dynamic instruction count of the
synthetic benchmark clone is substantially shorter than
the proprietary application, greatly reducing overall
simulation time for SPEC CPU, the simulation time
reduction is over five orders of magnitude compared to
entire benchmark execution. Using a set of benchmarks
representative of general-purpose, scientific, and
embedded applications, we demonstrate that the power
and performance characteristics of the synthetic
benchmark clone correlate well with those of the
original application across a wide range of
microarchitecture configurations.",
acknowledgement = ack-nhfb,
articleno = "10",
keywords = "benchmark cloning; benchmarks; workload
characterization",
}
@Article{Catania:2008:RCM,
author = "Vincenzo Catania and Maurizio Palesi and Davide
Patti",
title = "Reducing complexity of multiobjective design space
exploration in {VLIW}-based embedded systems",
journal = j-TACO,
volume = "5",
number = "2",
pages = "11:1--11:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1400112.1400116",
ISSN = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "http://portal.acm.org/",
abstract = "Architectures based on very-long instruction word
(VLIW) have found fertile ground in multimedia
electronic appliances thanks to their ability to
exploit high degrees of instruction level parallelism
(ILP) with a reasonable trade-off in complexity and
silicon cost. Specialization of such architectures
involves the configuration of both hardware-related
aspects (e.g., register files, functional units, memory
subsystem) and software-related issues (e.g., the
compilation strategy). The complex interactions between
the components of such systems will force a human
designer to rely on judgment and experience in
designing them, possibly eliminating interesting
configurations, and making tuning of the system, for
either power, energy, or performance, difficult. In
this paper we propose tools and methodologies to
efficiently cope with this complexity from a
multiobjective perspective. We first analyze the impact
of ILP-oriented code transformations using two
alternative compilation profiles to quantitatively show
the effect of such transformations on typical design
objectives like performance, power dissipation, and
energy consumption. Next, by means of statistical
analysis, we collect useful data to predict the
effectiveness of a given compilation profiles for a
specific application. Information gathered from such
analysis can be exploited to drastically reduce the
computational effort needed to perform the design space
exploration.",
acknowledgement = ack-nhfb,
articleno = "11",
keywords = "design space exploration; energy; genetic algorithms;
hyperblock formation; ILP; multiobjective optimization;
performances; power; statistical analysis; VLIW
architectures",
}
@Article{Leverich:2008:CEM,
author = "Jacob Leverich and Hideho Arakida and Alex
Solomatnikov and Amin Firoozshahian and Mark Horowitz
and Christos Kozyrakis",
title = "Comparative evaluation of memory models for chip
multiprocessors",
journal = j-TACO,
volume = "5",
number = "3",
pages = "12:1--12:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1455650.1455651",
ISSN = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "http://portal.acm.org/",
abstract = "There are two competing models for the on-chip memory
in Chip Multiprocessor (CMP) systems: {\em
hardware-managed coherent caches\/} and {\em
software-managed streaming memory}. This paper performs
a direct comparison of the two models under the same
set of assumptions about technology, area, and
computational capabilities. The goal is to quantify how
and when they differ in terms of performance, energy
consumption, bandwidth requirements, and latency
tolerance for general-purpose CMPs. We demonstrate that
for data-parallel applications on systems with up to 16
cores, the cache-based and streaming models perform and
scale equally well. For certain applications with
little data reuse, streaming scales better due to
better bandwidth use and macroscopic software
prefetching. However, the introduction of techniques
such as hardware prefetching and nonallocating stores
to the cache-based model eliminates the streaming
advantage. Overall, our results indicate that there is
not sufficient advantage in building streaming memory
systems where all on-chip memory structures are
explicitly managed. On the other hand, we show that
streaming at the programming model level is
particularly beneficial, even with the cache-based
model, as it enhances locality and creates
opportunities for bandwidth optimizations. Moreover, we
observe that stream programming is actually easier with
the cache-based model because the hardware guarantees
correct, best-effort execution even when the programmer
cannot fully regularize an application's code.",
acknowledgement = ack-nhfb,
articleno = "12",
keywords = "cache coherence; Chip multiprocessors; locality
optimizations; parallel programming; streaming memory",
}
@Article{Sharkey:2008:RRP,
author = "Joseph J. Sharkey and Jason Loew and Dmitry V.
Ponomarev",
title = "Reducing register pressure in {SMT} processors through
{L2}-miss-driven early register release",
journal = j-TACO,
volume = "5",
number = "3",
pages = "13:1--13:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1455650.1455652",
ISSN = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "http://portal.acm.org/",
abstract = "The register file is one of the most critical datapath
components limiting the number of threads that can be
supported on a simultaneous multithreading (SMT)
processor. To allow the use of smaller register files
without degrading performance, techniques that maximize
the efficiency of using registers through aggressive
register allocation/deallocation can be considered. In
this article, we propose a novel technique to early
deallocate physical registers allocated to threads
which experience L2 cache misses. This is accomplished
by speculatively committing the load-independent
instructions and deallocating the registers
corresponding to the previous mappings of their
destinations, without waiting for the cache miss
request to be serviced. The early deallocated registers
are then made immediately available for allocation to
instructions within the same thread as well as within
other threads, thus improving the overall processor
throughput. On the average across the simulated mixes
of multiprogrammed SPEC 2000 workloads, our technique
results in 33\% improvement in throughput and 25\%
improvement in terms of harmonic mean of weighted IPCs
over the baseline SMT with the state-of-the-art DCRA
policy. This is achieved without creating checkpoints,
maintaining per-register counters of pending consumers,
performing tag rebroadcasts, register remappings,
and/or additional associative searches.",
acknowledgement = ack-nhfb,
articleno = "13",
keywords = "register file; Simultaneous multithreading",
}
@Article{Mehrara:2008:ESP,
author = "Mojtaba Mehrara and Todd Austin",
title = "Exploiting selective placement for low-cost memory
protection",
journal = j-TACO,
volume = "5",
number = "3",
pages = "14:1--14:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1455650.1455653",
ISSN = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "http://portal.acm.org/",
abstract = "Many embedded processing applications, such as those
found in the automotive or medical field, require
hardware designs that are at the same time low cost and
reliable. Traditionally, reliable memory systems have
been implemented using coded storage techniques, such
as ECC. While these designs can effectively detect and
correct memory faults such as transient errors and
single-bit defects, their use bears a significant cost
overhead. In this article, we propose a novel partial
memory protection scheme that provides high-coverage
fault protection for program code and data, but with
much lower cost than traditional approaches. Our
approach profiles program code and data usage to assess
which program elements are most critical to maintaining
program correctness. Critical code and variables are
then placed into a limited protected storage resources.
To ensure high coverage of program elements, our
placement technique considers all program components
simultaneously, including code, global variables, stack
frames, and heap variables. The fault coverage of our
approach is gauged using Monte Carlo fault-injection
experiments, which confirm that our technique provides
high levels of fault protection (99\% coverage) with
limited memory protection resources (36\% protected
area).",
acknowledgement = ack-nhfb,
articleno = "14",
keywords = "fault-tolerant design; memory system design; Partial
memory protection; selective placement; transient
faults",
}
@Article{Vandierendonck:2008:SRA,
author = "Hans Vandierendonck and Andr{\'e} Seznec",
title = "Speculative return address stack management
revisited",
journal = j-TACO,
volume = "5",
number = "3",
pages = "15:1--15:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1455650.1455654",
ISSN = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "http://portal.acm.org/",
abstract = "Branch prediction feeds a speculative execution
processor core with instructions. Branch mispredictions
are inevitable and have negative effects on performance
and energy consumption. With the advent of highly
accurate conditional branch predictors, nonconditional
branch instructions are gaining importance.\par
In this article, we address the prediction of procedure
returns. On modern processors, procedure returns are
predicted through a return address stack (RAS). The
overwhelming majority of the return mispredictions are
due to RAS overflows and/or overwriting the top entries
of the RAS on a mispredicted path. These sources of
misprediction were addressed by previously proposed
speculative return address stacks [Jourdan et al. 1996;
Skadron et al. 1998]. However, the remaining
misprediction rate of these RAS designs is still
significant when compared to state-of-the-art
conditional predictors.\par
We present two low-cost corruption detectors for RAS
predictors. They detect RAS overflows and wrong path
corruption with 100\% coverage. As a consequence, when
such a corruption is detected, another source can be
used for predicting the return. On processors featuring
a branch target buffer (BTB), this BTB can be used as a
free backup predictor for predicting returns when
corruption is detected.\par
Our experiments show that our proposal can be used to
improve the behavior of all previously proposed
speculative RASs. For instance, without any specific
management of the speculative states on the RAS, an
8-entry BTB-backed up RAS achieves the same performance
level as a state-of-the-art, but complex, 64-entry
self-checkpointing RAS [Jourdan et al. 1996].
Therefore, our proposal can be used either to improve
the performance of the processor or to reduce its
hardware complexity.",
acknowledgement = ack-nhfb,
articleno = "15",
keywords = "back-up predictor; corruption detection; Return
address prediction",
}
@Article{Chhabra:2009:MSP,
author = "Siddhartha Chhabra and Brian Rogers and Yan Solihin
and Milos Prvulovic",
title = "Making secure processors {OS}- and
performance-friendly",
journal = j-TACO,
volume = "5",
number = "4",
pages = "16:1--16:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1498690.1498691",
ISSN = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "In today's digital world, computer security issues
have become increasingly important. In particular,
researchers have proposed designs for secure processors
that utilize hardware-based memory encryption and
integrity verification to protect the privacy and
integrity of computation even from sophisticated
physical attacks. However, currently proposed schemes
remain hampered by problems that make them impractical
for use in today's computer systems: lack of virtual
memory and Inter-Process Communication support as well
as excessive storage and performance overheads. In this
article, we propose (1) address independent seed
encryption (AISE), a counter-mode-based memory
encryption scheme using a novel seed composition, and
(2) bonsai Merkle trees (BMT), a novel Merkle
tree-based memory integrity verification technique, to
eliminate these system and performance issues
associated with prior counter-mode memory encryption
and Merkle tree integrity verification schemes. We
present both a qualitative discussion and a
quantitative analysis to illustrate the advantages of
our techniques over previously proposed approaches in
terms of complexity, feasibility, performance, and
storage. Our results show that AISE+BMT reduces the
overhead of prior memory encryption and integrity
verification schemes from 12\% to 2\% on average for
single-threaded benchmarks on uniprocessor systems, and
from 15\% to 4\% for coscheduled benchmarks on
multicore systems while eliminating critical
system-level problems.",
acknowledgement = ack-nhfb,
articleno = "16",
keywords = "memory encryption; memory integrity verification;
Secure processor architectures; virtualization",
}
@Article{Jimenez:2009:GNB,
author = "Daniel A. Jim{\'e}nez",
title = "Generalizing neural branch prediction",
journal = j-TACO,
volume = "5",
number = "4",
pages = "17:1--17:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1498690.1498692",
ISSN = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Improved branch prediction accuracy is essential to
sustaining instruction throughput with today's deep
pipelines. Traditional branch predictors exploit
correlations between pattern history and branch outcome
to predict branches, but there is a stronger and more
natural correlation between path history and branch
outcome. We explore the potential for exploiting this
correlation. We introduce {\em piecewise linear branch
prediction}, an idealized branch predictor that
develops a set of linear functions, one for each
program path to the branch to be predicted, that
separate predicted taken from predicted not taken
branches. Taken together, all of these linear functions
form a piecewise linear decision surface. We present a
limit study of this predictor showing its potential to
greatly improve predictor accuracy.\par
We then introduce a practical implementable branch
predictor based on piecewise linear branch prediction.
In making our predictor practical, we show how a
parameterized version of it unifies the previously
distinct concepts of perceptron prediction and
path-based neural prediction. Our new branch predictor
has implementation costs comparable to current
prominent predictors in the literature while
significantly improving accuracy. For a deeply
pipelined simulated microarchitecture our predictor
with a 256-KB hardware budget improves the harmonic
mean normalized instructions-per-cycle rate by 8\% over
both the original path-based neural predictor and
2Bc-{\em gskew}. The average misprediction rate is
decreased by 16\% over the path-based neural predictor
and by 22\% over 2Bc-{\em gskew}.",
acknowledgement = ack-nhfb,
articleno = "17",
keywords = "Branch prediction; machine learning",
}
@Article{Jeon:2009:AAP,
author = "Jinseong Jeon and Keoncheol Shin and Hwansoo Han",
title = "Abstracting access patterns of dynamic memory using
regular expressions",
journal = j-TACO,
volume = "5",
number = "4",
pages = "18:1--18:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1498690.1498693",
ISSN = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Unless the speed gap between CPU and memory
disappears, efficient memory usage remains a decisive
factor for performance. To optimize data usage of
programs in the presence of the memory hierarchy, we
are particularly interested in two compiler techniques:
{\em pool allocation\/} and {\em field layout
restructuring}. Since foreseeing runtime behaviors of
programs at compile time is difficult, most of the
previous work relied on profiling. On the contrary, our
goal is to develop a fully automatic compiler that
statically transforms input codes to use memory
efficiently. Noticing that {\em regular expressions},
which denote repetition explicitly, are sufficient for
memory access patterns, we describe how to extract
memory access patterns as regular expressions in
detail. Based on static patterns presented in regular
expressions, we apply pool allocation to repeatedly
accessed structures and exploit field layout
restructuring according to field affinity relations of
chosen structures. To make a scalable framework, we
devise and apply new abstraction techniques, which
build and interpret access patterns for the whole
programs in a bottom-up fashion. We implement our
analyses and transformations with the CIL compiler. To
verify the effect and scalability of our scheme, we
examine 17 benchmarks including 2 SPECINT 2000
benchmarks whose source lines of code are larger than
10,000. Our experiments demonstrate that the static
layout transformations for dynamic memory can reduce
L1D cache misses by 16\% and execution times by 14\% on
average.",
acknowledgement = ack-nhfb,
articleno = "18",
keywords = "Access patterns; field affinity; layout
transformation; pool allocation; regular expressions",
}
@Article{Shobaki:2009:OTS,
author = "Ghassan Shobaki and Kent Wilken and Mark Heffernan",
title = "Optimal trace scheduling using enumeration",
journal = j-TACO,
volume = "5",
number = "4",
pages = "19:1--19:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1498690.1498694",
ISSN = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "This article presents the first optimal algorithm for
trace scheduling. The trace is a global scheduling
region used by compilers to exploit instruction-level
parallelism across basic block boundaries. Several
heuristic techniques have been proposed for trace
scheduling, but the precision of these techniques has
not been studied relative to optimality. This article
describes a technique for finding provably optimal
trace schedules, where optimality is defined in terms
of a weighted sum of schedule lengths across all code
paths in a trace. The optimal algorithm uses
branch-and-bound enumeration to efficiently explore the
entire solution space. Experimental evaluation of the
algorithm shows that, with a time limit of 1 s per
problem, 91\% of the hard trace scheduling problems in
the SPEC CPU 2006 Integer Benchmarks are solved
optimally. For 58\% of these hard problems, the optimal
schedule is improved compared to that produced by a
heuristic scheduler with a geometric mean improvement
of 3.2\% in weighted schedule length and 18\% in
compensation code size.",
acknowledgement = ack-nhfb,
articleno = "19",
keywords = "branch-and-bound enumeration; compiler optimizations;
global instruction scheduling; instruction-level
parallelism; Instruction scheduling; optimal
instruction scheduling; trace scheduling",
}
@Article{Kulkarni:2009:PEO,
author = "Prasad A. Kulkarni and David B. Whalley and Gary S.
Tyson and Jack W. Davidson",
title = "Practical exhaustive optimization phase order
exploration and evaluation",
journal = j-TACO,
volume = "6",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1509864.1509865",
ISSN = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Choosing the most appropriate optimization phase
ordering has been a long-standing problem in compiler
optimizations. Exhaustive evaluation of all possible
orderings of optimization phases for each function is
generally dismissed as infeasible for
production-quality compilers targeting accepted
benchmarks. In this article, we show that it is
possible to exhaustively evaluate the optimization
phase order space for each function in a reasonable
amount of time for most of the functions in our
benchmark suite. To achieve this goal, we used various
techniques to significantly prune the optimization
phase order search space so that it can be
inexpensively enumerated in most cases and reduce the
number of program simulations required to evaluate
program performance for each distinct phase ordering.
The techniques described are applicable to other
compilers in which it is desirable to find the best
phase ordering for most functions in a reasonable
amount of time. We also describe some interesting
properties of the optimization phase order space, which
will prove useful for further studies of related
problems in compilers.",
acknowledgement = ack-nhfb,
articleno = "1",
keywords = "exhaustive search; iterative compilation; Phase
ordering",
}
@Article{Hohenauer:2009:SOF,
author = "Manuel Hohenauer and Felix Engel and Rainer Leupers
and Gerd Ascheid and Heinrich Meyr",
title = "A {SIMD} optimization framework for retargetable
compilers",
journal = j-TACO,
volume = "6",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1509864.1509866",
ISSN = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Retargetable C compilers are currently widely used to
quickly obtain compiler support for new embedded
processors and to perform early processor architecture
exploration. A partially inherent problem of the
retargetable compilation approach, though, is the
limited code quality as compared to hand-written
compilers or assembly code due to the lack of dedicated
optimizations techniques. This problem can be
circumvented by designing flexible, retargetable code
optimization techniques that apply to a certain range
of target architectures. This article focuses on target
machines with SIMD instruction support, a common
feature in embedded processors for multimedia
applications. However, SIMD optimization is known to be
a difficult task since SIMD architectures are largely
nonuniform, support only a limited set of data types
and impose several memory alignment constraints.
Additionally, such techniques require complicated loop
transformations, which are tailored to the SIMD
architecture in order to exhibit the necessary amount
of parallelism in the code. Thus, integrating the SIMD
optimization {\em and\/} the required loop
transformations together in a single retargeting
formalism is an ambitious challenge. In this article,
we present an efficient and quickly retargetable SIMD
code optimization framework that is integrated into an
industrial retargetable C compiler. Experimental
results for different processors demonstrate that the
proposed technique applies to real-life target machines
and that it produces code quality improvements close to
the theoretical limit.",
acknowledgement = ack-nhfb,
articleno = "2",
keywords = "ASIP; retargetable compilers; SIMD; subword
parallelism; vectorization",
}
@Article{Eyerman:2009:MLP,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Memory-level parallelism aware fetch policies for
simultaneous multithreading processors",
journal = j-TACO,
volume = "6",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1509864.1509867",
ISSN = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "A thread executing on a simultaneous multithreading
(SMT) processor that experiences a long-latency load
will eventually stall while holding execution
resources. Existing long-latency load aware SMT fetch
policies limit the amount of resources allocated by a
stalled thread by identifying long-latency loads and
preventing the thread from fetching more instructions
--- and in some implementations, instructions beyond
the long-latency load are flushed to release allocated
resources.\par
This article proposes an SMT fetch policy that takes
into account the available memory-level parallelism
(MLP) in a thread. The key idea proposed in this
article is that in case of an isolated long-latency
load (i.e., there is no MLP), the thread should be
prevented from allocating additional resources.
However, in case multiple independent long-latency
loads overlap (i.e., there is MLP), the thread should
allocate as many resources as needed in order to fully
expose the available MLP. MLP-aware fetch policies
achieve better performance for MLP-intensive threads on
SMT processors, leading to higher overall system
throughput and shorter average turnaround time than
previously proposed fetch policies.",
acknowledgement = ack-nhfb,
articleno = "3",
keywords = "Fetch Policy; Memory-Level Parallelism (MLP);
Simultaneous Multithreading (SMT)",
}
@Article{Strozek:2009:EAE,
author = "Lukasz Strozek and David Brooks",
title = "Energy- and area-efficient architectures through
application clustering and architectural
heterogeneity",
journal = j-TACO,
volume = "6",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1509864.1509868",
ISSN = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Customizing architectures for particular applications
is a promising approach to yield highly
energy-efficient designs for embedded systems. This
work explores the benefits of architectural
customization for a class of embedded architectures
typically used in energy- and area-constrained
application domains, such as sensor nodes and
multimedia processing. We implement a process flow that
performs an automatic synthesis and evaluation of the
different architectures based on runtime profiles of
applications and determines an efficient architecture,
with consideration for both energy and area
constraints. An expressive architectural model, used by
our engine, is introduced that takes advantage of
efficient opcode allocation, several memory addressing
modes, and operand types. By profiling embedded
benchmarks from a variety of sensor and multimedia
applications, we show that the energy savings resulting
from various architectural optimizations relative to
the base architectures (e.g., MIPS and MSP430) are
significant and can reach 50\%, depending on the
application. We then identify the set of architectures
that achieves near-optimal savings for a group of
applications. Finally, we propose the use of
heterogeneous ISA processors implementing those
architectures as a solution to capitalize on energy
savings provided by application customization while
executing a range of applications efficiently.",
acknowledgement = ack-nhfb,
articleno = "4",
keywords = "Efficient custom architectures; heterogeneous ISA
processors",
}
@Article{Venkataramani:2009:MAM,
author = "Guru Venkataramani and Ioannis Doudalis and Yan
Solihin and Milos Prvulovic",
title = "{MemTracker}: {An} accelerator for memory debugging
and monitoring",
journal = j-TACO,
volume = "6",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1543753.1543754",
ISSN = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Memory bugs are a broad class of bugs that is becoming
increasingly common with increasing software
complexity, and many of these bugs are also security
vulnerabilities. Existing software and hardware
approaches for finding and identifying memory bugs have
a number of drawbacks including considerable
performance overheads, target only a specific type of
bug, implementation cost, and inefficient use of
computational resources.\par
This article describes MemTracker, a new hardware
support mechanism that can be configured to perform
different kinds of memory access monitoring tasks.
MemTracker associates each word of data in memory with
a few bits of state, and uses a programmable state
transition table to react to different events that can
affect this state. The number of state bits per word,
the events to which MemTracker reacts, and the
transition table are all fully programmable.
MemTracker's rich set of states, events, and
transitions can be used to implement different
monitoring and debugging checkers with minimal
performance overheads, even when frequent state updates
are needed. To evaluate MemTracker, we map three
different checkers onto it, as well as a checker that
combines all three. For the most demanding (combined)
checker with 8 bits state per memory word, we observe
performance overheads of only around 3\%, on average,
and 14.5\% worst-case across different benchmark
suites. Such low overheads allow continuous (always-on)
use of MemTracker-enabled checkers, even in production
runs.",
acknowledgement = ack-nhfb,
articleno = "5",
keywords = "Accelerator; debugging; memory access monitoring",
}
@Article{Gabor:2009:SLA,
author = "Ron Gabor and Avi Mendelson and Shlomo Weiss",
title = "Service level agreement for multithreaded processors",
journal = j-TACO,
volume = "6",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1543753.1543755",
ISSN = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Multithreading is widely used to increase processor
throughput. As the number of shared resources increase,
managing them while guaranteeing predicted performance
becomes a major problem. Attempts have been made in
previous work to ease this via different fairness
mechanisms. In this article, we present a new approach
to control the resource allocation and sharing via a
service level agreement (SLA)-based mechanism; that is,
via an agreement in which multithreaded processors
guarantee a minimal level of service to the running
threads. We introduce a new metric, {\em C\/}$_{SLA}$,
for conformance to SLA in multithreaded processors and
show that controlling resources using with SLA allows
for higher gains than are achievable by previously
suggested fairness techniques. It also permits
improving one metric (e.g., power) while maintaining
SLA in another (e.g., performance). We compare SLA
enforcement to schemes based on other fairness metrics,
which are mostly targeted at equalizing execution
parameters. We show that using SLA rather than fairness
based algorithms provides a range of acceptable
execution points from which we can select the point
that best fits our optimization target, such as
maximizing the weighted speedup (sum of the speedups of
the individual threads) or reducing power. We
demonstrate the effectiveness of the new SLA approach
using switch-on-event (coarse-grained) multithreading.
Our weighted speedup improvement scheme successfully
enforces SLA while improving the weighted speedup by an
average of 10\% for unbalanced threads. This result is
significant when compared with performance losses that
may be incurred by fairness enforcement methods. When
optimizing for power reduction in unbalanced threads
SLA enforcement reduces the power by an average of
15\%. SLA may be complemented by other power reduction
methods to achieve further power savings {\em and\/}
maintain the same service level for the threads. We
also demonstrate differentiated SLA, where weighted
speedup is maximized while each thread may have a
different throughput constraint.",
acknowledgement = ack-nhfb,
articleno = "6",
keywords = "fairness; performance; power; Service level agreement;
throughput",
}
@Article{Fung:2009:DWF,
author = "Wilson W. L. Fung and Ivan Sham and George Yuan and
Tor M. Aamodt",
title = "Dynamic warp formation: {Efficient MIMD} control flow
on {SIMD} graphics hardware",
journal = j-TACO,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1543753.1543756",
ISSN = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Recent advances in graphics processing units (GPUs)
have resulted in massively parallel hardware that is
easily programmable and widely available in today's
desktop and notebook computer systems. GPUs typically
use single-instruction, multiple-data (SIMD) pipelines
to achieve high performance with minimal overhead for
control hardware. Scalar threads running the same
computing kernel are grouped together into SIMD
batches, sometimes referred to as warps. While SIMD is
ideally suited for simple programs, recent GPUs include
control flow instructions in the GPU instruction set
architecture and programs using these instructions may
experience reduced performance due to the way branch
execution is supported in hardware. One solution is to
add a stack to allow different SIMD processing elements
to execute distinct program paths after a branch
instruction. The occurrence of diverging branch
outcomes for different processing elements
significantly degrades performance using this approach.
In this article, we propose dynamic warp formation and
scheduling, a mechanism for more efficient SIMD branch
execution on GPUs. It dynamically regroups threads into
new warps on the fly following the occurrence of
diverging branch outcomes. We show that a realistic
hardware implementation of this mechanism improves
performance by 13\%, on average, with 256 threads per
core, 24\% with 512 threads, and 47\% with 768 threads
for an estimated area increase of 8\%.",
acknowledgement = ack-nhfb,
articleno = "7",
keywords = "control flow; fine-grained multithreading; GPU; SIMD",
}
@Article{Koh:2009:TPV,
author = "Cheng-Kok Koh and Weng-Fai Wong and Yiran Chen and Hai
Li",
title = "Tolerating process variations in large,
set-associative caches: {The} buddy cache",
journal = j-TACO,
volume = "6",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1543753.1543757",
ISSN = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "One important trend in today's microprocessor
architectures is the increase in size of the processor
caches. These caches also tend to be set associative.
As technology scales, process variations are expected
to increase the fault rates of the SRAM cells that
compose such caches. As an important component of the
processor, the parametric yield of SRAM cells is
crucial to the overall performance and yield of the
microchip. In this article, we propose a
microarchitectural solution, called the buddy cache
that permits large, set-associative caches to tolerate
faults in SRAM cells due to process variations. In
essence, instead of disabling a faulty cache block in a
set (as is the current practice), it is paired with
another faulty cache block in the same set --- the
buddy. Although both cache blocks are faulty, if the
faults of the two blocks do not overlap, then instead
of losing two blocks, buddying will yield a functional
block from the nonfaulty portions of the two blocks. We
found that with buddying, caches can better mitigate
the negative impacts of process variations on
performance and yield, gracefully downgrading
performance as opposed to catastrophic failure. We will
describe the details of the buddy cache and give
insights as to why it is both more performance and
yield resilient to faults.",
acknowledgement = ack-nhfb,
articleno = "8",
keywords = "caches; fault recovery; memory structures; Processor
architectures",
}
@Article{Li:2009:CDS,
author = "Lian Li and Hui Feng and Jingling Xue",
title = "Compiler-directed scratchpad memory management via
graph coloring",
journal = j-TACO,
volume = "6",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1582710.1582711",
ISSN = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Scratchpad memory (SPM), a fast on-chip SRAM managed
by software, is widely used in embedded systems. This
article introduces a general-purpose compiler approach,
called memory coloring, to assign static data
aggregates, such as arrays and structs, in a program to
an SPM. The novelty of this approach lies in
partitioning the SPM into a pseudo--register file (with
interchangeable and aliased registers), splitting the
live ranges of data aggregates to create potential data
transfer statements between SPM and off-chip memory,
and finally, adapting an existing graph coloring
algorithm for register allocation to assign the data
aggregates to the pseudo--register file. Our
experimental results using a set of 10 C benchmarks
from MediaBench and MiBench show that our methodology
is capable of managing SPMs efficiently and effectively
for large embedded applications. In addition, our SPM
allocator can obtain close to optimal solutions when
evaluated and compared against an existing
heuristics-based SPM allocator and an ILP-based SPM
allocator.",
acknowledgement = ack-nhfb,
articleno = "9",
keywords = "graph coloring; live range splitting; memory
allocation; memory coloring; register coalescing;
Scratchpad memory; software-managed cache",
}
@Article{Golander:2009:CAR,
author = "Amit Golander and Shlomo Weiss",
title = "Checkpoint allocation and release",
journal = j-TACO,
volume = "6",
number = "3",
pages = "10:1--10:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1582710.1582712",
ISSN = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Out-of-order speculative processors need a bookkeeping
method to recover from incorrect speculation. In recent
years, several microarchitectures that employ
checkpoints have been proposed, either extending the
reorder buffer or entirely replacing it. This work
presents an in-dept-study of checkpointing in
checkpoint-based microarchitectures, from the desired
content of a checkpoint, via implementation trade-offs,
and to checkpoint allocation and release policies. A
major contribution of the article is a novel adaptive
checkpoint allocation policy that outperforms known
policies. The adaptive policy controls checkpoint
allocation according to dynamic events, such as
second-level cache misses and rollback history. It
achieves 6.8\% and 2.2\% speedup for the integer and
floating point benchmarks, respectively, and does not
require a branch confidence estimator. The results show
that the proposed adaptive policy achieves most of the
potential of an oracle policy whose performance
improvement is 9.8\% and 3.9\% for the integer and
floating point benchmarks, respectively. We exploit
known techniques for saving leakage power by adapting
and applying them to checkpoint-based
microarchitectures. The proposed applications combine
to reduce the leakage power of the register file to
about one half of its original value.",
acknowledgement = ack-nhfb,
articleno = "10",
keywords = "Checkpoint; early register release; leakage;
misprediction; out-of-order execution; rollback",
}
@Article{Xu:2009:TXP,
author = "Weifeng Xu and Russell Tessier",
title = "{Tetris-XL}: {A} performance-driven spill reduction
technique for embedded {VLIW} processors",
journal = j-TACO,
volume = "6",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1582710.1582713",
ISSN = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "As technology has advanced, the application space of
Very Long Instruction Word (VLIW) processors has grown
to include a variety of embedded platforms. Due to cost
and power consumption constraints, many embedded VLIW
processors contain limited resources, including
registers. As a result, a VLIW compiler that maximizes
instruction level parallelism (ILP) without considering
register constraints may generate excessive register
spills, leading to reduced overall system performance.
To address this issue, this article presents a new
spill reduction technique that improves VLIW runtime
performance by reordering operations prior to register
allocation and instruction scheduling. Unlike earlier
algorithms, our approach explicitly considers both
register reduction and data dependency in performing
operation reordering. Data dependency control limits
unexpected schedule length increases during subsequent
instruction scheduling. Our technique has been
evaluated using Trimaran, an academic VLIW compiler,
and evaluated using a set of embedded systems
benchmarks. Experimental results show that, on average,
this technique improves VLIW performance by 10\% for
VLIW processors with 32 registers and 8 functional
units compared with previous spill reduction
techniques. Limited improvement is seen versus prior
approaches for VLIW processors with 64 registers and 8
functional units.",
acknowledgement = ack-nhfb,
articleno = "11",
keywords = "instruction level parallelism; Register pressure; Very
Long Instruction Word (VLIW) processor",
}
@Article{Jones:2009:ELE,
author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
title = "Exploring the limits of early register release:
{Exploiting} compiler analysis",
journal = j-TACO,
volume = "6",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "http://doi.acm.org/10.1145/1582710.1582714",
ISSN = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "http://portal.acm.org/",
abstract = "Register pressure in modern superscalar processors can
be reduced by releasing registers early and by copying
their contents to cheap back-up storage. This article
quantifies the potential benefits of register occupancy
reduction and shows that existing hardware-based
schemes typically achieve only a small fraction of this
potential. This is because they are unable to
accurately determine the last use of a register and
must wait until the redefining instruction enters the
pipeline. On the other hand, compilers have a global
view of the program and, using simple dataflow
analysis, can determine the last use. This article
evaluates the extent to which compiler analysis can aid
early releasing, explores the design space, and
introduces commit and issue-based early releasing
schemes, quantifying their benefits. Using simple
compiler analysis and microarchitecture changes, we
achieve 70\% of the potential register file occupancy
reduction. By adding more hardware support, we can
increase this to 94\%. Our schemes are compared to
state-of-the-art approaches for varying register file
sizes and are shown to outperform these existing
techniques.",
acknowledgement = ack-nhfb,
articleno = "12",
keywords = "compiler; energy efficiency; Low-power design;
microarchitecture; register file",
}