@Preamble{"\input bibnames.sty" #
"\def \TM {${}^{\sc TM}$}"
}
@String{ack-nhfb = "Nelson H. F. Beebe,
University of Utah,
Department of Mathematics, 110 LCB,
155 S 1400 E RM 233,
Salt Lake City, UT 84112-0090, USA,
Tel: +1 801 581 5254,
FAX: +1 801 581 4148,
e-mail: \path|beebe@math.utah.edu|,
\path|beebe@acm.org|,
\path|beebe@computer.org| (Internet),
URL: \path|https://www.math.utah.edu/~beebe/|"}
@String{j-TACO = "ACM Transactions on Architecture and
Code Optimization"}
@Article{Calder:2004:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "1",
number = "1",
pages = "1--2",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2004:RIC,
author = "W. Zhang and J. S. Hu and V. Degalahal and M. Kandemir
and N. Vijaykrishnan and M. J. Irwin",
title = "Reducing instruction cache energy consumption using a
compiler-based strategy",
journal = j-TACO,
volume = "1",
number = "1",
pages = "3--33",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Isailovic:2004:DCQ,
author = "Nemanja Isailovic and Mark Whitney and Yatish Patel
and John Kubiatowicz and Dean Copsey and Frederic T.
Chong and Isaac L. Chuang and Mark Oskin",
title = "Datapath and control for quantum wires",
journal = j-TACO,
volume = "1",
number = "1",
pages = "34--61",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sankaralingam:2004:TPA,
author = "Karthikeyan Sankaralingam and Ramadass Nagarajan and
Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya
Ranganathan and Doug Burger and Stephen W. Keckler and
Robert G. McDonald and Charles R. Moore",
title = "{TRIPS}: a polymorphous architecture for exploiting
{ILP}, {TLP}, and {DLP}",
journal = j-TACO,
volume = "1",
number = "1",
pages = "62--93",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Skadron:2004:TAM,
author = "Kevin Skadron and Mircea R. Stan and Karthik
Sankaranarayanan and Wei Huang and Sivakumar Velusamy
and David Tarjan",
title = "Temperature-aware microarchitecture: {Modeling} and
implementation",
journal = j-TACO,
volume = "1",
number = "1",
pages = "94--125",
month = mar,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:09 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Aleta:2004:RCC,
author = "Alex Alet{\`a} and Josep M. Codina and Antonio
Gonz{\'a}lez and David Kaeli",
title = "Removing communications in clustered
microarchitectures through instruction replication",
journal = j-TACO,
volume = "1",
number = "2",
pages = "127--151",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bai:2004:LPO,
author = "Yu Bai and R. Iris Bahar",
title = "A low-power in-order\slash out-of-order issue queue",
journal = j-TACO,
volume = "1",
number = "2",
pages = "152--179",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Juang:2004:IBP,
author = "Philo Juang and Kevin Skadron and Margaret Martonosi
and Zhigang Hu and Douglas W. Clark and Philip W.
Diodato and Stefanos Kaxiras",
title = "Implementing branch-predictor decay using quasi-static
memory cells",
journal = j-TACO,
volume = "1",
number = "2",
pages = "180--219",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Santana:2004:LCF,
author = "Oliverio J. Santana and Alex Ramirez and Josep L.
Larriba-Pey and Mateo Valero",
title = "A low-complexity fetch architecture for
high-performance superscalar processors",
journal = j-TACO,
volume = "1",
number = "2",
pages = "220--245",
month = jun,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 5 07:08:10 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2004:CFS,
author = "Jin Lin and Tong Chen and Wei-Chung Hsu and Pen-Chung
Yew and Roy Dz-Ching Ju and Tin-Fook Ngai and Sun
Chan",
title = "A compiler framework for speculative optimizations",
journal = j-TACO,
volume = "1",
number = "3",
pages = "247--271",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fields:2004:ICS,
author = "Brian A. Fields and Rastislav Bodik and Mark D. Hill
and Chris J. Newburn",
title = "Interaction cost and shotgun profiling",
journal = j-TACO,
volume = "1",
number = "3",
pages = "272--304",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sankaranarayanan:2004:PBA,
author = "Karthik Sankaranarayanan and Kevin Skadron",
title = "Profile-based adaptation for cache decay",
journal = j-TACO,
volume = "1",
number = "3",
pages = "305--322",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xie:2004:IDV,
author = "Fen Xie and Margaret Martonosi and Sharad Malik",
title = "Intraprogram dynamic voltage scaling: {Bounding}
opportunities with analytic modeling",
journal = j-TACO,
volume = "1",
number = "3",
pages = "323--367",
month = sep,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Oct 29 06:39:45 MDT 2004",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hartstein:2004:OPD,
author = "A. Hartstein and Thomas R. Puzak",
title = "The optimum pipeline depth considering both power and
performance",
journal = j-TACO,
volume = "1",
number = "4",
pages = "369--388",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cristal:2004:TKI,
author = "Adri{\'a}n Cristal and Oliverio J. Santana and Mateo
Valero and Jos{\'e} F. Mart{\'\i}nez",
title = "Toward kilo-instruction processors",
journal = j-TACO,
volume = "1",
number = "4",
pages = "389--417",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Akkary:2004:ARE,
author = "Haitham Akkary and Ravi Rajwar and Srikanth T.
Srinivasan",
title = "An analysis of a resource efficient checkpoint
architecture",
journal = j-TACO,
volume = "1",
number = "4",
pages = "418--444",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yang:2004:TML,
author = "Chia-Lin Yang and Alvin R. Lebeck and Hung-Wei Tseng
and Chien-Hao Lee",
title = "Tolerating memory latency through push prefetching for
pointer-intensive applications",
journal = j-TACO,
volume = "1",
number = "4",
pages = "445--475",
month = dec,
year = "2004",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Apr 14 12:17:47 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Calder:2005:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "2",
number = "1",
pages = "1--2",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2005:EFA,
author = "Yuanyuan Zhou and Pin Zhou and Feng Qin and Wei Liu
and Josep Torrellas",
title = "Efficient and flexible architectural support for
dynamic monitoring",
journal = j-TACO,
volume = "2",
number = "1",
pages = "3--33",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2005:WHC,
author = "Chuanjun Zhang and Frank Vahid and Jun Yang and Walid
Najjar",
title = "A way-halting cache for low-energy high-performance
systems",
journal = j-TACO,
volume = "2",
number = "1",
pages = "34--54",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abella:2005:ISP,
author = "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera
and Michael F. P. O'Boyle",
title = "{IATAC}: a smart predictor to turn-off {L2} cache
lines",
journal = j-TACO,
volume = "2",
number = "1",
pages = "55--77",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Haskins:2005:AWS,
author = "John W. {Haskins, Jr.} and Kevin Skadron",
title = "Accelerated warmup for sampled microarchitecture
simulation",
journal = j-TACO,
volume = "2",
number = "1",
pages = "78--108",
month = mar,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon May 2 11:13:58 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2005:ABT,
author = "Tao Li and Ravi Bhargava and Lizy Kurian John",
title = "Adapting branch-target buffer to improve the target
predictability of {Java} code",
journal = j-TACO,
volume = "2",
number = "2",
pages = "109--130",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2005:DIE,
author = "Lingli Zhang and Chandra Krintz",
title = "The design, implementation, and evaluation of adaptive
code unloading for resource-constrained devices",
journal = j-TACO,
volume = "2",
number = "2",
pages = "131--164",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kulkarni:2005:FES,
author = "Prasad A. Kulkarni and Stephen R. Hines and David B.
Whalley and Jason D. Hiser and Jack W. Davidson and
Douglas L. Jones",
title = "Fast and efficient searches for effective
optimization-phase sequences",
journal = j-TACO,
volume = "2",
number = "2",
pages = "165--198",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Salami:2005:DMI,
author = "Esther Salam{\'\i} and Mateo Valero",
title = "Dynamic memory interval test vs. interprocedural
pointer analysis in multimedia applications",
journal = j-TACO,
volume = "2",
number = "2",
pages = "199--219",
month = jun,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 7 14:09:53 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Meng:2005:ELL,
author = "Yan Meng and Timothy Sherwood and Ryan Kastner",
title = "Exploring the limits of leakage power reduction in
caches",
journal = j-TACO,
volume = "2",
number = "3",
pages = "221--246",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Garzaran:2005:TBS,
author = "Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Milos Prvulovic
and Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and V{\'\i}ctor
Vi{\~n}als and Lawrence Rauchwerger and Josep
Torrellas",
title = "Tradeoffs in buffering speculative memory state for
thread-level speculation in multiprocessors",
journal = j-TACO,
volume = "2",
number = "3",
pages = "247--279",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tarjan:2005:MPG,
author = "David Tarjan and Kevin Skadron",
title = "Merging path and gshare indexing in perceptron branch
prediction",
journal = j-TACO,
volume = "2",
number = "3",
pages = "280--300",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2005:WET,
author = "Xiangyu Zhang and Rajiv Gupta",
title = "Whole execution traces and their applications",
journal = j-TACO,
volume = "2",
number = "3",
pages = "301--334",
month = sep,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Oct 5 07:42:22 MDT 2005",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2005:IWA,
author = "Wankang Zhao and David Whalley and Christopher Healy
and Frank Mueller",
title = "Improving {WCET} by applying a {WC} code-positioning
optimization",
journal = j-TACO,
volume = "2",
number = "4",
pages = "335--365",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "WC (worst case); WCET (worst case execution time)",
}
@Article{Reis:2005:SCF,
author = "George A. Reis and Jonathan Chang and Neil
Vachharajani and Ram Rangan and David I. August and
Shubhendu S. Mukherjee",
title = "Software-controlled fault tolerance",
journal = j-TACO,
volume = "2",
number = "4",
pages = "366--396",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2005:PPC,
author = "Jian Li and Jos{\'e} F. Mart{\'\i}nez",
title = "Power-performance considerations of parallel computing
on chip multiprocessors",
journal = j-TACO,
volume = "2",
number = "4",
pages = "397--422",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sharma:2005:SPE,
author = "Saurabh Sharma and Jesse G. Beu and Thomas M. Conte",
title = "Spectral prefetcher: {An} effective mechanism for {L2}
cache prefetching",
journal = j-TACO,
volume = "2",
number = "4",
pages = "423--450",
month = dec,
year = "2005",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Feb 16 11:03:13 MST 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Calder:2006:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "3",
number = "1",
pages = "1--2",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tan:2006:BSS,
author = "Lin Tan and Brett Brotherton and Timothy Sherwood",
title = "Bit-split string-matching engines for intrusion
detection and prevention",
journal = j-TACO,
volume = "3",
number = "1",
pages = "3--34",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nagpurkar:2006:ERP,
author = "Priya Nagpurkar and Hussam Mousa and Chandra Krintz
and Timothy Sherwood",
title = "Efficient remote profiling for resource-constrained
devices",
journal = j-TACO,
volume = "3",
number = "1",
pages = "35--66",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lin:2006:RCG,
author = "Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and Roy
Dz-Ching Ju and Tin-Fook Ngai",
title = "Recovery code generation for general speculative
optimizations",
journal = j-TACO,
volume = "3",
number = "1",
pages = "67--89",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Choi:2006:ORR,
author = "Yoonseo Choi and Hwansoo Han",
title = "Optimal register reassignment for register stack
overflow minimization",
journal = j-TACO,
volume = "3",
number = "1",
pages = "90--114",
month = mar,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 18 08:38:26 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xue:2006:LOA,
author = "Jingling Xue and Qiong Cai",
title = "A lifetime optimal algorithm for speculative {PRE}",
journal = j-TACO,
volume = "3",
number = "2",
pages = "115--155",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sharkey:2006:IPT,
author = "Joseph J. Sharkey and Dmitry V. Ponomarev and Kanad
Ghose and Oguz Ergin",
title = "Instruction packing: {Toward} fast and
energy-efficient instruction scheduling",
journal = j-TACO,
volume = "3",
number = "2",
pages = "156--181",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ceze:2006:CUC,
author = "Luis Ceze and Karin Strauss and James Tuck and Josep
Torrellas and Jose Renau",
title = "{CAVA}: {Using} checkpoint-assisted value prediction
to hide {L2} misses",
journal = j-TACO,
volume = "3",
number = "2",
pages = "182--208",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2006:EAR,
author = "Lixin Zhang and Mike Parker and John Carter",
title = "Efficient address remapping in distributed
shared-memory systems",
journal = j-TACO,
volume = "3",
number = "2",
pages = "209--229",
month = jun,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 9 06:47:22 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2006:ATP,
author = "Min Zhao and Bruce R. Childers and Mary Lou Soffa",
title = "An approach toward profit-driven optimization",
journal = j-TACO,
volume = "3",
number = "3",
pages = "231--262",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162691",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Although optimizations have been applied for a number
of years to improve the performance of software,
problems with respect to the application of
optimizations have not been adequately addressed. For
example, in certain circumstances, optimizations may
degrade performance. However, there is no efficient way
to know when a degradation will occur. In this
research, we investigate the profitability of
optimizations, which is useful for determining the
benefit of applying optimizations. We develop a
framework that enables us to predict profitability
using analytic models. The profitability of an
optimization depends on code context, the particular
optimization, and machine resources. Thus, our
framework has analytic models for each of these
components. As part of the framework, there is also a
profitability engine that uses models to predict the
profit. In this paper, we target scalar optimizations
and, in particular, describe the models for partial
redundancy elimination (PRE), loop invariant code
motion (LICM), and value numbering (VN). We implemented
the framework for predicting the profitability of these
optimizations. Based on the predictions, we can
selectively apply profitable optimizations. We compared
the profit-driven approach with an approach that uses a
heuristic in deciding when optimizations should be
applied. Our experiments demonstrate that the
profitability of scalar optimizations can be accurately
predicted by using models. That is, without actually
applying a scalar optimization, we can determine if an
optimization is beneficial and should be applied.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hazelwood:2006:MBC,
author = "Kim Hazelwood and Michael D. Smith",
title = "Managing bounded code caches in dynamic binary
optimization systems",
journal = j-TACO,
volume = "3",
number = "3",
pages = "263--294",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162692",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic binary optimizers store altered copies of
original program instructions in software-managed code
caches in order to maximize reuse of transformed code.
Code caches store code blocks that may vary in size,
reference other code blocks, and carry a high
replacement overhead. These unique constraints reduce
the effectiveness of conventional cache management
policies. Our work directly addresses these unique
constraints and presents several contributions to the
code-cache management problem. First, we show that
evicting more than the minimum number of code blocks
from the code cache results in less run-time overhead
than the existing alternatives. Such granular evictions
reduce overall execution time, as the fixed costs of
invoking the eviction mechanism are amortized across
multiple cache insertions. Second, a study of the ideal
lifetimes of dynamically generated code blocks
illustrates the benefit of a replacement algorithm
based on a generational heuristic. We describe and
evaluate a generational approach to code cache
management that makes it easy to identify long-lived
code blocks and simultaneously avoid any fragmentation
because of the eviction of short-lived blocks. Finally,
we present results from an implementation of our
generational approach in the DynamoRIO framework and
illustrate that, as dynamic optimization systems become
more prevalent, effective code cache-management
policies will be essential for reliable, scalable
performance of modern applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
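%%% The generational eviction idea in the entry above can be summarized
%%% in a short sketch. The C fragment below was written for this
%%% bibliography and is not code from the paper or from DynamoRIO; the
%%% names (codecache_t, NURSERY_SIZE, cache_insert) are hypothetical.
%%% New translations enter a "nursery" region; when it fills, the whole
%%% nursery is evicted at once, amortizing the fixed eviction cost,
%%% while long-lived blocks would be promoted to a separate persistent
%%% region and thus escape fragmentation.
%%%
%%%   #include <stddef.h>
%%%   #include <string.h>
%%%
%%%   #define NURSERY_SIZE (64 * 1024)
%%%
%%%   typedef struct {
%%%       unsigned char nursery[NURSERY_SIZE];
%%%       size_t top;              /* bump pointer into the nursery */
%%%   } codecache_t;
%%%
%%%   /* Insert a translated block; on overflow, evict the entire
%%%      nursery in one step rather than one block at a time. */
%%%   static void *cache_insert(codecache_t *c, const void *code,
%%%                             size_t len)
%%%   {
%%%       if (len > NURSERY_SIZE)
%%%           return NULL;         /* block cannot fit at all */
%%%       if (c->top + len > NURSERY_SIZE)
%%%           c->top = 0;          /* bulk (granular) eviction */
%%%       void *slot = &c->nursery[c->top];
%%%       memcpy(slot, code, len);
%%%       c->top += len;
%%%       return slot;
%%%   }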
@Article{Rochecouste:2006:CCE,
author = "Olivier Rochecouste and Gilles Pokam and Andr{\'e}
Seznec",
title = "A case for a complexity-effective, width-partitioned
microarchitecture",
journal = j-TACO,
volume = "3",
number = "3",
pages = "295--326",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162693",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The analysis of program executions reveals that most
integer and multimedia applications make heavy use of
narrow-width operations, i.e., instructions exclusively
using narrow-width operands and producing a
narrow-width result. Moreover, this usage is relatively
well distributed over the application. We observed this
program property on the MediaBench and SPEC2000
benchmarks with about 40\% of the instructions being
narrow-width operations. Current superscalar processors
use 64-bit datapaths to execute all the instructions of
the applications. In this paper, we suggest the use of
a width-partitioned microarchitecture (WPM) to master
the hardware complexity of a superscalar processor. For
a four-way issue machine, we split the processor into two
two-way clusters: the main cluster executing 64-bit
operations, load/store, and complex operations and a
narrow cluster executing the 16-bit operations. We
resort to partitioning to decouple the treatment of the
narrow-width operations from that of the other program
instructions. This provides the benefit of greatly
simplifying the design of the critical processor
components in each cluster (e.g., the register file and
the bypass network). The dynamic interleaving of the
two instruction types keeps the workload balanced
among clusters. WPM also helps to reduce the
complexity of the interconnection fabric and of the
issue logic. In fact, since the 16-bit cluster can only
communicate narrow-width data, the datapath-width of
the interconnect fabric can be significantly reduced,
yielding a corresponding saving of the interconnect
power and area. We explore different possible
configurations of WPM, discussing the various
implementation tradeoffs. We also examine a speculative
steering heuristic to distribute the narrow-width
operations among clusters. A detailed analysis of the
complexity factors shows that using WPM instead of a
classical 64-bit two-cluster microarchitecture can save
power and silicon area with a minimal impact on the
overall performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
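%%% As a small aside on the steering heuristic mentioned in the entry
%%% above: the C helper below was written for this bibliography (it is
%%% not taken from the paper) and shows the kind of test a steering
%%% heuristic would apply to decide whether an operand can be routed to
%%% the 16-bit narrow cluster.
%%%
%%%   #include <stdbool.h>
%%%   #include <stdint.h>
%%%
%%%   /* True if v is representable as a sign-extended 16-bit value,
%%%      i.e., the operand is "narrow width". */
%%%   static bool is_narrow16(int64_t v)
%%%   {
%%%       return v == (int64_t)(int16_t)v;
%%%   }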
@Article{Zmily:2006:BAI,
author = "Ahmad Zmily and Christos Kozyrakis",
title = "Block-aware instruction set architecture",
journal = j-TACO,
volume = "3",
number = "3",
pages = "327--357",
month = sep,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1162690.1162694",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Sep 23 07:54:36 MDT 2006",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Instruction delivery is a critical component for
wide-issue, high-frequency processors since its
bandwidth and accuracy place an upper limit on
performance. The processor front-end accuracy and
bandwidth are limited by instruction-cache misses,
multicycle instruction-cache accesses, and target or
direction mispredictions for control-flow operations.
This paper presents a block-aware instruction set
(BLISS) that allows software to assist with front-end
challenges. BLISS defines basic block descriptors that
are stored separately from the actual instructions in a
program. We show that BLISS allows for a decoupled
front-end that tolerates instruction-cache latency,
facilitates instruction prefetching, and leads to
higher prediction accuracy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Crandall:2006:MAS,
author = "Jedidiah R. Crandall and S. Felix Wu and Frederic T.
Chong",
title = "{Minos}: {Architectural} support for protecting
control data",
journal = j-TACO,
volume = "3",
number = "4",
pages = "359--389",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Marathe:2006:ACC,
author = "Jaydeep Marathe and Frank Mueller and Bronis R. de
Supinski",
title = "Analysis of cache-coherence bottlenecks with hybrid
hardware\slash software techniques",
journal = j-TACO,
volume = "3",
number = "4",
pages = "390--423",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ganusov:2006:FEP,
author = "Ilya Ganusov and Martin Burtscher",
title = "Future execution: a prefetching mechanism that uses
multiple cores to speed up single threads",
journal = j-TACO,
volume = "3",
number = "4",
pages = "424--449",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Co:2006:ETC,
author = "Michele Co and Dee A. B. Weikle and Kevin Skadron",
title = "Evaluating trace cache energy efficiency",
journal = j-TACO,
volume = "3",
number = "4",
pages = "450--476",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hu:2006:EMM,
author = "Shiwen Hu and Madhavi Valluri and Lizy Kurian John",
title = "Effective management of multiple configurable units
using dynamic optimization",
journal = j-TACO,
volume = "3",
number = "4",
pages = "477--501",
month = dec,
year = "2006",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bentley:2006:IAB,
author = "Chris Bentley and Scott A. Watterson and David K.
Lowenthal and Barry Rountree",
title = "Implicit array bounds checking on 64-bit
architectures",
journal = j-TACO,
volume = "3",
number = "4",
pages = "502--527",
month = dec,
year = "2006",
CODEN = "????",
DOI = "https://doi.org/10.1145/1187976.1187982",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Several programming languages guarantee that array
subscripts are checked to ensure they are within the
bounds of the array. While this guarantee improves the
correctness and security of array-based code, it adds
overhead to array references. This has been an obstacle
to using higher-level languages, such as Java, for
high-performance parallel computing, where the language
specification requires that all array accesses must be
checked to ensure they are within bounds. This is
because, in practice, array-bounds checking in
scientific applications may increase execution time by
more than a factor of 2. Previous research has explored
optimizations to statically eliminate bounds checks,
but the dynamic nature of many scientific codes makes
this difficult or impossible. Our approach is, instead,
to create a compiler and operating system
infrastructure that does not generate explicit bounds
checks. It instead places arrays inside of Index
Confinement Regions (ICRs), which are large, isolated,
mostly unmapped virtual memory regions. Any array
reference outside of its bounds will cause a protection
violation; this provides implicit bounds checking. Our
results show that when applying this infrastructure to
high-performance computing programs written in Java,
the overhead of bounds checking relative to a program
with no bounds checks is reduced from an average of
63\% to an average of 9\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
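%%% The Index Confinement Region idea in the entry above lends itself
%%% to a compact illustration. The sketch below was written for this
%%% bibliography and is not the paper's compiler/OS infrastructure;
%%% icr_alloc is a hypothetical name, it assumes POSIX/Linux mmap
%%% semantics, and in this simplified form only overruns past the
%%% mapped pages (not underruns) are caught.
%%%
%%%   #include <stddef.h>
%%%   #include <sys/mman.h>
%%%   #include <unistd.h>
%%%
%%%   /* Reserve a large PROT_NONE region and make only the array's
%%%      pages accessible; an out-of-bounds reference that lands on an
%%%      unmapped page faults, giving implicit bounds checking with no
%%%      per-access test. */
%%%   static double *icr_alloc(size_t nelems)
%%%   {
%%%       size_t page   = (size_t)sysconf(_SC_PAGESIZE);
%%%       size_t bytes  = nelems * sizeof(double);
%%%       size_t mapped = (bytes + page - 1) & ~(page - 1);
%%%       size_t region = mapped + 1024 * page;   /* guard area */
%%%       void *base = mmap(NULL, region, PROT_NONE,
%%%                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
%%%       if (base == MAP_FAILED)
%%%           return NULL;
%%%       if (mprotect(base, mapped, PROT_READ | PROT_WRITE) != 0)
%%%           return NULL;
%%%       return (double *)base;
%%%   }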
@Article{Calder:2007:I,
author = "Brad Calder and Dean Tullsen",
title = "Introduction",
journal = j-TACO,
volume = "4",
number = "1",
pages = "1:1--1:1",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Constantinides:2007:ARC,
author = "Kypros Constantinides and Stephen Plaza and Jason
Blome and Valeria Bertacco and Scott Mahlke and Todd
Austin and Bin Zhang and Michael Orshansky",
title = "Architecting a reliable {CMP} switch architecture",
journal = j-TACO,
volume = "4",
number = "1",
pages = "2:1--2:37",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sasanka:2007:AES,
author = "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and
Yen-Kuang Chen and Eric Debes",
title = "{ALP}: {Efficient} support for all levels of
parallelism for complex media applications",
journal = j-TACO,
volume = "4",
number = "1",
pages = "3:1--3:30",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2007:CNP,
author = "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan",
title = "Conserving network processor power consumption by
exploiting traffic variability",
journal = j-TACO,
volume = "4",
number = "1",
pages = "4:1--4:26",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Soteriou:2007:SDP,
author = "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh",
title = "Software-directed power-aware interconnection
networks",
journal = j-TACO,
volume = "4",
number = "1",
pages = "5:1--5:40",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hwang:2007:SSA,
author = "Yuan-Shin Hwang and Jia-Jhe Li",
title = "Snug set-associative caches: Reducing leakage power of
instruction and data caches with no performance
penalties",
journal = j-TACO,
volume = "4",
number = "1",
pages = "6:1--6:28",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rong:2007:SDS,
author = "Hongbo Rong and Zhizhong Tang and R. Govindarajan and
Alban Douillet and Guang R. Gao",
title = "Single-dimension software pipelining for
multidimensional loops",
journal = j-TACO,
volume = "4",
number = "1",
pages = "7:1--7:44",
month = mar,
year = "2007",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Apr 14 10:44:57 MDT 2007",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bower:2007:ODH,
author = "Fred A. Bower and Daniel J. Sorin and Sule Ozev",
title = "Online diagnosis of hard faults in microprocessors",
journal = j-TACO,
volume = "4",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250728",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We develop a microprocessor design that tolerates hard
faults, including fabrication defects and in-field
faults, by leveraging existing microprocessor
redundancy. To do this, we must: detect and correct
errors, diagnose hard faults at the field
deconfigurable unit (FDU) granularity, and deconfigure
FDUs with hard faults. In our reliable microprocessor
design, we use DIVA dynamic verification to detect and
correct errors. Our new scheme for diagnosing hard
faults tracks instructions' core structure occupancy
from decode until commit. If a DIVA checker detects an
error in an instruction, it increments a small
saturating error counter for every FDU used by that
instruction, including that DIVA checker. A hard fault
in an FDU quickly leads to an above-threshold error
counter for that FDU and thus diagnoses the fault. For
deconfiguration, we use previously developed schemes
for functional units and buffers and present a scheme
for deconfiguring DIVA checkers. Experimental results
show that our reliable microprocessor quickly and
accurately diagnoses each hard fault that is injected
and continues to function, albeit with somewhat
degraded performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "fine-grained diagnosis; hard fault tolerance;
processor microarchitecture",
}
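%%% The diagnosis scheme in the entry above reduces to per-FDU
%%% saturating error counters. The C sketch below was written for this
%%% bibliography, not taken from the paper; NUM_FDUS, THRESHOLD, and
%%% report_error are hypothetical names chosen for illustration.
%%%
%%%   #include <stdbool.h>
%%%   #include <stdint.h>
%%%
%%%   #define NUM_FDUS    32
%%%   #define COUNTER_MAX 15    /* small saturating counter */
%%%   #define THRESHOLD   12
%%%
%%%   static uint8_t err_count[NUM_FDUS];
%%%   static bool    faulty[NUM_FDUS];
%%%
%%%   /* Called when a checker flags an erroneous instruction;
%%%      fdus_used lists the FDUs that instruction occupied from
%%%      decode to commit. A unit whose counter crosses THRESHOLD
%%%      is diagnosed as hard-faulty and deconfigured. */
%%%   void report_error(const int *fdus_used, int n)
%%%   {
%%%       for (int i = 0; i < n; i++) {
%%%           int f = fdus_used[i];
%%%           if (err_count[f] < COUNTER_MAX)
%%%               err_count[f]++;
%%%           if (err_count[f] >= THRESHOLD)
%%%               faulty[f] = true;
%%%       }
%%%   }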
@Article{Michaud:2007:STM,
author = "Pierre Michaud and Andr{\'e} Seznec and Damien Fetis
and Yiannakis Sazeides and Theofanis Constantinou",
title = "A study of thread migration in temperature-constrained
multicores",
journal = j-TACO,
volume = "4",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250729",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Temperature has become an important constraint in
high-performance processors, especially multicores.
Thread migration will be essential to exploit the full
potential of future thermally constrained multicores.
We propose and study a thread migration method that
maximizes performance under a temperature constraint,
while minimizing the number of migrations and ensuring
fairness between threads. We show that thread migration
brings important performance gains and that it is most
effective during the first tens of seconds following a
decrease of the number of running threads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "multicore processor; power density; temperature;
thermal management; thread migration",
}
@Article{Chen:2007:CRL,
author = "Yu Chen and Fuxin Zhang",
title = "Code reordering on limited branch offset",
journal = j-TACO,
volume = "4",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250730",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Since the 1980's code reordering has gained popularity
as an important way to improve the spatial locality of
programs. While the effect of the processor's
microarchitecture and memory hierarchy on this
optimization technique has been investigated, little
research has focused on the impact of the instruction
set. In this paper, we analyze the effect of limited
branch offset of the MIPS-like instruction set [Hwu et
al. 2004, 2005] on code reordering, explore two simple
methods to handle the exceeded branches, and propose
the bidirectional code layout (BCL) algorithm to reduce
the number of branches exceeding the offset limit. The
BCL algorithm sorts the chains according to the
position of related chains, avoids cache conflict
misses deliberately and lays out the code
bidirectionally. It strikes a balance among the
distance of related blocks, the instruction cache miss
rate, the memory size required, and the control flow
transfer. Experimental results show that BCL can
effectively reduce exceeded branches by 50.1\%, on
average, and by up to 100\% for some programs. Except
for some programs with little spatial locality, the BCL
algorithm achieves essentially the same performance as
the case with no branch offset limitation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "code reordering; Godson Processor; link-time
optimization",
}
@Article{Terechko:2007:ICC,
author = "A. S. Terechko and H. Corporaal",
title = "Inter-cluster communication in {VLIW} architectures",
journal = j-TACO,
volume = "4",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250731",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The traditional VLIW (very long instruction word)
architecture with a single register file does not scale
up well to address growing performance demands on
embedded media processors. However, splitting a VLIW
processor in smaller clusters, which are comprised of
function units fully connected to local register files,
can significantly improve VLSI implementation
characteristics of the processor, such as speed, energy
consumption, and area. In our paper we reveal that
achieving the best characteristics of a clustered VLIW
requires a thorough selection of an Inter-cluster
Communication (ICC) model, which is the way clustering
is exposed in the Instruction Set Architecture. For our
study, we first define a taxonomy of ICC models
including copy operations, dedicated issue slots,
extended operands, extended results, and multicast.
Evaluation of the execution time of the models requires
both the dynamic cycle count and clock period. We
developed an advanced instruction scheduler for all the
five ICC models in order to quantify the dynamic cycle
counts of our multimedia C benchmarks. To assess the
clock period of the ICC models we designed and laid out
VLIW datapaths using the RTL hardware descriptions
derived from a deeply pipelined commercial TriMedia
processor. In contrast to prior art, our research shows
that fully distributed register file architectures
(with eight clusters in our study) often underperform
compared to moderately clustered machines with two or
four clusters because of explosion of the cycle count
overhead in the former. Among the evaluated ICC models,
performance of the copy operation model, popular both
in academia and industry, is severely limited by the
copy operations hampering scheduling of regular
operations in high ILP (instruction-level parallelism)
code. The dedicated issue slots model combats this
limitation by dedicating extra VLIW issue slots purely
for ICC, reaching the highest execution-time speedup of
1.74 relative to the unicluster. Furthermore, our
VLSI experiments show that the extended operands model
achieves the lowest area and energy consumption, 42 and
57\% relative to the unicluster, respectively, while
nevertheless providing higher performance than the copy
operation model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "clock frequency; cluster assignment; instruction
scheduler; instruction-level parallelism; intercluster
communication; optimizing compiler; pipelining;
register allocation; VLIW",
}
@Article{Dou:2007:CCM,
author = "Jialin Dou and Marcelo Cintra",
title = "A compiler cost model for speculative
parallelization",
journal = j-TACO,
volume = "4",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250732",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Speculative parallelization is a technique that allows
code sections that cannot be fully analyzed by the
compiler to be aggressively executed in parallel.
However, while speculative parallelization can
potentially deliver significant speedups, several
overheads associated with this technique can limit
these speedups in practice. This paper proposes a novel
compiler static cost model of speculative multithreaded
execution that can be used to predict the resulting
performance. This model attempts to predict the
expected speedups, or slowdowns, of the candidate
speculative sections by estimating the combined runtime
effects of various overheads, while taking into account
the scheduling restrictions of most speculative
execution environments. The model is based
on estimating the likely execution duration of threads
and considers all the possible permutations of these
threads. This model also produces a quantitative
estimate of the speedup, which is different from prior
heuristics that only qualitatively estimate the
benefits of speculative multithreaded execution. In
previous work, a limited version of the framework was
evaluated on a number of loops from a collection of
SPEC benchmarks that suffer mainly from load imbalance
and thread dispatch and commit overheads. In this work,
an extended framework is also evaluated on loops that
may suffer from data-dependence violations.
Experimental results show that prediction accuracy is
lower when loops with violations are included.
Nevertheless, accuracy is still very high for a static
model: the framework can identify, on average, 45\% of
the loops that cause slowdowns and, on average, 96\% of
the loops that lead to speedups; it predicts the
speedups or slowdowns with an error of less than 20\%
for an average of 28\% of the loops across the
benchmarks and with an error of less than 50\% for an
average of 80\% of the loops. Overall, the framework
often outperforms, by as much as 25\%, a naive approach
that attempts to speculatively parallelize all the
loops considered, and is able to curb the large
slowdowns caused in many cases by this naive
approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "speculative multithreading; speculative
parallelization; thread-level speculation",
}
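%%% The cost model described in the preceding abstract estimates
%%% thread durations and considers their possible interleavings. A
%%% minimal sketch of that flavor of model follows, assuming an
%%% in-order-commit timing model with fixed per-thread dispatch and
%%% commit overheads and round-robin dispatch; the function and
%%% parameter names are illustrative, not the paper's.
%%%
%%% #include <assert.h>
%%%
%%% /* Predict the speedup of a speculative section from estimated
%%%    thread durations dur[0..n-1] on p processors.  A return value
%%%    below 1.0 predicts a slowdown. */
%%% double estimate_speedup(const double *dur, int n, int p,
%%%                         double dispatch, double commit)
%%% {
%%%     double busy[64] = {0}, seq = 0.0, last_commit = 0.0;
%%%     assert(p > 0 && p <= 64);
%%%     for (int i = 0; i < n; i++) {
%%%         double end = busy[i % p] + dispatch + dur[i];
%%%         /* in-order commit: wait for the predecessor's commit */
%%%         last_commit = (end > last_commit ? end : last_commit) + commit;
%%%         busy[i % p] = last_commit;   /* core freed after commit */
%%%         seq += dur[i];
%%%     }
%%%     return last_commit > 0.0 ? seq / last_commit : 1.0;
%%% }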
@Article{Amme:2007:SBM,
author = "Wolfram Amme and Jeffery von Ronne and Michael Franz",
title = "{SSA}-based mobile code: {Implementation} and
empirical evaluation",
journal = j-TACO,
volume = "4",
number = "2",
pages = "13:1--13:??",
month = jun,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1250727.1250733",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:40:54 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Although one might expect transportation formats based
on static single-assignment form (SSA) to yield faster
just-in-time compilation times than those based on
stack-based virtual machines, this claim has not
previously been validated in practice. We attempt to
quantify the effect of using an SSA-based mobile code
representation by integrating support for a verifiable
SSA-based IR into Jikes RVM. Performance results,
measured with various optimizations and on both the
IA32 and PowerPC, show improvements in both compilation
time and code quality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "SafeTSA; static single-assignment form; virtual
machines",
}
@Article{Li:2007:CCE,
author = "Xiaodong Li and Ritu Gupta and Sarita V. Adve and
Yuanyuan Zhou",
title = "Cross-component energy management: {Joint} adaptation
of processor and memory",
journal = j-TACO,
volume = "4",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275938",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Researchers have proposed the use of adaptation to
reduce the energy consumption of different hardware
components, such as the processor, memory, disk, and
display for general-purpose applications. Previous
algorithms to control these adaptations, however, have
focused on a single component. This work takes the
first step toward developing algorithms that can
jointly control adaptations in multiple interacting
components for general-purpose applications, with the
goal of minimizing the total energy consumed within a
specified performance loss. Specifically, we develop a
joint-adaptation algorithm for processor and memory
adaptations. We identify two properties that enable
per-component algorithms to be easily used in a
cross-component context---the algorithms' performance
impact must be guaranteed and composable. We then
modify a current processor and a memory algorithm to
obey these properties. This allows the cross-component
problem to be reduced to determining an appropriate
(energy-optimal) allocation of the target performance
loss (slack) between the two components. We develop
such an optimal slack allocation algorithm that
exploits the above properties. The result is an
efficient cross-component adaptation framework that
minimizes the total energy of the processor and memory
without exceeding the target performance loss, while
substantially leveraging current per-component
algorithms. Our experiments show that joint processor
and memory adaptation provides significantly more
energy savings than adapting either component alone;
intelligent slack distribution is especially
effective for highly compute- or memory-intensive
applications; and the performance slowdown never
exceeds the specification.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "adaptive systems; control algorithms; energy
management; low-power design; memory; performance
guarantee; processor",
}
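%%% Under the composability property described above, the
%%% cross-component step reduces to splitting the performance-loss
%%% budget between the two components. A minimal sketch, assuming
%%% each per-component algorithm exposes a predicted-energy table
%%% indexed by slack in 1\% steps (table names and granularity are
%%% illustrative assumptions):
%%%
%%% /* Return the slack share (in %) to give the processor so that
%%%    the predicted total energy of processor + memory is minimal,
%%%    for a total performance-loss budget of total_slack_pct. */
%%% int best_slack_split(const double *energy_cpu,
%%%                      const double *energy_mem,
%%%                      int total_slack_pct)
%%% {
%%%     int best = 0;
%%%     double best_e = energy_cpu[0] + energy_mem[total_slack_pct];
%%%     for (int s = 1; s <= total_slack_pct; s++) {
%%%         double e = energy_cpu[s] + energy_mem[total_slack_pct - s];
%%%         if (e < best_e) { best_e = e; best = s; }
%%%     }
%%%     return best;
%%% }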
@Article{Gabor:2007:FES,
author = "Ron Gabor and Shlomo Weiss and Avi Mendelson",
title = "Fairness enforcement in switch on event
multithreading",
journal = j-TACO,
volume = "4",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275939",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The need to reduce power and complexity will increase
the interest in Switch On Event multithreading
(coarse-grained multithreading). Switch On Event
multithreading is a low-power and low-complexity
mechanism to improve processor throughput by switching
threads on execution stalls. Fairness may, however,
become a problem in a multithreaded processor. Unless
fairness is properly handled, some threads may starve
while others consume all of the processor cycles.
Heuristics that were devised in order to improve
fairness in simultaneous multithreading are not
applicable to Switch On Event multithreading. This
paper defines the fairness metric using the ratio of
the individual threads' speedups and shows how it can
be enforced in Switch On Event multithreading. Fairness
is controlled by forcing additional thread switch
points. These switch points are determined dynamically
by runtime estimation of the single threaded
performance of each of the individual threads. We
analyze the impact of the fairness enforcement
mechanism on aggregate IPC and weighted speedup. We
present simulation results of the performance of Switch
On Event multithreading. Switch On Event multithreading
achieves an average aggregate IPC increase of 26\% over
single-threaded execution and a 12\% weighted speedup when no
fairness is enforced. In this case, a sixth of our runs
resulted in poor fairness in which one thread ran
extremely slowly (10 to 100 times slower than its
single-thread performance), while the other thread's
performance was hardly affected. By using the proposed
mechanism, we can guarantee fairness at different
levels of strictness and, in most cases, even improve
the weighted speedup.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "coarse-grained multithreading; fairness;
multithreading; performance; SOE; Switch on Event
multithreading; throughput; weighted speedup",
}
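%%% The fairness metric defined in the abstract, the ratio of the
%%% individual threads' speedups over their single-threaded
%%% performance, can be written down directly; the threshold test
%%% that forces an extra switch point is an illustrative assumption,
%%% not the paper's exact policy.
%%%
%%% /* Fairness of a two-thread SOE workload: 1.0 is perfectly fair,
%%%    values near 0 mean one thread is starving. */
%%% double fairness(double ipc0, double ipc0_single,
%%%                 double ipc1, double ipc1_single)
%%% {
%%%     double s0 = ipc0 / ipc0_single;   /* speedup of thread 0 */
%%%     double s1 = ipc1 / ipc1_single;   /* speedup of thread 1 */
%%%     return s0 < s1 ? s0 / s1 : s1 / s0;
%%% }
%%%
%%% /* Force an additional thread-switch point when fairness drops
%%%    below the desired strictness level (e.g., 0.9). */
%%% int must_force_switch(double f, double threshold)
%%% {
%%%     return f < threshold;
%%% }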
@Article{Andrade:2007:PAA,
author = "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
Doallo",
title = "Precise automatable analytical modeling of the cache
behavior of codes with indirections",
journal = j-TACO,
volume = "4",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275940",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The performance of memory hierarchies, in which caches
play an essential role, is critical in today's
general-purpose and embedded computing systems because
of the growing memory bottleneck problem.
Unfortunately, cache behavior is very unstable and
difficult to predict. This is particularly true in the
presence of irregular access patterns, which exhibit
little locality. Such patterns are very common, for
example, in applications in which pointers or
compressed sparse matrices give rise to indirections.
Nevertheless, cache behavior in the presence of
irregular access patterns has not been widely studied.
In this paper we present an extension of a systematic
analytical modeling technique based on PMEs
(probabilistic miss equations), previously developed by
the authors, that allows the automated analysis of the
cache behavior for codes with irregular access patterns
resulting from indirections. The model generates very
accurate predictions despite the irregularities and has
very low computing requirements; it is the first model
that combines these desirable characteristics and can
automatically analyze this kind of code. These
properties enable this model to help drive compiler
optimizations, as we show with an example.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "analytical modeling; irregular access patterns; memory
hierarchy; performance prediction",
}
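%%% A much-simplified probabilistic miss equation conveys the spirit
%%% of the PME approach (the paper's equations are considerably more
%%% refined): the probability that a reuse of a line misses in a
%%% K-way set-associative LRU cache with S sets, when n interfering
%%% lines are accessed between the two uses and each maps to the
%%% reused line's set independently with probability 1/S. Assumes
%%% S >= 2.
%%%
%%% #include <math.h>
%%%
%%% double pme_miss_prob(int n, int k, int s)
%%% {
%%%     double p = 1.0 / s, q = 1.0 - p;
%%%     double term = pow(q, n);   /* P(0 interferers in the set) */
%%%     double hit = 0.0;
%%%     for (int j = 0; j < k && j <= n; j++) {
%%%         hit += term;           /* fewer than K evictions: hit */
%%%         term *= (double)(n - j) / (j + 1) * p / q;  /* binomial step */
%%%     }
%%%     return 1.0 - hit;
%%% }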
@Article{Venstermans:2007:JOH,
author = "Kris Venstermans and Lieven Eeckhout and Koen {De
Bosschere}",
title = "{Java} object header elimination for reduced memory
consumption in 64-bit virtual machines",
journal = j-TACO,
volume = "4",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275941",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory performance is an important design issue for
contemporary computer systems given the huge
processor/memory speed gap. This paper proposes a
space-efficient Java object model for reducing the
memory consumption of 64-bit Java virtual machines. We
completely eliminate the object header through typed
virtual addressing (TVA) or implicit typing. TVA
encodes the object type in the object's virtual address
by allocating all objects of a given type in a
contiguous memory segment. This allows for removing the
type information as well as the status field from the
object header. Whenever type and status information is
needed, masking is applied to the object's virtual
address for obtaining an offset into type and status
information structures. Unlike previous work on
implicit typing, we apply TVA to a selected number of
frequently allocated object types, hence the name
selective TVA (STVA); this limits the amount of memory
fragmentation. In addition to applying STVA, we also
compress the type information block (TIB) pointers for
all objects that do not fall under TVA. We implement
the space-efficient Java object model in the 64-bit
version of the Jikes RVM on an IBM AIX platform and
compare its performance against the traditionally used
Java object model using a multitude of Java benchmarks.
We conclude that the space-efficient Java object model
reduces memory consumption by on average 15\% (and up
to 45\% for some benchmarks). About one-half the
reduction comes from TIB pointer compression; the other
one-half comes from STVA. In terms of performance, the
space-efficient object model generally does not affect
performance; however, for some benchmarks we observe
statistically significant performance speedups, up to
20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "64-bit implementation; implicit typing; Java object
model; typed virtual addressing; Virtual machine",
}
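%%% The header elimination rests on recovering the type from the
%%% virtual address itself. A minimal sketch of typed virtual
%%% addressing, assuming one contiguous segment per TVA type; the
%%% segment base, segment size, and table names are illustrative
%%% assumptions.
%%%
%%% #include <stdint.h>
%%%
%%% #define TVA_BASE     0x0000200000000000ULL /* start of TVA region */
%%% #define TVA_SEG_BITS 32                    /* one 4-GB segment per type */
%%%
%%% /* Valid only for addresses inside the TVA region. */
%%% static inline int tva_type_of(uint64_t addr)
%%% {
%%%     return (int)((addr - TVA_BASE) >> TVA_SEG_BITS);
%%% }
%%%
%%% /* The per-type tables replace the per-object header word. */
%%% extern void *tib_table[];                  /* type information blocks */
%%% static inline void *tib_of(uint64_t addr)
%%% {
%%%     return tib_table[tva_type_of(addr)];
%%% }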
@Article{Xiao:2007:VIS,
author = "Shu Xiao and Edmund M.-K. Lai",
title = "{VLIW} instruction scheduling for minimal power
variation",
journal = j-TACO,
volume = "4",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275942",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The focus of this paper is on the minimization of the
variation in power consumed by a VLIW processor during
the execution of a target program through instruction
scheduling. The problem is formulated as a
mixed-integer program (MIP) and a problem-specific
branch-and-bound algorithm has been developed to solve
it more efficiently than generic MIP solvers.
Simulation results based on the TMS320C6711 VLIW
digital signal processor using benchmarks from
Mediabench and Trimaran showed that over 40\% average
reduction in power variation can be achieved without
sacrificing the execution speed of these benchmarks.
Computational requirements and convergence rates of our
algorithm are also analyzed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "instruction scheduling; power variation reduction;
VLIW processors",
}
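%%% The abstract does not reproduce the mixed-integer program; the
%%% generic formulation below sketches its likely shape and is a
%%% reconstruction for illustration only. Binary variable $x_{i,t}$
%%% means operation $i$ issues in cycle $t$; $p_i$ is its power cost,
%%% $\bar{P}$ the mean power, $\ell_{ij}$ a dependence latency, and
%%% $R_s$ a resource bound.
%%%
%%% \begin{align*}
%%%   \min\; z \quad \text{s.t.}\quad
%%%     & -z \;\le\; P_t - \bar{P} \;\le\; z, \qquad
%%%       P_t = \sum_i p_i\, x_{i,t} && \forall t \\
%%%     & \sum_t x_{i,t} = 1 && \forall i \\
%%%     & \sum_t t\, x_{j,t} \;\ge\; \sum_t t\, x_{i,t} + \ell_{ij}
%%%       && \forall\ \text{dependences}\ (i,j) \\
%%%     & \sum_i r_{i,s}\, x_{i,t} \;\le\; R_s, \qquad
%%%       x_{i,t} \in \{0,1\} && \forall t, s
%%% \end{align*}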
@Article{Tallam:2007:UCF,
author = "Sriraman Tallam and Rajiv Gupta",
title = "Unified control flow and data dependence traces",
journal = j-TACO,
volume = "4",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2007",
CODEN = "????",
DOI = "https://doi.org/10.1145/1275937.1275943",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:20 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We describe the design, generation, and compression of
the extended whole program path (eWPP) representation,
which captures not only the control flow history of a
program execution but also its data
This representation is motivated by the observation
that, typically, a significant fraction of data
dependence history can be recovered from the control
flow trace. To capture the remainder of the data
dependence history, we introduce disambiguation checks
in the program whose control flow signatures capture
the results of the checks. The resulting extended
control flow trace enables the recovery of otherwise
irrecoverable data dependences. The code for the checks
is designed to minimize the increase in program
execution time and the extended control flow trace size
when compared to directly collecting control flow and
address traces. Our experiments show that compressed
eWPPs are only one-quarter of the size of combined
compressed control flow and address traces. However,
their collection incurs a 5{\times} increase in runtime
overhead relative to the overhead required for directly
collecting the control flow and address traces.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "address trace; control flow trace; dynamic data
dependence trace; profiling",
}
@Article{Ipek:2008:EAD,
author = "Engin Ipek and Sally A. McKee and Karan Singh and Rich
Caruana and Bronis R. de Supinski and Martin Schulz",
title = "Efficient architectural design space exploration via
predictive modeling",
journal = j-TACO,
volume = "4",
number = "4",
pages = "1:1--1:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328196",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Efficiently exploring exponential-size architectural
design spaces with many interacting parameters remains
an open problem: the sheer number of experiments
required renders detailed simulation intractable. We
attack this via an automated approach that builds
accurate predictive models. We simulate sampled points,
using results to teach our models the function
describing relationships among design parameters. The
models can be queried and are very fast, enabling
efficient design tradeoff discovery. We validate our
approach via two uniprocessor sensitivity studies,
predicting IPC with only 1--2\% error. In an
experimental study using the approach, training on 1\%
of a 250-K-point CMP design space allows our models to
predict performance with only 4--5\% error. Our
predictive modeling combines well with techniques that
reduce the time taken by each simulation experiment,
achieving net time savings of three to four orders of
magnitude.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "artificial neural networks; design space exploration;
performance prediction; sensitivity studies",
}
@Article{Shi:2008:VMS,
author = "Yunhe Shi and Kevin Casey and M. Anton Ertl and David
Gregg",
title = "Virtual machine showdown: {Stack} versus registers",
journal = j-TACO,
volume = "4",
number = "4",
pages = "2:1--2:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328197",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Virtual machines (VMs) enable the distribution of
programs in an architecture-neutral format, which can
easily be interpreted or compiled. A long-running
question in the design of VMs is whether a stack
architecture or register architecture can be
implemented more efficiently with an interpreter. We
extend existing work on comparing virtual stack and
virtual register architectures in three ways. First,
our translation from stack to register code and
optimization are much more sophisticated. The result is
that we eliminate an average of more than 46\% of
executed VM instructions, with the bytecode size of the
register machine being only 26\% larger than that of
the corresponding stack machine. Second, we present a fully
functional virtual-register implementation of the Java
virtual machine (JVM), which supports Intel, AMD64,
PowerPC and Alpha processors. This register VM supports
inline-threaded, direct-threaded, token-threaded, and
switch dispatch. Third, we present experimental results
on a range of additional optimizations such as register
allocation and elimination of redundant heap loads. On
the AMD64 architecture the register machine using
switch dispatch achieves an average speedup of 1.48
over the corresponding stack machine. Even using the
more efficient inline-threaded dispatch, the register
VM achieves a speedup of 1.15 over the equivalent
stack-based VM.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "interpreter; register architecture; stack
architecture; virtual machine",
}
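%%% To make the stack-versus-register contrast concrete, here is a
%%% minimal switch-dispatch loop for a toy register VM; the encoding
%%% and opcode set are illustrative assumptions. Where a stack VM
%%% spends three instructions (two pushes and an add), this machine
%%% names its operands explicitly and spends one.
%%%
%%% #include <stdint.h>
%%%
%%% enum { OP_LOADC, OP_ADD, OP_RET };
%%% typedef struct { uint8_t op, a, b, c; int32_t imm; } Insn;
%%%
%%% int32_t run(const Insn *code, int32_t *reg)
%%% {
%%%     for (const Insn *ip = code; ; ip++) {
%%%         switch (ip->op) {                     /* switch dispatch */
%%%         case OP_LOADC: reg[ip->a] = ip->imm;                 break;
%%%         case OP_ADD:   reg[ip->a] = reg[ip->b] + reg[ip->c]; break;
%%%         case OP_RET:   return reg[ip->a];
%%%         }
%%%     }
%%% }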
@Article{Yan:2008:EVR,
author = "Jun Yan and Wei Zhang",
title = "Exploiting virtual registers to reduce pressure on
real registers",
journal = j-TACO,
volume = "4",
number = "4",
pages = "3:1--3:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328198",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "It is well known that a large fraction of variables
are short-lived. This paper proposes a novel approach
to exploiting this fact to reduce the register pressure
for pipelined processors with data-forwarding network.
The idea is that the compiler can allocate virtual
registers (i.e., placeholders to identify dependences
among instructions) to short-lived variables, which do
not need to be stored to physical storage locations. As
a result, real registers (i.e., physically existing
registers) can be reserved for long-lived variables for
mitigating the register pressure and decreasing the
register spills, leading to performance improvement. In
this paper, we develop the architectural and compiler
support for exploiting virtual registers for statically
scheduled processors. Our experimental results show
that virtual registers are very effective at reducing
the register spills and, in many cases, can achieve
performance close to that of a processor with twice the
number of real registers. Our results also indicate
that, for some applications, using 24 virtual registers
in addition to 8 real registers can attain even higher
performance than 16 real registers without any virtual
registers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "data forwarding; register allocation; register file;
short-lived variables; virtual register",
}
@Article{Yu:2008:OCL,
author = "Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang",
title = "Object co-location and memory reuse for {Java}
programs",
journal = j-TACO,
volume = "4",
number = "4",
pages = "4:1--4:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328199",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We introduce a new memory management system, STEMA,
which can improve the execution time of Java programs.
STEMA detects prolific types on-the-fly and co-locates
their objects in a special memory space which supports
reuse of memory. We argue and show that memory reuse
and co-location of prolific objects can result in
improved cache locality, reduced memory fragmentation,
reduced GC time, and faster object allocation. We
evaluate STEMA using 16 benchmarks. Experimental
results show that STEMA performs, on average, 2.7\%,
4.0\%, and 8.2\% better than MarkSweep, CopyMS, and
SemiSpace, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "garbage collector; Java; memory allocator; memory
reuse; mutator; object co-location",
}
@Article{Zhang:2008:RCM,
author = "Chuanjun Zhang",
title = "Reducing cache misses through programmable decoders",
journal = j-TACO,
volume = "4",
number = "4",
pages = "5:1--5:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328200",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Level-one caches normally reside on a processor's
critical path, which determines clock frequency.
Therefore, fast access to level-one cache is important.
Direct-mapped caches exhibit faster access time, but
poor hit rates, compared with same sized
set-associative caches because of nonuniform accesses
to the cache sets. The nonuniform accesses generate
more cache misses in some sets, while other sets are
underutilized. We propose to increase the decoder
length and, hence, reduce the accesses to heavily used
sets without dynamically detecting the cache set usage
information. We increase the access to the
underutilized cache sets by incorporating a replacement
policy into the cache design using programmable
decoders. On average, the proposed techniques achieve
as low a miss rate as a traditional 4-way cache on all
26 SPEC2K benchmarks for both the instruction and the
data caches. This translates into an average IPC
improvement of 21.5\% and 42.4\% for SPEC2K integer
and floating-point benchmarks, respectively. The
B-Cache consumes 10.5\% more power per access, but
exhibits a 12\% savings in total memory-access-related
energy as a result of the miss-rate reductions and,
hence, the reduction in application execution time.
Compared with previous techniques that aim at reducing
the miss rate of direct-mapped caches, our technique
requires only one cycle to access all cache hits and
has the same access time as a direct-mapped cache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache; dynamic optimization; low power",
}
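%%% A hedged sketch of the programmable-decoder idea described above:
%%% the decoder sees more index bits than a conventional direct-mapped
%%% cache, and a small programmable table maps the widened index onto
%%% the physical sets, so accesses can be steered away from heavily
%%% used sets without any dynamic usage detector. The table
%%% organization here is an assumption for illustration.
%%%
%%% #include <stdint.h>
%%%
%%% #define SETS     512               /* physical sets (9 plain bits) */
%%% #define IDX_BITS 10                /* widened decoder sees 10 bits */
%%%
%%% extern const uint16_t pdec[1 << IDX_BITS]; /* programmable entries */
%%%
%%% /* One table lookup keeps set selection single-cycle, preserving
%%%    the direct-mapped hit time. */
%%% static inline unsigned cache_set(uint32_t addr, unsigned line_bits)
%%% {
%%%     unsigned wide = (addr >> line_bits) & ((1u << IDX_BITS) - 1);
%%%     return pdec[wide] % SETS;
%%% }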
@Article{Golander:2008:HMP,
author = "Amit Golander and Shlomo Weiss",
title = "Hiding the misprediction penalty of a
resource-efficient high-performance processor",
journal = j-TACO,
volume = "4",
number = "4",
pages = "6:1--6:??",
month = jan,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1328195.1328201",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:35 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Misprediction is a major obstacle for increasing
speculative out-of-order processors performance.
Performance degradation depends on both the number of
misprediction events and the recovery time associated
with each one of them. In recent years a few checkpoint
based microarchitectures have been proposed. In
comparison with ROB-based processors, checkpoint
processors are scalable and highly resource efficient.
Unfortunately, in these proposals the misprediction
recovery time is proportional to the instruction queue
size.\par
In this paper we analyze methods to reduce the
misprediction recovery time. We propose a new register
file management scheme and techniques to selectively
flush the instruction queue and the load store queue,
and to isolate deeply pipelined execution units. The
result is a novel checkpoint processor with Constant
misprediction RollBack time (CRB). We further present a
streamlined, cost-efficient solution, which saves
complexity at the price of slightly lower
performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "checkpoints; misprediction; out-of-order execution;
rollback; scalable architecture",
}
@Article{Calder:2008:E,
author = "Brad Calder and Dean Tullsen",
title = "Editorial",
journal = j-TACO,
volume = "5",
number = "1",
pages = "1:1--1:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369397",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mysore:2008:FIP,
author = "Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber
and Timothy Sherwood and Nisheeth Shrivastava and
Subhash Suri",
title = "Formulating and implementing profiling over adaptive
ranges",
journal = j-TACO,
volume = "5",
number = "1",
pages = "2:1--2:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369398",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern computer systems are called on to deal with
billions of events every second, whether they are
executed instructions, accessed memory locations, or
forwarded packets. This presents a serious challenge to
those who seek to quantify, analyze, or optimize such
systems, because important trends and behaviors may
easily be lost in a sea of data. We present
range-adaptive profiling (RAP) as a new and
general-purpose profiling method capable of
efficiently and hierarchically classifying streams of data
in hardware. Through the use of RAP, events in an input
stream are dynamically classified into increasingly
precise categories, based on the frequency with which
they occur. The more important a class, or range of
events, the more precisely it is quantified. Despite
the dynamic nature of our technique, we build upon
tight theoretic bounds covering both worst-case error
and the required memory. In the limit, it is
known that error and the memory bounds can be
independent of the stream size and grow only linearly
with the level of precision desired. Significantly, we
expose the critical constants in these algorithms and
through careful engineering, algorithm redesign, and
use of heuristics, we show how a high-performance
profile system can be implemented for range-adaptive
profiling. RAP can be used on various profiles, such as
PCs, load values, and memory addresses, and has a broad
range of uses, from hot-region profiling to quantifying
cache miss value locality. We propose two methods of
implementation of RAP, one in software and the other
with specialized hardware, for which we also describe
our prototype FPGA implementation. We show that with
just 8KB of memory, range profiles can be gathered with
an average accuracy of 98\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "profiling hardware; range adaptive; value locality",
}
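%%% The refinement step at the heart of range-adaptive profiling can
%%% be sketched as follows, assuming a binary range tree in which a
%%% node splits once its counter crosses a hot threshold; the data
%%% structure and split policy are illustrative assumptions, not the
%%% paper's hardware design.
%%%
%%% #include <stdint.h>
%%% #include <stdlib.h>
%%%
%%% typedef struct Node {
%%%     uint64_t lo, hi, count;
%%%     struct Node *left, *right;     /* NULL until the node splits */
%%% } Node;
%%%
%%% void rap_insert(Node *n, uint64_t v, uint64_t threshold)
%%% {
%%%     n->count++;
%%%     if (n->left) {                 /* already refined: descend */
%%%         rap_insert(v <= n->left->hi ? n->left : n->right,
%%%                    v, threshold);
%%%     } else if (n->count > threshold && n->lo < n->hi) {
%%%         uint64_t mid = n->lo + (n->hi - n->lo) / 2;
%%%         n->left  = calloc(1, sizeof(Node));
%%%         n->right = calloc(1, sizeof(Node));
%%%         n->left->lo  = n->lo;   n->left->hi  = mid;
%%%         n->right->lo = mid + 1; n->right->hi = n->hi;
%%%         /* later events falling in the hot range descend and are
%%%            counted at the finer granularity */
%%%     }
%%% }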
@Article{Zhai:2008:CHS,
author = "Antonia Zhai and J. Gregory Steffan and Christopher B.
Colohan and Todd C. Mowry",
title = "Compiler and hardware support for reducing the
synchronization of speculative threads",
journal = j-TACO,
volume = "5",
number = "1",
pages = "3:1--3:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369399",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Thread-level speculation (TLS) allows us to
automatically parallelize general-purpose programs by
supporting parallel execution of threads that might not
actually be independent. In this article, we focus on
one important limitation of program performance under
TLS: stalls that result from synchronizing and
forwarding scalar values between speculative threads
that would otherwise cause frequent data dependences
and, hence, failed speculation. Using SPECint
benchmarks that have been automatically transformed by
our compiler to exploit TLS, we present, evaluate in
detail, and compare both compiler and hardware
techniques for improving the communication of scalar
values. We find that through our dataflow algorithms
for three increasingly aggressive instruction
scheduling techniques, the compiler can drastically
reduce the critical forwarding path introduced by the
synchronization and forwarding of scalar values. We
also show that hardware techniques for reducing
synchronization can be complementary to compiler
scheduling, but that the additional performance
benefits are minimal and are generally not worth the
cost.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "automatic parallelization; chip-multiprocessing;
instruction scheduling; thread-level speculation",
}
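%%% The critical forwarding path discussed above can be pictured with
%%% a small sketch; wait_scalar/signal_scalar stand in for the
%%% synchronization the compiler inserts between consecutive
%%% speculative threads and are assumptions, not a real API. Hoisting
%%% the forwarded scalar's computation (and its signal) above the
%%% independent work is what the instruction scheduling achieves.
%%%
%%% extern int  wait_scalar(void);   /* blocks on the forwarded value */
%%% extern void signal_scalar(int);  /* forwards to the next thread   */
%%% extern void long_independent_work(void);
%%% extern int  local_contribution(void);
%%%
%%% /* Before scheduling: the signal sits at the bottom of the body,
%%%    so the next thread stalls for the whole iteration. */
%%% void iteration_naive(void)
%%% {
%%%     int sum = wait_scalar();
%%%     long_independent_work();
%%%     sum += local_contribution();
%%%     signal_scalar(sum);
%%% }
%%%
%%% /* After scheduling: the value is computed and signaled early, so
%%%    long_independent_work overlaps with the next thread. */
%%% void iteration_scheduled(void)
%%% {
%%%     int sum = wait_scalar();
%%%     sum += local_contribution();
%%%     signal_scalar(sum);
%%%     long_independent_work();
%%% }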
@Article{Winter:2008:ATN,
author = "Jonathan A. Winter and David H. Albonesi",
title = "Addressing thermal nonuniformity in {SMT} workloads",
journal = j-TACO,
volume = "5",
number = "1",
pages = "4:1--4:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369400",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We explore DTM techniques within the context of
uniform and nonuniform SMT workloads. While DVS is
suitable for addressing workloads with uniformly high
temperatures, for nonuniform workloads, performance
loss occurs because of the slowdown of the cooler
thread. To address this, we propose and evaluate DTM
mechanisms that exploit the steering-based thread
management mechanisms inherent in a clustered SMT
architecture. We show that in contrast to DVS, which
operates globally, our techniques are more effective at
controlling temperature for nonuniform workloads.
Furthermore, we devise a DTM technique that combines
steering and DVS to achieve consistently good
performance across all workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "adaptive microarchitectures; clustered
microarchitectures; dynamic thermal management; dynamic
voltage scaling; simultaneous multithreading",
}
@Article{Shahbahrami:2008:VES,
author = "Asadollah Shahbahrami and Ben Juurlink and Stamatis
Vassiliadis",
title = "Versatility of extended subwords and the matrix
register file",
journal = j-TACO,
volume = "5",
number = "1",
pages = "5:1--5:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369401",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Extended subwords and the matrix register file (MRF)
are two microarchitectural techniques that address
some of the limitations of existing SIMD architectures.
Extended subwords are wider than the data stored in
memory. Specifically, for every byte of data stored in
memory, there are four extra bits in the media register
file. This avoids the need for data-type conversion
instructions. The MRF is a register file organization
that provides both conventional row-wise, as well as
column-wise, access to the register file. In other
words, it allows the register file to be viewed as a
matrix in which corresponding subwords in different
registers correspond to a column of the matrix. It was
introduced to accelerate matrix transposition, which is
a very common operation in multimedia applications. In
this paper, we show that the MRF is very versatile,
since it can also be used for other permutations than
matrix transposition. Specifically, it is shown how it
can be used to provide efficient access to strided
data, as is needed in, e.g., color space conversion.
Furthermore, it is shown that special-purpose
instructions (SPIs), such as the sum-of-absolute
differences (SAD) instruction, have limited usefulness
when extended subwords and a few general SIMD
instructions that we propose are supported, for the
following reasons. First, when extended subwords are
supported, the SAD instruction provides only a
relatively small performance improvement. Second, the
SAD instruction processes 8-bit subwords only, which is
sufficient neither for quarter-pixel resolution nor for
cost functions used in image and video retrieval.
Results obtained by extending the SimpleScalar toolset
show that the proposed techniques provide a speedup of
up to 3.00 over the MMX architecture. The results also
show that using, at most, 13 extra media registers
yields an additional performance improvement ranging
from 1.3 to 1.57.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "multimedia standards; SIMD architectures; SIMD
programming",
}
@Article{Guo:2008:EHC,
author = "Zhi Guo and Walid Najjar and Betul Buyukkurt",
title = "Efficient hardware code generation for {FPGAs}",
journal = j-TACO,
volume = "5",
number = "1",
pages = "6:1--6:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1369402",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The wider acceptance of FPGAs as a computing device
requires a higher level of programming abstraction.
ROCCC is an optimizing C to HDL compiler. We describe
the code generation approach in ROCCC. The smart buffer
is a component that reuses input data between adjacent
iterations. It significantly improves the performance
of the circuit and simplifies loop control. The
ROCCC-generated datapath can execute one loop iteration
per clock cycle when there is no loop dependency or
there is only scalar recurrence variable dependency.
ROCCC's approach to supporting while-loops operating on
scalars enables the compiler to move scalar
iterative computation into hardware.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "data reuse; FPGA; high-level synthesis; reconfigurable
computing; VHDL",
}
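%%% The input reuse that the smart buffer captures is the kind found
%%% in windowed loops such as the 5-tap filter below (the function is
%%% illustrative, not from the paper): consecutive iterations share
%%% four of their five inputs, so the generated datapath needs only
%%% one new word per iteration to sustain one loop iteration per
%%% clock cycle.
%%%
%%% void fir5(const int in[], int out[], int n, const int c[5])
%%% {
%%%     /* iteration i reads in[i..i+4]; iteration i+1 re-reads
%%%        in[i+1..i+4], so only in[i+5] is newly fetched */
%%%     for (int i = 0; i + 4 < n; i++)
%%%         out[i] = c[0]*in[i]   + c[1]*in[i+1] + c[2]*in[i+2]
%%%                + c[3]*in[i+3] + c[4]*in[i+4];
%%% }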
@Article{Kotzmann:2008:DJH,
author = "Thomas Kotzmann and Christian Wimmer and Hanspeter
M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth
Russell and David Cox",
title = "Design of the {Java HotSpot\TM} client compiler for
{Java 6}",
journal = j-TACO,
volume = "5",
number = "1",
pages = "7:1--7:??",
month = may,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1369396.1370017",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 16 11:41:51 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Version 6 of Sun Microsystems' Java HotSpot{\TM} VM
ships with a redesigned version of the client
just-in-time compiler that includes several research
results of the last years. The client compiler is at
the heart of the VM configuration used by default for
interactive desktop applications. For such
applications, low startup and pause times are more
important than peak performance. This paper outlines
the new architecture of the client compiler and shows
how it interacts with the VM. It presents the
intermediate representation that now uses static
single-assignment (SSA) form and the linear scan
algorithm for global register allocation. Efficient
support for exception handling and deoptimization
fulfills the demands that are imposed by the dynamic
features of the Java programming language. The
evaluation shows that the new client compiler generates
better code in less time. The popular SPECjvm98
benchmark suite is executed 45\% faster, while the
compilation speed is also up to 40\% better. This
indicates that a carefully selected set of global
optimizations can also be integrated into just-in-time
compilers that focus on compilation speed and not on
peak performance. In addition, the paper presents the
impact of several optimizations on execution and
compilation speed. As the source code is freely
available, the Java HotSpot{\TM} VM and the client
compiler are the ideal basis for experiments with new
feedback-directed optimizations in a production-level
Java just-in-time compiler. The paper outlines research
projects that add fast algorithms for escape analysis,
automatic object inlining, and array bounds check
elimination.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "compiler; deoptimization; intermediate representation;
Java; just-in-time compilation; optimization; register
allocation",
}
@Article{Rangan:2008:PSD,
author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni
and David I. August",
title = "Performance scalability of decoupled software
pipelining",
journal = j-TACO,
volume = "5",
number = "2",
pages = "8:1--8:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400113",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Any successful solution to using multicore processors
to scale general-purpose program performance will have
to contend with rising intercore communication costs
while exposing coarse-grained parallelism. Recently
proposed pipelined multithreading (PMT) techniques have
been demonstrated to have general-purpose applicability
and are also able to effectively tolerate intercore
latencies through pipelined interthread communication.
These desirable properties make PMT techniques strong
candidates for program parallelization on current and
future multicore processors and understanding their
performance characteristics is critical to their
deployment. To that end, this paper evaluates the
performance scalability of a general-purpose PMT
technique called decoupled software pipelining (DSWP)
and presents a thorough analysis of the communication
bottlenecks that must be overcome for optimal DSWP
scalability.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "decoupled software pipelining; performance analysis",
}
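%%% A minimal DSWP-style decomposition, for illustration: the
%%% loop-carried recurrence (pointer chasing) runs as stage 1 on one
%%% core and the off-recurrence work as stage 2 on another, fed
%%% through a pipelined interthread queue. enqueue/dequeue stand for
%%% the communication support and are assumptions here.
%%%
%%% #include <stddef.h>
%%%
%%% typedef struct node { struct node *next; int payload; } node;
%%%
%%% extern void  enqueue(node *);    /* blocks when the queue is full  */
%%% extern node *dequeue(void);      /* blocks when the queue is empty */
%%%
%%% void stage1_traverse(node *head) /* thread on core 0 */
%%% {
%%%     for (node *p = head; p; p = p->next)
%%%         enqueue(p);
%%%     enqueue(NULL);               /* end-of-stream marker */
%%% }
%%%
%%% void stage2_work(long *sum)      /* thread on core 1 */
%%% {
%%%     for (node *p; (p = dequeue()) != NULL; )
%%%         *sum += p->payload;      /* overlaps with stage 1 */
%%% }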
@Article{Long:2008:TMM,
author = "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and
Rajarshi Mukherjee",
title = "Thermal monitoring mechanisms for chip
multiprocessors",
journal = j-TACO,
volume = "5",
number = "2",
pages = "9:1--9:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400114",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With large-scale integration and increasing power
densities, thermal management has become an important
tool to maintain performance and reliability in modern
process technologies. In the core of dynamic thermal
management schemes lies accurate reading of on-die
temperatures. Therefore, careful planning and embedding
of thermal monitoring mechanisms into high-performance
systems becomes crucial. In this paper, we propose
three techniques to create sensor infrastructures for
monitoring the maximum temperature on a multicore
system. Initially, we extend a nonuniform sensor
placement methodology proposed in the literature to
handle chip multiprocessors (CMPs) and show its
limitations. We then analyze a grid-based approach
where the sensors are placed on a static grid covering
each core and show that the sensor readings can differ
from the actual maximum core temperature by as much as
12.6$^\circ$C when using 16 sensors per core. Also, as
many as 10.6\% of the thermal emergencies are not
captured using the same number of sensors. Based on
this observation, we first develop an interpolation
scheme, which estimates the maximum core temperature
through interpolation of the readings collected at the
static grid points. We show that the interpolation
scheme improves the measurement accuracy and emergency
coverage compared to grid-based placement when using
the same number of sensors. Second, we present a
dynamic scheme where only a subset of the sensor
readings is collected to predict the maximum
temperature of each core. Our results indicate that we
can reduce the number of active sensors by as much as
50\%, while maintaining similar measurement accuracy
and emergency coverage compared to the case where the
entire sensor set on the grid is sampled at all
times.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "nonuniform and uniform sensor placement; thermal
sensor allocation",
}
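%%% The core of the interpolation scheme can be sketched as bilinear
%%% interpolation within one grid cell (the paper's scheme is
%%% developed more carefully): estimate the temperature at an interior
%%% point from the four surrounding sensor readings, then take the
%%% maximum of such estimates over candidate hotspot locations.
%%%
%%% /* t00..t11 are the four corner sensor readings of a grid cell;
%%%    (x, y) in [0,1]^2 is the query point within the cell. */
%%% double bilerp(double t00, double t10, double t01, double t11,
%%%               double x, double y)
%%% {
%%%     double bottom = t00 + (t10 - t00) * x;   /* along lower edge */
%%%     double top    = t01 + (t11 - t01) * x;   /* along upper edge */
%%%     return bottom + (top - bottom) * y;
%%% }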
@Article{Joshi:2008:DEP,
author = "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell,
Jr.} and Lizy K. John",
title = "Distilling the essence of proprietary workloads into
miniature benchmarks",
journal = j-TACO,
volume = "5",
number = "2",
pages = "10:1--10:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400115",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Benchmarks set standards for innovation in computer
architecture research and industry product development.
Consequently, it is of paramount importance that these
workloads are representative of real-world
applications. However, composing such representative
workloads poses practical challenges to application
analysis teams and benchmark developers: (1) real-world
workloads are intellectual property and vendors
hesitate to share these proprietary applications; and
(2) porting and reducing these applications to
benchmarks that can be simulated in a tractable amount
of time is a nontrivial task. In this paper, we address
this problem by proposing a technique that
automatically distills key inherent behavioral
attributes of a proprietary workload and captures them
into a miniature synthetic benchmark clone. The
advantage of the benchmark clone is that it hides the
functional meaning of the code but exhibits similar
performance characteristics as the target application.
Moreover, the dynamic instruction count of the
synthetic benchmark clone is substantially shorter than
the proprietary application, greatly reducing overall
simulation time for SPEC CPU, the simulation time
reduction is over five orders of magnitude compared to
entire benchmark execution. Using a set of benchmarks
representative of general-purpose, scientific, and
embedded applications, we demonstrate that the power
and performance characteristics of the synthetic
benchmark clone correlate well with those of the
original application across a wide range of
microarchitecture configurations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "benchmark cloning; benchmarks; workload
characterization",
}
@Article{Catania:2008:RCM,
author = "Vincenzo Catania and Maurizio Palesi and Davide
Patti",
title = "Reducing complexity of multiobjective design space
exploration in {VLIW}-based embedded systems",
journal = j-TACO,
volume = "5",
number = "2",
pages = "11:1--11:??",
month = aug,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1400112.1400116",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Aug 28 13:25:00 MDT 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Architectures based on very-long instruction word
(VLIW) have found fertile ground in multimedia
electronic appliances thanks to their ability to
exploit high degrees of instruction level parallelism
(ILP) with a reasonable trade-off in complexity and
silicon cost. Specialization of such architectures
involves the configuration of both hardware-related
aspects (e.g., register files, functional units, memory
subsystem) and software-related issues (e.g., the
compilation strategy). The complex interactions between
the components of such systems will force a human
designer to rely on judgment and experience in
designing them, possibly eliminating interesting
configurations, and making tuning of the system, for
either power, energy, or performance, difficult. In
this paper we propose tools and methodologies to
efficiently cope with this complexity from a
multiobjective perspective. We first analyze the impact
of ILP-oriented code transformations using two
alternative compilation profiles to quantitatively show
the effect of such transformations on typical design
objectives like performance, power dissipation, and
energy consumption. Next, by means of statistical
analysis, we collect useful data to predict the
effectiveness of a given compilation profile for a
specific application. Information gathered from such
analysis can be exploited to drastically reduce the
computational effort needed to perform the design space
exploration.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "design space exploration; energy; genetic algorithms;
hyperblock formation; ILP; multiobjective optimization;
performances; power; statistical analysis; VLIW
architectures",
}
@Article{Leverich:2008:CEM,
author = "Jacob Leverich and Hideho Arakida and Alex
Solomatnikov and Amin Firoozshahian and Mark Horowitz
and Christos Kozyrakis",
title = "Comparative evaluation of memory models for chip
multiprocessors",
journal = j-TACO,
volume = "5",
number = "3",
pages = "12:1--12:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455651",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "There are two competing models for the on-chip memory
in Chip Multiprocessor (CMP) systems: {\em
hardware-managed coherent caches\/} and {\em
software-managed streaming memory}. This paper performs
a direct comparison of the two models under the same
set of assumptions about technology, area, and
computational capabilities. The goal is to quantify how
and when they differ in terms of performance, energy
consumption, bandwidth requirements, and latency
tolerance for general-purpose CMPs. We demonstrate that
for data-parallel applications on systems with up to 16
cores, the cache-based and streaming models perform and
scale equally well. For certain applications with
little data reuse, streaming scales better due to
better bandwidth use and macroscopic software
prefetching. However, the introduction of techniques
such as hardware prefetching and nonallocating stores
to the cache-based model eliminates the streaming
advantage. Overall, our results indicate that there is
not sufficient advantage in building streaming memory
systems where all on-chip memory structures are
explicitly managed. On the other hand, we show that
streaming at the programming model level is
particularly beneficial, even with the cache-based
model, as it enhances locality and creates
opportunities for bandwidth optimizations. Moreover, we
observe that stream programming is actually easier with
the cache-based model because the hardware guarantees
correct, best-effort execution even when the programmer
cannot fully regularize an application's code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache coherence; Chip multiprocessors; locality
optimizations; parallel programming; streaming memory",
}
@Article{Sharkey:2008:RRP,
author = "Joseph J. Sharkey and Jason Loew and Dmitry V.
Ponomarev",
title = "Reducing register pressure in {SMT} processors through
{L2}-miss-driven early register release",
journal = j-TACO,
volume = "5",
number = "3",
pages = "13:1--13:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455652",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The register file is one of the most critical datapath
components limiting the number of threads that can be
supported on a simultaneous multithreading (SMT)
processor. To allow the use of smaller register files
without degrading performance, techniques that maximize
the efficiency of using registers through aggressive
register allocation/deallocation can be considered. In
this article, we propose a novel technique for early
deallocation of physical registers allocated to threads
that experience L2 cache misses. This is accomplished
by speculatively committing the load-independent
instructions and deallocating the registers
corresponding to the previous mappings of their
destinations, without waiting for the cache miss
request to be serviced. The early deallocated registers
are then made immediately available for allocation to
instructions within the same thread as well as within
other threads, thus improving the overall processor
throughput. On average, across the simulated mixes
of multiprogrammed SPEC 2000 workloads, our technique
results in 33\% improvement in throughput and 25\%
improvement in terms of harmonic mean of weighted IPCs
over the baseline SMT with the state-of-the-art DCRA
policy. This is achieved without creating checkpoints,
maintaining per-register counters of pending consumers,
performing tag rebroadcasts, register remappings,
and/or additional associative searches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "register file; Simultaneous multithreading",
}
@Article{Mehrara:2008:ESP,
author = "Mojtaba Mehrara and Todd Austin",
title = "Exploiting selective placement for low-cost memory
protection",
journal = j-TACO,
volume = "5",
number = "3",
pages = "14:1--14:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455653",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many embedded processing applications, such as those
found in the automotive or medical field, require
hardware designs that are at the same time low cost and
reliable. Traditionally, reliable memory systems have
been implemented using coded storage techniques, such
as ECC. While these designs can effectively detect and
correct memory faults such as transient errors and
single-bit defects, their use bears a significant cost
overhead. In this article, we propose a novel partial
memory protection scheme that provides high-coverage
fault protection for program code and data, but with
much lower cost than traditional approaches. Our
approach profiles program code and data usage to assess
which program elements are most critical to maintaining
program correctness. Critical code and variables are
then placed into limited protected storage resources.
To ensure high coverage of program elements, our
placement technique considers all program components
simultaneously, including code, global variables, stack
frames, and heap variables. The fault coverage of our
approach is gauged using Monte Carlo fault-injection
experiments, which confirm that our technique provides
high levels of fault protection (99\% coverage) with
limited memory protection resources (36\% protected
area).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "fault-tolerant design; memory system design; Partial
memory protection; selective placement; transient
faults",
}
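The placement step in the abstract above amounts to a budgeted selection
problem. A minimal sketch, assuming profiling has already produced a
criticality weight per program element; the element tuples below are
invented:

# Greedy knapsack over profiled program elements: protect the elements with
# the highest criticality per byte until the protected region is full.

def select_protected(elements, capacity):
    chosen, used = [], 0
    # elements: (name, size_in_bytes, profiled_criticality)
    for name, size, crit in sorted(elements, key=lambda e: e[2] / e[1],
                                   reverse=True):
        if used + size <= capacity:
            chosen.append(name)
            used += size
    return chosen, used

elements = [("isr_code", 512, 900), ("stack_main", 256, 400),
            ("heap_buf", 1024, 100), ("globals", 128, 300)]
print(select_protected(elements, capacity=1000))
# (['globals', 'isr_code', 'stack_main'], 896)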
@Article{Vandierendonck:2008:SRA,
author = "Hans Vandierendonck and Andr{\'e} Seznec",
title = "Speculative return address stack management
revisited",
journal = j-TACO,
volume = "5",
number = "3",
pages = "15:1--15:??",
month = nov,
year = "2008",
CODEN = "????",
DOI = "https://doi.org/10.1145/1455650.1455654",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 8 14:28:18 MST 2008",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Branch prediction feeds a speculative execution
processor core with instructions. Branch mispredictions
are inevitable and have negative effects on performance
and energy consumption. With the advent of highly
accurate conditional branch predictors, nonconditional
branch instructions are gaining importance.\par
In this article, we address the prediction of procedure
returns. On modern processors, procedure returns are
predicted through a return address stack (RAS). The
overwhelming majority of the return mispredictions are
due to RAS overflows and/or overwriting the top entries
of the RAS on a mispredicted path. These sources of
misprediction were addressed by previously proposed
speculative return address stacks [Jourdan et al. 1996;
Skadron et al. 1998]. However, the remaining
misprediction rate of these RAS designs is still
significant when compared to state-of-the-art
conditional predictors.\par
We present two low-cost corruption detectors for RAS
predictors. They detect RAS overflows and wrong path
corruption with 100\% coverage. As a consequence, when
such a corruption is detected, another source can be
used for predicting the return. On processors featuring
a branch target buffer (BTB), this BTB can be used as a
free backup predictor for predicting returns when
corruption is detected.\par
Our experiments show that our proposal can be used to
improve the behavior of all previously proposed
speculative RASs. For instance, without any specific
management of the speculative states on the RAS, an
8-entry BTB-backed-up RAS achieves the same performance
level as a state-of-the-art, but complex, 64-entry
self-checkpointing RAS [Jourdan et al. 1996].
Therefore, our proposal can be used either to improve
the performance of the processor or to reduce its
hardware complexity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "back-up predictor; corruption detection; Return
address prediction",
}
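The overflow-detection idea in the abstract can be shown with a toy return
address stack that falls back to a BTB-like table once it knows its own
contents are unusable. Structure and names are illustrative only, and the
wrong-path corruption detector is omitted:

class CheckedRAS:
    def __init__(self, size):
        self.size = size
        self.stack = []
        self.lost = 0              # entries discarded by overflow

    def on_call(self, return_addr):
        if len(self.stack) == self.size:
            self.stack.pop(0)      # overwrite the oldest entry...
            self.lost += 1         # ...and remember the corruption
        self.stack.append(return_addr)

    def predict_return(self, call_pc, btb):
        if self.stack:
            return self.stack.pop()    # RAS content is trustworthy
        if self.lost:
            self.lost -= 1
            return btb.get(call_pc)    # corruption detected: use backup
        return None

ras = CheckedRAS(size=2)
btb = {0x400: 0x1000}                  # last observed return target
for addr in (0x1000, 0x2000, 0x3000):  # three calls overflow a 2-entry RAS
    ras.on_call(addr)
print(hex(ras.predict_return(0x500, btb)))  # 0x3000 (from RAS)
print(hex(ras.predict_return(0x500, btb)))  # 0x2000 (from RAS)
print(hex(ras.predict_return(0x400, btb)))  # 0x1000 (overflowed: BTB backup)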
@Article{Chhabra:2009:MSP,
author = "Siddhartha Chhabra and Brian Rogers and Yan Solihin
and Milos Prvulovic",
title = "Making secure processors {OS}- and
performance-friendly",
journal = j-TACO,
volume = "5",
number = "4",
pages = "16:1--16:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498691",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In today's digital world, computer security issues
have become increasingly important. In particular,
researchers have proposed designs for secure processors
that utilize hardware-based memory encryption and
integrity verification to protect the privacy and
integrity of computation even from sophisticated
physical attacks. However, currently proposed schemes
remain hampered by problems that make them impractical
for use in today's computer systems: lack of virtual
memory and Inter-Process Communication support as well
as excessive storage and performance overheads. In this
article, we propose (1) address independent seed
encryption (AISE), a counter-mode-based memory
encryption scheme using a novel seed composition, and
(2) bonsai Merkle trees (BMT), a novel Merkle
tree-based memory integrity verification technique, to
eliminate these system and performance issues
associated with prior counter-mode memory encryption
and Merkle tree integrity verification schemes. We
present both a qualitative discussion and a
quantitative analysis to illustrate the advantages of
our techniques over previously proposed approaches in
terms of complexity, feasibility, performance, and
storage. Our results show that AISE+BMT reduces the
overhead of prior memory encryption and integrity
verification schemes from 12\% to 2\% on average for
single-threaded benchmarks on uniprocessor systems, and
from 15\% to 4\% for coscheduled benchmarks on
multicore systems while eliminating critical
system-level problems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "memory encryption; memory integrity verification;
Secure processor architectures; virtualization",
}
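Counter-mode memory encryption, which AISE refines, XORs each memory block
with a keystream pad computed from a seed, so pad generation can overlap the
memory access. A minimal sketch using SHA-256 as a stand-in PRF (real
designs use a block cipher such as AES); the seed layout below is invented
and illustrates only that the seed is built from a logical,
address-independent identifier plus a per-block counter:

import hashlib

def crypt_block(key, logical_id, counter, data):
    # ciphertext = plaintext XOR PRF(key, seed); applying it twice decrypts.
    seed = logical_id + counter.to_bytes(8, "big")
    pad = hashlib.sha256(key + seed).digest()[:len(data)]
    return bytes(d ^ p for d, p in zip(data, pad))

key = b"\x01" * 16
block = b"sixteen byte blk"
ct = crypt_block(key, b"page42+off3", 7, block)
assert crypt_block(key, b"page42+off3", 7, ct) == block  # round-trips
print(ct.hex())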
@Article{Jimenez:2009:GNB,
author = "Daniel A. Jim{\'e}nez",
title = "Generalizing neural branch prediction",
journal = j-TACO,
volume = "5",
number = "4",
pages = "17:1--17:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498692",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Improved branch prediction accuracy is essential to
sustaining instruction throughput with today's deep
pipelines. Traditional branch predictors exploit
correlations between pattern history and branch outcome
to predict branches, but there is a stronger and more
natural correlation between path history and branch
outcome. We explore the potential for exploiting this
correlation. We introduce {\em piecewise linear branch
prediction}, an idealized branch predictor that
develops a set of linear functions, one for each
program path to the branch to be predicted, that
separate predicted taken from predicted not taken
branches. Taken together, all of these linear functions
form a piecewise linear decision surface. We present a
limit study of this predictor showing its potential to
greatly improve predictor accuracy.\par
We then introduce a practical implementable branch
predictor based on piecewise linear branch prediction.
In making our predictor practical, we show how a
parameterized version of it unifies the previously
distinct concepts of perceptron prediction and
path-based neural prediction. Our new branch predictor
has implementation costs comparable to current
prominent predictors in the literature while
significantly improving accuracy. For a deeply
pipelined simulated microarchitecture our predictor
with a 256-KB hardware budget improves the harmonic
mean normalized instructions-per-cycle rate by 8\% over
both the original path-based neural predictor and
2Bc-{\em gskew}. The average misprediction rate is
decreased by 16\% over the path-based neural predictor
and by 22\% over 2Bc-{\em gskew}.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Branch prediction; machine learning",
}
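The predictor in the abstract keeps, for each branch, one weight per (path
address, history position) pair and sums the weights selected by the current
path; perceptron-style training adjusts them. A small dictionary-backed
sketch of that idea (table sizes, threshold, and indexing are simplified
relative to the paper):

import collections

class PiecewiseLinear:
    def __init__(self, hist_len=8):
        self.h = hist_len
        self.theta = 2 * hist_len + 14         # training threshold (ad hoc)
        self.w = collections.defaultdict(int)  # (branch, path_addr, pos) -> wt
        self.bias = collections.defaultdict(int)
        self.outcomes = [False] * hist_len     # global branch history
        self.path = [0] * hist_len             # addresses along the path

    def _sum(self, pc):
        y = self.bias[pc]
        for i in range(self.h):
            sign = 1 if self.outcomes[i] else -1
            y += sign * self.w[(pc, self.path[i], i)]
        return y

    def predict(self, pc):
        return self._sum(pc) >= 0

    def train(self, pc, taken):
        y = self._sum(pc)
        if (y >= 0) != taken or abs(y) <= self.theta:
            self.bias[pc] += 1 if taken else -1
            for i in range(self.h):
                agree = self.outcomes[i] == taken
                self.w[(pc, self.path[i], i)] += 1 if agree else -1
        self.outcomes = [taken] + self.outcomes[:-1]
        self.path = [pc] + self.path[:-1]

p = PiecewiseLinear()
hits = 0
for n in range(2000):                  # branch 0x40 alternates taken/not
    taken = n % 2 == 0
    hits += p.predict(0x40) == taken
    p.train(0x40, taken)
print(hits)                            # converges well above chance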
@Article{Jeon:2009:AAP,
author = "Jinseong Jeon and Keoncheol Shin and Hwansoo Han",
title = "Abstracting access patterns of dynamic memory using
regular expressions",
journal = j-TACO,
volume = "5",
number = "4",
pages = "18:1--18:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498693",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Unless the speed gap between CPU and memory
disappears, efficient memory usage remains a decisive
factor for performance. To optimize data usage of
programs in the presence of the memory hierarchy, we
are particularly interested in two compiler techniques:
{\em pool allocation\/} and {\em field layout
restructuring}. Since foreseeing runtime behaviors of
programs at compile time is difficult, most of the
previous work relied on profiling. On the contrary, our
goal is to develop a fully automatic compiler that
statically transforms input codes to use memory
efficiently. Noticing that {\em regular expressions},
which denote repetition explicitly, are sufficient for
memory access patterns, we describe how to extract
memory access patterns as regular expressions in
detail. Based on static patterns presented in regular
expressions, we apply pool allocation to repeatedly
accessed structures and exploit field layout
restructuring according to field affinity relations of
chosen structures. To make a scalable framework, we
devise and apply new abstraction techniques, which
build and interpret access patterns for whole
programs in a bottom-up fashion. We implement our
analyses and transformations with the CIL compiler. To
verify the effect and scalability of our scheme, we
examine 17 benchmarks including 2 SPECINT 2000
benchmarks whose source lines of code are larger than
10,000. Our experiments demonstrate that the static
layout transformations for dynamic memory can reduce
L1D cache misses by 16\% and execution times by 14\% on
average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Access patterns; field affinity; layout
transformation; pool allocation; regular expressions",
}
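Two ingredients from the abstract can be pictured compactly: collapsing a
field-access trace into a repetition-explicit (regex-like) form, and
deriving field affinities from co-occurrence to guide layout restructuring.
Both functions below are simplifications; the paper abstracts patterns
interprocedurally and bottom-up:

import collections

def abstract_pattern(trace):
    # Collapse adjacent repeats: ['x','x','x','y'] -> 'x^3 y'
    runs = []
    for tok in trace:
        if runs and runs[-1][0] == tok:
            runs[-1][1] += 1
        else:
            runs.append([tok, 1])
    return " ".join(t if n == 1 else "%s^%d" % (t, n) for t, n in runs)

def field_affinity(trace, window=2):
    # Count how often two fields are touched within `window` accesses.
    pairs = collections.Counter()
    for i in range(len(trace) - window + 1):
        seen = set(trace[i:i + window])
        for a in seen:
            for b in seen:
                if a < b:
                    pairs[(a, b)] += 1
    return pairs

trace = ["hdr", "len", "len", "len", "data", "hdr", "len", "data"]
print(abstract_pattern(trace))               # hdr len^3 data hdr len data
print(field_affinity(trace).most_common(1))  # [(('hdr', 'len'), 2)]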
@Article{Shobaki:2009:OTS,
author = "Ghassan Shobaki and Kent Wilken and Mark Heffernan",
title = "Optimal trace scheduling using enumeration",
journal = j-TACO,
volume = "5",
number = "4",
pages = "19:1--19:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1498690.1498694",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Mar 18 21:35:33 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents the first optimal algorithm for
trace scheduling. The trace is a global scheduling
region used by compilers to exploit instruction-level
parallelism across basic block boundaries. Several
heuristic techniques have been proposed for trace
scheduling, but the precision of these techniques has
not been studied relative to optimality. This article
describes a technique for finding provably optimal
trace schedules, where optimality is defined in terms
of a weighted sum of schedule lengths across all code
paths in a trace. The optimal algorithm uses
branch-and-bound enumeration to efficiently explore the
entire solution space. Experimental evaluation of the
algorithm shows that, with a time limit of 1 second per
problem, 91\% of the hard trace scheduling problems in
the SPEC CPU 2006 Integer Benchmarks are solved
optimally. For 58\% of these hard problems, the optimal
schedule is improved compared to that produced by a
heuristic scheduler with a geometric mean improvement
of 3.2\% in weighted schedule length and 18\% in
compensation code size.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "branch-and-bound enumeration; compiler optimizations;
global instruction scheduling; Instruction scheduling;
instruction-level parallelism; optimal instruction
scheduling; trace scheduling",
}
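The enumeration core of the approach can be sketched as a branch-and-bound
search over topological schedules. This toy version minimizes the length of
a single block on a single-issue machine with a naive lower bound; the
paper's formulation instead weights schedule lengths over all code paths in
a trace:

def bb_schedule(nodes, preds, latency):
    best = {"len": float("inf"), "order": None}

    def search(order, finish, cycle):
        if len(order) == len(nodes):
            length = max(finish.values())
            if length < best["len"]:
                best["len"], best["order"] = length, order
            return
        remaining = len(nodes) - len(order)
        bound = max(max(finish.values(), default=0), cycle + remaining)
        if bound >= best["len"]:
            return                    # prune: cannot beat incumbent
        for n in nodes:
            if n in finish or not all(p in finish for p in preds[n]):
                continue
            start = max([cycle] + [finish[p] for p in preds[n]])
            search(order + [n], dict(finish, **{n: start + latency[n]}),
                   start + 1)

    search([], {}, 0)
    return best

#     a -> c,  b -> c,  c -> d
preds = {"a": [], "b": [], "c": ["a", "b"], "d": ["c"]}
latency = {"a": 3, "b": 1, "c": 1, "d": 1}
print(bb_schedule(list(preds), preds, latency))
# {'len': 5, 'order': ['a', 'b', 'c', 'd']}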
@Article{Kulkarni:2009:PEO,
author = "Prasad A. Kulkarni and David B. Whalley and Gary S.
Tyson and Jack W. Davidson",
title = "Practical exhaustive optimization phase order
exploration and evaluation",
journal = j-TACO,
volume = "6",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509865",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Choosing the most appropriate optimization phase
ordering has been a long-standing problem in compiler
optimizations. Exhaustive evaluation of all possible
orderings of optimization phases for each function is
generally dismissed as infeasible for
production-quality compilers targeting accepted
benchmarks. In this article, we show that it is
possible to exhaustively evaluate the optimization
phase order space for each function in a reasonable
amount of time for most of the functions in our
benchmark suite. To achieve this goal, we used various
techniques to significantly prune the optimization
phase order search space so that it can be
inexpensively enumerated in most cases and reduce the
number of program simulations required to evaluate
program performance for each distinct phase ordering.
The techniques described are applicable to other
compilers in which it is desirable to find the best
phase ordering for most functions in a reasonable
amount of time. We also describe some interesting
properties of the optimization phase order space, which
will prove useful for further studies of related
problems in compilers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "exhaustive search; iterative compilation; Phase
ordering",
}
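The space-pruning idea in the abstract can be sketched as a breadth-first
enumeration that merges orderings yielding identical code (same fingerprint)
and drops phase applications that change nothing. The toy "phases" below
operate on a tuple standing in for a function body:

def enumerate_space(start, phases):
    seen = {start}            # distinct function instances
    frontier = [start]
    while frontier:
        nxt = []
        for code in frontier:
            for phase in phases:
                new = phase(code)
                if new != code and new not in seen:  # prune no-op edges
                    seen.add(new)
                    nxt.append(new)
        frontier = nxt
    return seen

phases = [
    lambda c: tuple(sorted(c)),                # "scheduling"
    lambda c: tuple(dict.fromkeys(c)),         # "redundancy elimination"
    lambda c: tuple(x for x in c if x != 0),   # "dead code elimination"
]
space = enumerate_space((3, 0, 3, 1), phases)
print(len(space), "distinct function instances")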
@Article{Hohenauer:2009:SOF,
author = "Manuel Hohenauer and Felix Engel and Rainer Leupers
and Gerd Ascheid and Heinrich Meyr",
title = "A {SIMD} optimization framework for retargetable
compilers",
journal = j-TACO,
volume = "6",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509866",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Retargetable C compilers are currently widely used to
quickly obtain compiler support for new embedded
processors and to perform early processor architecture
exploration. A partially inherent problem of the
retargetable compilation approach, though, is the
limited code quality as compared to hand-written
compilers or assembly code due to the lack of dedicated
optimization techniques. This problem can be
circumvented by designing flexible, retargetable code
optimization techniques that apply to a certain range
of target architectures. This article focuses on target
machines with SIMD instruction support, a common
feature in embedded processors for multimedia
applications. However, SIMD optimization is known to be
a difficult task since SIMD architectures are largely
nonuniform, support only a limited set of data types
and impose several memory alignment constraints.
Additionally, such techniques require complicated loop
transformations, which are tailored to the SIMD
architecture in order to exhibit the necessary amount
of parallelism in the code. Thus, integrating the SIMD
optimization {\em and\/} the required loop
transformations together in a single retargeting
formalism is an ambitious challenge. In this article,
we present an efficient and quickly retargetable SIMD
code optimization framework that is integrated into an
industrial retargetable C compiler. Experimental
results for different processors demonstrate that the
proposed technique applies to real-life target machines
and that it produces code quality improvements close to
the theoretical limit.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "ASIP; retargetable compilers; SIMD; subword
parallelism; vectorization",
}
@Article{Eyerman:2009:MLP,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Memory-level parallelism aware fetch policies for
simultaneous multithreading processors",
journal = j-TACO,
volume = "6",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509867",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A thread executing on a simultaneous multithreading
(SMT) processor that experiences a long-latency load
will eventually stall while holding execution
resources. Existing long-latency load aware SMT fetch
policies limit the amount of resources allocated by a
stalled thread by identifying long-latency loads and
preventing the thread from fetching more instructions
--- and in some implementations, instructions beyond
the long-latency load are flushed to release allocated
resources.\par
This article proposes an SMT fetch policy that takes
into account the available memory-level parallelism
(MLP) in a thread. The key idea proposed in this
article is that in case of an isolated long-latency
load (i.e., there is no MLP), the thread should be
prevented from allocating additional resources.
However, in case multiple independent long-latency
loads overlap (i.e., there is MLP), the thread should
allocate as many resources as needed in order to fully
expose the available MLP. MLP-aware fetch policies
achieve better performance for MLP-intensive threads on
SMT processors, leading to higher overall system
throughput and shorter average turnaround time than
previously proposed fetch policies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Fetch Policy; Memory-Level Parallelism (MLP);
Simultaneous Multithreading (SMT)",
}
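The policy's core decision can be written as a small gating rule. A sketch
assuming a per-thread MLP hint is already available (how that prediction is
obtained is the substance of the paper and is not modeled here); the
constants are invented:

FETCH_QUOTA = 64           # default instructions a thread may fetch per epoch
MISS_SPACING = 16          # rough instruction distance between usable misses

def fetch_budget(blocked_on_l2, predicted_mlp):
    if not blocked_on_l2:
        return FETCH_QUOTA
    if predicted_mlp <= 1:
        return 0                     # isolated miss: stall the thread
    # enough fetch room to expose all overlapping misses, then stop
    return min(FETCH_QUOTA, predicted_mlp * MISS_SPACING)

print(fetch_budget(False, 0))   # 64: not stalled
print(fetch_budget(True, 1))    # 0: isolated long-latency load
print(fetch_budget(True, 3))    # 48: expose the available MLP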
@Article{Strozek:2009:EAE,
author = "Lukasz Strozek and David Brooks",
title = "Energy- and area-efficient architectures through
application clustering and architectural
heterogeneity",
journal = j-TACO,
volume = "6",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1509864.1509868",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu May 7 14:55:25 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Customizing architectures for particular applications
is a promising approach to yield highly
energy-efficient designs for embedded systems. This
work explores the benefits of architectural
customization for a class of embedded architectures
typically used in energy- and area-constrained
application domains, such as sensor nodes and
multimedia processing. We implement a process flow that
performs an automatic synthesis and evaluation of the
different architectures based on runtime profiles of
applications and determines an efficient architecture,
with consideration for both energy and area
constraints. An expressive architectural model, used by
our engine, is introduced that takes advantage of
efficient opcode allocation, several memory addressing
modes, and operand types. By profiling embedded
benchmarks from a variety of sensor and multimedia
applications, we show that the energy savings resulting
from various architectural optimizations relative to
the base architectures (e.g., MIPS and MSP430) are
significant and can reach 50\%, depending on the
application. We then identify the set of architectures
that achieves near-optimal savings for a group of
applications. Finally, we propose the use of
heterogeneous ISA processors implementing those
architectures as a solution to capitalize on energy
savings provided by application customization while
executing a range of applications efficiently.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Efficient custom architectures; heterogeneous ISA
processors",
}
@Article{Venkataramani:2009:MAM,
author = "Guru Venkataramani and Ioannis Doudalis and Yan
Solihin and Milos Prvulovic",
title = "{MemTracker}: {An} accelerator for memory debugging
and monitoring",
journal = j-TACO,
volume = "6",
number = "2",
pages = "5:1--5:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543754",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory bugs are a broad class of bugs that is becoming
increasingly common with increasing software
complexity, and many of these bugs are also security
vulnerabilities. Existing software and hardware
approaches for finding and identifying memory bugs have
a number of drawbacks including considerable
performance overheads, target only a specific type of
bug, implementation cost, and inefficient use of
computational resources.\par
This article describes MemTracker, a new hardware
support mechanism that can be configured to perform
different kinds of memory access monitoring tasks.
MemTracker associates each word of data in memory with
a few bits of state, and uses a programmable state
transition table to react to different events that can
affect this state. The number of state bits per word,
the events to which MemTracker reacts, and the
transition table are all fully programmable.
MemTracker's rich set of states, events, and
transitions can be used to implement different
monitoring and debugging checkers with minimal
performance overheads, even when frequent state updates
are needed. To evaluate MemTracker, we map three
different checkers onto it, as well as a checker that
combines all three. For the most demanding (combined)
checker with 8 bits of state per memory word, we observe
performance overheads of only around 3\%, on average,
and 14.5\% worst-case across different benchmark
suites. Such low overheads allow continuous (always-on)
use of MemTracker-enabled checkers, even in production
runs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Accelerator; debugging; memory access monitoring",
}
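The programmable state machine at MemTracker's heart is easy to picture in
software. Below, a toy transition table implements an
allocation/initialization checker; the states and events are invented for
illustration, and in hardware the state bits live alongside each memory
word:

import collections

# (state, event) -> (next_state, is_bug)
TABLE = {
    ("unalloc", "alloc"): ("uninit",  False),
    ("uninit",  "write"): ("init",    False),
    ("init",    "write"): ("init",    False),
    ("init",    "read"):  ("init",    False),
    ("uninit",  "read"):  ("uninit",  True),   # read before write
    ("unalloc", "read"):  ("unalloc", True),   # access to unallocated word
    ("unalloc", "write"): ("unalloc", True),
    ("init",    "free"):  ("unalloc", False),
    ("uninit",  "free"):  ("unalloc", False),
    ("unalloc", "free"):  ("unalloc", True),   # double free
}

class MemTracker:
    def __init__(self, table):
        self.table = table
        self.state = collections.defaultdict(lambda: "unalloc")

    def event(self, addr, ev):
        nxt, bug = self.table[(self.state[addr], ev)]
        self.state[addr] = nxt
        if bug:
            print("checker exception: %s on word %#x" % (ev, addr))

mt = MemTracker(TABLE)
mt.event(0x100, "alloc")
mt.event(0x100, "read")    # flagged: read-before-write
mt.event(0x100, "write")
mt.event(0x100, "read")    # fine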
@Article{Gabor:2009:SLA,
author = "Ron Gabor and Avi Mendelson and Shlomo Weiss",
title = "Service level agreement for multithreaded processors",
journal = j-TACO,
volume = "6",
number = "2",
pages = "6:1--6:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543755",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multithreading is widely used to increase processor
throughput. As the number of shared resources increase,
managing them while guaranteeing predicted performance
becomes a major problem. Attempts have been made in
previous work to ease this via different fairness
mechanisms. In this article, we present a new approach
to control the resource allocation and sharing via a
service level agreement (SLA)-based mechanism; that is,
via an agreement in which multithreaded processors
guarantee a minimal level of service to the running
threads. We introduce a new metric, {\em C\/}$_{SLA}$,
for conformance to SLA in multithreaded processors and
show that controlling resources using SLA allows
for higher gains than are achievable by previously
suggested fairness techniques. It also permits
improving one metric (e.g., power) while maintaining
SLA in another (e.g., performance). We compare SLA
enforcement to schemes based on other fairness metrics,
which are mostly targeted at equalizing execution
parameters. We show that using SLA rather than fairness
based algorithms provides a range of acceptable
execution points from which we can select the point
that best fits our optimization target, such as
maximizing the weighted speedup (sum of the speedups of
the individual threads) or reducing power. We
demonstrate the effectiveness of the new SLA approach
using switch-on-event (coarse-grained) multithreading.
Our weighted speedup improvement scheme successfully
enforces SLA while improving the weighted speedup by an
average of 10\% for unbalanced threads. This result is
significant when compared with performance losses that
may be incurred by fairness enforcement methods. When
optimizing for power reduction in unbalanced threads
SLA enforcement reduces the power by an average of
15\%. SLA may be complemented by other power reduction
methods to achieve further power savings {\em and\/}
maintain the same service level for the threads. We
also demonstrate differentiated SLA, where weighted
speedup is maximized while each thread may have a
different throughput constraint.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "fairness; performance; power; Service level agreement;
throughput",
}
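The conformance question the abstract raises reduces to checking each
thread's delivered service against its guaranteed minimum. A deliberately
simple reading; the paper's C_SLA metric is more refined than this ratio
test:

def meets_sla(threads):
    # Each thread: (name, ipc_alone, ipc_multithreaded, guaranteed_fraction)
    return all(ipc_mt / ipc_alone >= floor
               for _, ipc_alone, ipc_mt, floor in threads)

threads = [("t0", 2.0, 1.2, 0.5),    # gets 60% of its stand-alone speed
           ("t1", 1.0, 0.45, 0.4)]   # gets 45%, floor is 40%
print(meets_sla(threads))            # True: both guarantees hold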
@Article{Fung:2009:DWF,
author = "Wilson W. L. Fung and Ivan Sham and George Yuan and
Tor M. Aamodt",
title = "Dynamic warp formation: {Efficient MIMD} control flow
on {SIMD} graphics hardware",
journal = j-TACO,
volume = "6",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543756",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent advances in graphics processing units (GPUs)
have resulted in massively parallel hardware that is
easily programmable and widely available in today's
desktop and notebook computer systems. GPUs typically
use single-instruction, multiple-data (SIMD) pipelines
to achieve high performance with minimal overhead for
control hardware. Scalar threads running the same
computing kernel are grouped together into SIMD
batches, sometimes referred to as warps. While SIMD is
ideally suited for simple programs, recent GPUs include
control flow instructions in the GPU instruction set
architecture and programs using these instructions may
experience reduced performance due to the way branch
execution is supported in hardware. One solution is to
add a stack to allow different SIMD processing elements
to execute distinct program paths after a branch
instruction. The occurrence of diverging branch
outcomes for different processing elements
significantly degrades performance using this approach.
In this article, we propose dynamic warp formation and
scheduling, a mechanism for more efficient SIMD branch
execution on GPUs. It dynamically regroups threads into
new warps on the fly following the occurrence of
diverging branch outcomes. We show that a realistic
hardware implementation of this mechanism improves
performance by 13\%, on average, with 256 threads per
core, 24\% with 512 threads, and 47\% with 768 threads
for an estimated area increase of 8\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "control flow; fine-grained multithreading; GPU; SIMD",
}
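The regrouping step itself is simple to sketch: after a divergent branch,
threads are binned by their next PC and packed into full warps. Real dynamic
warp formation must also respect each thread's home SIMD lane to avoid
register-file port conflicts, which this toy ignores:

def form_warps(threads, warp_width):
    by_pc = {}
    for tid, pc in threads:            # threads: (thread_id, next_pc)
        by_pc.setdefault(pc, []).append(tid)
    warps = []
    for pc in sorted(by_pc):
        tids = by_pc[pc]
        for i in range(0, len(tids), warp_width):
            warps.append((hex(pc), tids[i:i + warp_width]))
    return warps

# eight threads diverge: five take the branch (PC 0xA0), three fall through
threads = [(0, 0xA0), (1, 0xB0), (2, 0xA0), (3, 0xA0),
           (4, 0xB0), (5, 0xA0), (6, 0xB0), (7, 0xA0)]
print(form_warps(threads, warp_width=4))
# [('0xa0', [0, 2, 3, 5]), ('0xa0', [7]), ('0xb0', [1, 4, 6])]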
@Article{Koh:2009:TPV,
author = "Cheng-Kok Koh and Weng-Fai Wong and Yiran Chen and Hai
Li",
title = "Tolerating process variations in large,
set-associative caches: {The} buddy cache",
journal = j-TACO,
volume = "6",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1543753.1543757",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jul 2 12:32:04 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "One important trend in today's microprocessor
architectures is the increase in size of the processor
caches. These caches also tend to be set associative.
As technology scales, process variations are expected
to increase the fault rates of the SRAM cells that
compose such caches. As an important component of the
processor, the parametric yield of SRAM cells is
crucial to the overall performance and yield of the
microchip. In this article, we propose a
microarchitectural solution, called the buddy cache,
that permits large, set-associative caches to tolerate
faults in SRAM cells due to process variations. In
essence, instead of disabling a faulty cache block in a
set (as is the current practice), it is paired with
another faulty cache block in the same set --- the
buddy. Although both cache blocks are faulty, if the
faults of the two blocks do not overlap, then instead
of losing two blocks, buddying will yield a functional
block from the nonfaulty portions of the two blocks. We
found that with buddying, caches can better mitigate
the negative impacts of process variations on
performance and yield, gracefully downgrading
performance as opposed to catastrophic failure. We will
describe the details of the buddy cache and give
insights as to why it is more resilient to faults in
both performance and yield.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "caches; fault recovery; memory structures; Processor
architectures",
}
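The pairing rule is the essence: two faulty blocks in a set can be buddied
exactly when their fault bitmaps are disjoint, so every byte can be served
by at least one of the pair. A greedy sketch over invented fault masks:

def pair_buddies(fault_masks):
    # fault_masks: block name -> bitmask of faulty byte positions
    todo = sorted(fault_masks)
    pairs, unusable = [], []
    while todo:
        a = todo.pop(0)
        buddy = next((b for b in todo
                      if fault_masks[a] & fault_masks[b] == 0), None)
        if buddy is None:
            unusable.append(a)
        else:
            todo.remove(buddy)
            pairs.append((a, buddy))
    return pairs, unusable

masks = {"blk0": 0b0011, "blk1": 0b1100, "blk2": 0b0110, "blk3": 0b0001}
print(pair_buddies(masks))
# ([('blk0', 'blk1'), ('blk2', 'blk3')], [])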
@Article{Li:2009:CDS,
author = "Lian Li and Hui Feng and Jingling Xue",
title = "Compiler-directed scratchpad memory management via
graph coloring",
journal = j-TACO,
volume = "6",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582711",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Scratchpad memory (SPM), a fast on-chip SRAM managed
by software, is widely used in embedded systems. This
article introduces a general-purpose compiler approach,
called memory coloring, to assign static data
aggregates, such as arrays and structs, in a program to
an SPM. The novelty of this approach lies in
partitioning the SPM into a pseudo--register file (with
interchangeable and aliased registers), splitting the
live ranges of data aggregates to create potential data
transfer statements between SPM and off-chip memory,
and finally, adapting an existing graph coloring
algorithm for register allocation to assign the data
aggregates to the pseudo--register file. Our
experimental results using a set of 10 C benchmarks
from MediaBench and MiBench show that our methodology
is capable of managing SPMs efficiently and effectively
for large embedded applications. In addition, our SPM
allocator can obtain close to optimal solutions when
evaluated and compared against an existing
heuristics-based SPM allocator and an ILP-based SPM
allocator.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "graph coloring; live range splitting; memory
allocation; memory coloring; register coalescing;
Scratchpad memory; software-managed cache",
}
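The memory-coloring analogy can be made concrete with a tiny greedy colorer:
SPM slots play the role of interchangeable pseudo-registers, interfering
(simultaneously live) aggregates must get different slots, and anything left
uncolored stays off-chip. The example data are invented; the paper
additionally splits live ranges and handles aliased registers of different
sizes:

def memory_color(aggregates, interferes, slots):
    # aggregates: name -> access frequency; interferes: name -> set of names
    placement = {}
    for agg in sorted(aggregates, key=aggregates.get, reverse=True):
        taken = {placement[n] for n in interferes.get(agg, set())
                 if n in placement}
        free = [s for s in slots if s not in taken]
        if free:
            placement[agg] = free[0]   # goes to SPM
    return placement                   # missing names stay in DRAM

aggregates = {"A": 900, "B": 700, "C": 400, "D": 100}
interferes = {"A": {"B"}, "B": {"A", "C"}, "C": {"B"}, "D": {"A"}}
print(memory_color(aggregates, interferes, slots=["spm0", "spm1"]))
# {'A': 'spm0', 'B': 'spm1', 'C': 'spm0', 'D': 'spm1'}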
@Article{Golander:2009:CAR,
author = "Amit Golander and Shlomo Weiss",
title = "Checkpoint allocation and release",
journal = j-TACO,
volume = "6",
number = "3",
pages = "10:1--10:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582712",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Out-of-order speculative processors need a bookkeeping
method to recover from incorrect speculation. In recent
years, several microarchitectures that employ
checkpoints have been proposed, either extending the
reorder buffer or entirely replacing it. This work
presents an in-depth study of checkpointing in
checkpoint-based microarchitectures, from the desired
content of a checkpoint, via implementation trade-offs,
and to checkpoint allocation and release policies. A
major contribution of the article is a novel adaptive
checkpoint allocation policy that outperforms known
policies. The adaptive policy controls checkpoint
allocation according to dynamic events, such as
second-level cache misses and rollback history. It
achieves 6.8\% and 2.2\% speedup for the integer and
floating point benchmarks, respectively, and does not
require a branch confidence estimator. The results show
that the proposed adaptive policy achieves most of the
potential of an oracle policy whose performance
improvement is 9.8\% and 3.9\% for the integer and
floating point benchmarks, respectively. We exploit
known techniques for saving leakage power by adapting
and applying them to checkpoint-based
microarchitectures. The proposed applications combine
to reduce the leakage power of the register file to
about one half of its original value.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Checkpoint; early register release; leakage;
misprediction; out-of-order execution; rollback",
}
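The adaptive allocation decision described in the abstract keys off dynamic
events rather than a branch confidence estimator. A toy rendering of that
flavor; the thresholds and the exact event set are invented:

ROLLBACK_THRESHOLD = 2     # branches that repeatedly cause rollbacks

def allocate_checkpoint(branch_pc, l2_miss_pending, rollback_history):
    if l2_miss_pending:
        return True        # long speculation epoch ahead: checkpoint now
    return rollback_history.get(branch_pc, 0) >= ROLLBACK_THRESHOLD

history = {0x400: 3, 0x500: 1}
print(allocate_checkpoint(0x400, False, history))  # True: rollback-prone
print(allocate_checkpoint(0x500, False, history))  # False
print(allocate_checkpoint(0x500, True, history))   # True: L2 miss pending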
@Article{Xu:2009:TXP,
author = "Weifeng Xu and Russell Tessier",
title = "{Tetris-XL}: a performance-driven spill reduction
technique for embedded {VLIW} processors",
journal = j-TACO,
volume = "6",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582713",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As technology has advanced, the application space of
Very Long Instruction Word (VLIW) processors has grown
to include a variety of embedded platforms. Due to cost
and power consumption constraints, many embedded VLIW
processors contain limited resources, including
registers. As a result, a VLIW compiler that maximizes
instruction level parallelism (ILP) without considering
register constraints may generate excessive register
spills, leading to reduced overall system performance.
To address this issue, this article presents a new
spill reduction technique that improves VLIW runtime
performance by reordering operations prior to register
allocation and instruction scheduling. Unlike earlier
algorithms, our approach explicitly considers both
register reduction and data dependency in performing
operation reordering. Data dependency control limits
unexpected schedule length increases during subsequent
instruction scheduling. Our technique has been
implemented using Trimaran, an academic VLIW compiler,
and evaluated using a set of embedded systems
benchmarks. Experimental results show that, on average,
this technique improves VLIW performance by 10\% for
VLIW processors with 32 registers and 8 functional
units compared with previous spill reduction
techniques. Limited improvement is seen versus prior
approaches for VLIW processors with 64 registers and 8
functional units.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "instruction level parallelism; Register pressure; Very
Long Instruction Word (VLIW) processor",
}
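The reordering pre-pass can be sketched as a greedy walk over
dependence-ready operations that prefers those freeing more values than they
define, which is what lowers the pressure seen by the register allocator.
The paper's algorithm additionally bounds how far an operation may move so
the later instruction scheduler is not hurt; that control is omitted here:

def reorder(preds, kills, gens):
    # preds: op -> set of ops that must run first
    # kills: operands whose last use this op is; gens: values it defines
    done, order = set(), []
    while len(order) < len(preds):
        ready = [o for o in preds if o not in done and preds[o] <= done]
        op = min(ready, key=lambda o: gens[o] - kills[o])  # biggest net free
        order.append(op)
        done.add(op)
    return order

preds = {"load1": set(), "load2": set(), "add": {"load1", "load2"},
         "store": {"add"}}
kills = {"load1": 0, "load2": 0, "add": 2, "store": 1}
gens  = {"load1": 1, "load2": 1, "add": 1, "store": 0}
print(reorder(preds, kills, gens))  # ['load1', 'load2', 'add', 'store']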
@Article{Jones:2009:ELE,
author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
title = "Exploring the limits of early register release:
{Exploiting} compiler analysis",
journal = j-TACO,
volume = "6",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2009",
CODEN = "????",
DOI = "https://doi.org/10.1145/1582710.1582714",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Oct 1 09:20:47 MDT 2009",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Register pressure in modern superscalar processors can
be reduced by releasing registers early and by copying
their contents to cheap back-up storage. This article
quantifies the potential benefits of register occupancy
reduction and shows that existing hardware-based
schemes typically achieve only a small fraction of this
potential. This is because they are unable to
accurately determine the last use of a register and
must wait until the redefining instruction enters the
pipeline. On the other hand, compilers have a global
view of the program and, using simple dataflow
analysis, can determine the last use. This article
evaluates the extent to which compiler analysis can aid
early releasing, explores the design space, and
introduces commit and issue-based early releasing
schemes, quantifying their benefits. Using simple
compiler analysis and microarchitecture changes, we
achieve 70\% of the potential register file occupancy
reduction. By adding more hardware support, we can
increase this to 94\%. Our schemes are compared to
state-of-the-art approaches for varying register file
sizes and are shown to outperform these existing
techniques.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "compiler; energy efficiency; Low-power design;
microarchitecture; register file",
}
@Article{Jones:2009:EER,
author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume
Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin",
title = "Energy-efficient register caching with compiler
assistance",
journal = j-TACO,
volume = "6",
number = "4",
pages = "13:1--13:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2009:TUC,
author = "Weijia Li and Youtao Zhang and Jun Yang and Jiang
Zheng",
title = "Towards update-conscious compilation for
energy-efficient code dissemination in {WSNs}",
journal = j-TACO,
volume = "6",
number = "4",
pages = "14:1--14:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wegiel:2009:SRC,
author = "Michal Wegiel and Chandra Krintz",
title = "The single-referent collector: {Optimizing} compaction
for the common case",
journal = j-TACO,
volume = "6",
number = "4",
pages = "15:1--15:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Subramaniam:2009:DOS,
author = "Samantika Subramaniam and Gabriel H. Loh",
title = "Design and optimization of the store vectors memory
dependence predictor",
journal = j-TACO,
volume = "6",
number = "4",
pages = "16:1--16:??",
month = oct,
year = "2009",
CODEN = "????",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Mar 15 18:49:43 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2010:PAM,
author = "Xiaohang Wang and Mei Yang and Yingtao Jiang and Peng
Liu",
title = "A power-aware mapping approach to map {IP} cores onto
{NoCs} under bandwidth and latency constraints",
journal = j-TACO,
volume = "7",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736066",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we investigate the Intellectual
Property (IP) mapping problem that maps a given set of
IP cores onto the tiles of a mesh-based Network-on-Chip
(NoC) architecture such that the power consumption due
to intercore communications is minimized. This IP
mapping problem is considered under both bandwidth and
latency constraints as imposed by the applications and
the on-chip network infrastructure. By examining
various applications' communication characteristics
extracted from their respective communication trace
graphs, two distinguishable connectivity templates are
realized: the graphs with tightly coupled vertices and
those with distributed vertices. These two templates
are formally defined in this article, and different
mapping heuristics are subsequently developed to map
them. In general, tightly coupled vertices are mapped
onto tiles that are physically close to each other
while the distributed vertices are mapped following a
graph partition scheme. Experimental results on both
random and multimedia benchmarks have confirmed that
the proposed template-based mapping algorithm achieves
an average of 15\% power savings as compared with MOCA,
a fast greedy-based mapping algorithm. Compared with a
branch-and-bound--based mapping algorithm, which
produces near optimal results but incurs an extremely
high computation cost, the proposed algorithm, due to
its polynomial runtime complexity, can generate
results of almost the same quality with much less CPU
time. As the on-chip network size increases, the
superiority of the proposed algorithm becomes more
evident.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "bandwidth and latency constraints; IP mapping; Low
power; network-on-chip (NoC)",
}
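The tightly-coupled-template heuristic can be approximated in a few lines:
take core pairs in decreasing communication volume and put each unplaced
core on the free tile nearest its already-placed partner, which drives down
the sum of volume times hop count. The traffic numbers below are invented,
and the real algorithm also handles the distributed template via graph
partitioning:

def greedy_map(traffic, width, height):
    # traffic: (coreA, coreB) -> communication volume
    free = [(x, y) for y in range(height) for x in range(width)]
    placed = {}
    for (a, b), _ in sorted(traffic.items(), key=lambda kv: -kv[1]):
        for core, partner in ((a, b), (b, a)):
            if core in placed:
                continue
            if partner in placed:
                px, py = placed[partner]
                tile = min(free, key=lambda t: abs(t[0]-px) + abs(t[1]-py))
            else:
                tile = free[0]        # seed the first core anywhere
            placed[core] = tile
            free.remove(tile)
    return placed

def cost(traffic, placed):            # sum(volume * Manhattan hops)
    return sum(v * (abs(placed[a][0]-placed[b][0]) +
                    abs(placed[a][1]-placed[b][1]))
               for (a, b), v in traffic.items())

traffic = {("cpu", "l2"): 80, ("l2", "dram"): 40, ("cpu", "dsp"): 10}
placed = greedy_map(traffic, 2, 2)
print(placed, cost(traffic, placed))  # hop-weighted cost: 130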
@Article{Chen:2010:HSF,
author = "Zhong-Ho Chen and Alvin W. Y. Su",
title = "A hardware\slash software framework for instruction
and data scratchpad memory allocation",
journal = j-TACO,
volume = "7",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736067",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Previous research shows that a scratchpad memory
device consumes less energy than a cache device with
the same capacity. In this article, we locate the
scratchpad memory (SPM) in the top level of the memory
hierarchy to reduce the energy consumption. To take the
advantage of a SPM, we address two issues of utilizing
a SPM. First, the program's locality should be
improved. The second issue is SPM management. To tackle
these two issues, we present a hardware/software
framework for dynamically allocating both instructions
and data in SPM. The software flow could be divided
into three phases: locality improving, locality
extraction, and runtime SPM management. Without
modifying the original compiler and the source code, we
improve the locality of a program. An optimization
algorithm is proposed to extract the SPM allocations.
At runtime, an SPM management program is employed. In
hardware, an address translation logic (ATL) is
proposed to reduce the overhead of SPM
management.\par
The results show that the proposed framework can reduce
energy delay product (EDP) by 63\%, on average, when
compared with the traditional cache architecture. The
reduction in EDP is contributed by properly allocating
both instructions and data in SPM. By allocating only
instructions in SPM, the EDPs are reduced by 45\%, on
average. By allocating only data in SPM, the EDPs are
reduced by 14\%, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "allocation algorithm; Memory allocation; scratchpad
memory",
}
@Article{Woo:2010:CVI,
author = "Dong Hyuk Woo and Joshua B. Fryman and Allan D. Knies
and Hsien-Hsin S. Lee",
title = "{Chameleon}: {Virtualizing} idle acceleration cores of
a heterogeneous multicore processor for caching and
prefetching",
journal = j-TACO,
volume = "7",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736068",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Heterogeneous multicore processors have emerged as an
energy- and area-efficient architectural solution to
improving performance for domain-specific applications
such as those with a plethora of data-level
parallelism. These processors typically contain a large
number of small, compute-centric cores for acceleration
while keeping one or two high-performance ILP cores on
the die to guarantee single-thread performance.
Although a major portion of the transistors is
occupied by the acceleration cores, these resources
will sit idle when running unparallelized legacy codes
or the sequential part of an application. To address
this underutilization issue, in this article, we
introduce Chameleon, a flexible heterogeneous multicore
architecture to virtualize these resources for
enhancing memory performance when running sequential
programs. The Chameleon architecture can dynamically
virtualize the idle acceleration cores into a
last-level cache, a data prefetcher, or a hybrid
between these two techniques. In addition, Chameleon
can operate in an adaptive mode that dynamically
configures the acceleration cores between the hybrid
mode and the prefetch-only mode by monitoring the
effectiveness of the Chameleon cache mode. In our
evaluation with SPEC2006 benchmark suite, different
levels of performance improvements were achieved in
different modes for different applications. In the case
of the adaptive mode, Chameleon improves the
performance of SPECint06 and SPECfp06 by 31\% and 15\%,
on average. When considering only memory-intensive
applications, Chameleon improves the system performance
by 50\% and 26\% for SPECint06 and SPECfp06,
respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache; Heterogeneous multicore; idle core;
prefetching",
}
@Article{Sanchez:2010:ACI,
author = "Daniel Sanchez and George Michelogiannakis and
Christos Kozyrakis",
title = "An analysis of on-chip interconnection networks for
large-scale chip multiprocessors",
journal = j-TACO,
volume = "7",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736069",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the number of cores of chip multiprocessors
(CMPs) rapidly growing as technology scales down,
connecting the different components of a CMP in a
scalable and efficient way becomes increasingly
challenging. In this article, we explore the
architectural-level implications of interconnection
network design for CMPs with up to 128 fine-grain
multithreaded cores. We evaluate and compare different
network topologies using accurate simulation of the
full chip, including the memory hierarchy and
interconnect, and using a diverse set of scientific and
engineering workloads.\par
We find that the interconnect has a large impact on
performance, as it is responsible for 60\% to 75\% of
the miss latency. Latency, and not bandwidth, is the
primary performance constraint, since, even with many
threads per core and workloads with high miss rates,
networks with enough bandwidth can be efficiently
implemented for the system scales we consider. From the
topologies we study, the flattened butterfly
consistently outperforms the mesh and fat tree on all
workloads, leading to performance advantages of up to
22\%. We also show that considering interconnect and
memory hierarchy together when designing large-scale
CMPs is crucial, and neglecting either of the two can
lead to incorrect conclusions. Finally, the effect of
the interconnect on overall performance becomes more
important as the number of cores increases, making
interconnection choices especially critical when
scaling up.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "chip multiprocessors; hierarchical networks;
Networks-on-chip",
}
@Article{Zhou:2010:PAT,
author = "Xiuyi Zhou and Jun Yang and Marek Chrobak and Youtao
Zhang",
title = "Performance-aware thermal management via task
scheduling",
journal = j-TACO,
volume = "7",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1736065.1736070",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 5 15:38:13 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High on-chip temperature impairs the processor's
reliability and reduces its lifetime. Hardware-level
dynamic thermal management (DTM) techniques can
effectively constrain the chip temperature, but
degrades the performance. We propose an OS-level
technique that performs thermal-aware job scheduling to
reduce DTMs. The algorithm is based on the observation
that hot and cool jobs executed in a different order
can make a difference in resulting temperature.
Real-system implementation in Linux shows that our
scheduler can remove 10.5\% to 73.6\% of the hardware
DTMs in a medium thermal environment. The CPU
throughput is improved by up to 7.6\% (4.1\%, on
average) in a severe thermal environment.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "task scheduling; Thermal management",
}
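The observation behind the scheduler, that executing hot and cool jobs in a
different order changes the resulting temperature, fits in a toy thermal
loop. The RC-style thermal model and all constants here are invented for
illustration:

def run(jobs, pick):
    temp, peak = 50.0, 50.0
    remaining = dict(jobs)             # name -> average power (W)
    while remaining:
        name = pick(remaining, temp)
        power = remaining.pop(name)
        # crude update: heat from the job, cooling toward 45 C ambient
        temp = temp + 0.4 * power - 0.2 * (temp - 45.0)
        peak = max(peak, temp)
    return peak

def thermal_aware(remaining, temp):
    ordered = sorted(remaining, key=remaining.get)
    return ordered[0] if temp >= 70.0 else ordered[-1]  # cool job when hot

def naive(remaining, temp):
    return next(iter(remaining))       # arbitrary (insertion) order

jobs = [("hot1", 60.0), ("hot2", 55.0), ("cool1", 15.0), ("cool2", 10.0)]
print(run(jobs, naive), run(jobs, thermal_aware))  # aware peak is lower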
@Article{Raghavan:2010:TTP,
author = "Arun Raghavan and Colin Blundell and Milo M. K.
Martin",
title = "Token tenure and {PATCH}: a predictive\slash adaptive
token-counting hybrid",
journal = j-TACO,
volume = "7",
number = "2",
pages = "6:1--6:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839668",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Traditional coherence protocols present a set of
difficult trade-offs: the reliance of snoopy protocols
on broadcast and ordered interconnects limits their
scalability, while directory protocols incur a
performance penalty on sharing misses due to
indirection. This work introduces Patch
(Predictive/Adaptive Token-Counting Hybrid), a
coherence protocol that provides the scalability of
directory protocols while opportunistically sending
direct requests to reduce sharing latency. Patch
extends a standard directory protocol to track tokens
and use token-counting rules for enforcing coherence
permissions. Token counting allows Patch to support
direct requests on an unordered interconnect, while a
mechanism called {\em token tenure\/} provides
broadcast-free forward progress using the directory
protocol's per-block point of ordering at the home
along with either timeouts at requesters or explicit
race notification messages.\par
Patch makes three main contributions. First, Patch
introduces token tenure, which provides broadcast-free
forward progress for token-counting protocols. Second,
Patch deprioritizes best-effort direct requests to
match or exceed the performance of directory protocols
without restricting scalability. Finally, Patch
provides greater scalability than directory protocols
when using inexact encodings of sharers because only
processors holding tokens need to acknowledge requests.
Overall, Patch is a ``one-size-fits-all'' coherence
protocol that dynamically adapts to work well for small
systems, large systems, and anywhere in between.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "adaptive; bandwidth-efficiency; Cache coherence
protocol; predictive; token coherence",
}
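The token-counting rules Patch inherits from token coherence are compact
enough to state directly: a block has a fixed number of tokens, reading
requires holding at least one, and writing requires holding all of them, so
a writer can never coexist with a reader. A minimal sketch (the tenure,
timeout, and directory machinery are the paper's contribution and are not
modeled):

class TokenBlock:
    def __init__(self, tokens):
        self.total = tokens
        self.held = {"P0": tokens}     # all tokens start at one node

    def move(self, src, dst, n):
        assert self.held.get(src, 0) >= n, "cannot move tokens it lacks"
        self.held[src] = self.held.get(src, 0) - n
        self.held[dst] = self.held.get(dst, 0) + n

    def can_read(self, p):
        return self.held.get(p, 0) >= 1

    def can_write(self, p):
        return self.held.get(p, 0) == self.total   # needs every token

blk = TokenBlock(tokens=4)
print(blk.can_write("P0"))       # True: P0 holds all 4 tokens
blk.move("P0", "P1", 1)          # P1 obtains read permission
print(blk.can_read("P1"), blk.can_write("P0"))    # True False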
@Article{Wimmer:2010:AFD,
author = "Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck",
title = "Automatic feedback-directed object fusing",
journal = j-TACO,
volume = "7",
number = "2",
pages = "7:1--7:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839669",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Object fusing is an optimization that embeds certain
referenced objects into their referencing object. The
order of objects on the heap is changed in such a way
that objects that are accessed together are placed next
to each other in memory. Their offset is then fixed,
that is, the objects are colocated, allowing field
loads to be replaced by address arithmetic. Array
fusing specifically optimizes arrays, which are
frequently used for the implementation of dynamic data
structures. The length of such arrays therefore often
varies, and fields referencing such arrays have to be
changed. An efficient code pattern detects these
changes and allows optimized access to such
fields.\par
We integrated these optimizations into Sun
Microsystems' Java HotSpot\TM{} VM. The analysis is
performed automatically at runtime, requires no actions
on the part of the programmer, and supports dynamic
class loading. To safely eliminate a field load, the
colocation of the object that holds the field and the
object that is referenced by the field must be
guaranteed. Two preconditions must be satisfied: The
objects must be allocated at the same time, and the
field must not be overwritten later. These
preconditions are checked by the just-in-time compiler
to avoid an interprocedural data flow analysis. The
garbage collector ensures that groups of colocated
objects are not split by copying groups as a whole. The
evaluation shows that the dynamic approach successfully
identifies and optimizes frequently accessed fields for
several benchmarks with a low compilation and analysis
overhead. It leads to a speedup of up to 76\% for
simple benchmarks and up to 6\% for complex
workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "cache performance; garbage collection; Java;
just-in-time compilation; object colocation; object
fusing; object inlining; optimization",
}
@Article{Lee:2010:AIC,
author = "Benjamin C. Lee and David Brooks",
title = "Applied inference: {Case} studies in
microarchitectural design",
journal = j-TACO,
volume = "7",
number = "2",
pages = "8:1--8:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839670",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose and apply a new simulation paradigm for
microarchitectural design evaluation and optimization.
This paradigm enables more comprehensive design studies
by combining spatial sampling and statistical
inference. Specifically, this paradigm (i) defines a
large, comprehensive design space, (ii) samples points
from the space for simulation, and (iii) constructs
regression models based on sparse simulations. This
approach greatly improves the computational efficiency
of microarchitectural simulation and enables new
capabilities in design space exploration.\par
We illustrate new capabilities in three case studies
for a large design space of approximately 260,000
points: (i) Pareto frontier, (ii) pipeline depth, and
(iii) multiprocessor heterogeneity analyses. In
particular, regression models are exhaustively
evaluated to identify Pareto optimal designs that
maximize performance for given power budgets. These
models enable pipeline depth studies in which all
parameters vary simultaneously with depth, thereby more
effectively revealing interactions with nondepth
parameters. Heterogeneity analysis combines
regression-based optimization with clustering
heuristics to identify efficient design compromises
between similar optimal architectures. These
compromises are potential core designs in a
heterogeneous multicore architecture. Increasing
heterogeneity can improve {\em bips\/}$^3$/{\em w\/}
efficiency by as much as 2.4{\times}, a theoretical
upper bound on heterogeneity benefits that neglects
contention between shared resources as well as design
complexity. Collectively these studies demonstrate
regression models' ability to expose trends and
identify optima in diverse design regions, motivating
the application of such models in statistical inference
for more effective use of modern simulator
infrastructure.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "Microarchitecture; regression; simulation;
statistics",
}
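The sample-then-regress workflow described above is easy to mimic in
miniature. The sketch below is our own illustration under invented
assumptions (a toy two-parameter design space, a closed-form stand-in for the
detailed simulator, and a quadratic least-squares model); it is not the
authors' infrastructure, but it shows why predicting roughly 400 designs from
40 simulations is the source of the efficiency gain.

  import numpy as np

  rng = np.random.default_rng(0)

  # Hypothetical design space: pipeline depth x L2 size (arbitrary units).
  space = np.array([(d, c) for d in range(8, 33) for c in range(1, 17)],
                   dtype=float)

  def simulate(point):
      # stand-in for the expensive cycle-accurate simulation
      d, c = point
      return 1.0 / (0.04 * d + 0.3 * np.log2(c) + 1.0) + rng.normal(0, 0.001)

  sampled = space[rng.choice(len(space), size=40, replace=False)]
  perf = np.array([simulate(p) for p in sampled])

  def features(pts):
      # quadratic regression features: 1, d, c, d^2, c^2, d*c
      d, c = pts[:, 0], pts[:, 1]
      return np.column_stack([np.ones(len(pts)), d, c, d * d, c * c, d * c])

  beta, *_ = np.linalg.lstsq(features(sampled), perf, rcond=None)
  predicted = features(space) @ beta     # cheap estimates for all designs
  best_design = space[np.argmax(predicted)]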
@Article{Rakvic:2010:TMT,
author = "R. Rakvic and Q. Cai and J. Gonz{\'a}lez and G.
Magklis and P. Chaparro and A. Gonz{\'a}lez",
title = "Thread-management techniques to maximize efficiency in
multicore and simultaneous multithreaded
microprocessors",
journal = j-TACO,
volume = "7",
number = "2",
pages = "9:1--9:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839671",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We provide an analysis of thread-management techniques
that increase performance or reduce energy in multicore
and Simultaneous Multithreaded (SMT) cores. Thread
delaying reduces energy consumption by running the core
containing the critical thread at maximum frequency
while scaling down the frequency and voltage of the
cores containing noncritical threads. In this article,
we provide an insightful breakdown of thread delaying
on a simulated multi-core microprocessor. Thread
balancing improves overall performance by giving higher
priority to the critical thread in the issue queue of
an SMT core. We provide a detailed breakdown of
performance results for thread balancing, identifying
performance benefits and limitations. For those
benchmarks where a performance benefit is not possible,
we introduce a novel thread-balancing mechanism on an
SMT core that can reduce energy consumption. We have
performed a detailed study on an Intel microprocessor
simulator running parallel applications. Thread
delaying can reduce energy consumption by 4\% to 44\%
with negligible performance loss. Thread balancing can
increase performance by 20\% or can reduce energy
consumption by 23\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "critical threads; energy-aware; low-power; Meeting
point thread characterization; microarchitecture;
multi-threaded application; thread balancing; thread
delaying",
}
@Article{Pao:2010:MEP,
author = "Derek Pao and Wei Lin and Bin Liu",
title = "A memory-efficient pipelined implementation of the
{Aho--Corasick} string-matching algorithm",
journal = j-TACO,
volume = "7",
number = "2",
pages = "10:1--10:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839672",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With rapid advancement in Internet technology and
usage, some emerging applications in data
communications and network security require matching of
huge volume of data against large signature sets with
thousands of strings in real time. In this article, we
present a memory-efficient hardware implementation of
the well-known Aho--Corasick (AC) string-matching
algorithm using a pipelining approach called P-AC. An
attractive feature of the AC algorithm is that it can
solve the string-matching problem in time linearly
proportional to the length of the input stream, and the
computation time is independent of the number of
strings in the signature set. A major disadvantage of
the AC algorithm is the high memory cost required to
store the transition rules of the underlying
deterministic finite automaton. By incorporating
pipelined processing, the state graph is reduced to a
character trie that only contains forward edges.
Together with an intelligent implementation of look-up
tables, the memory cost of P-AC is only about 18 bits
per character for a signature set containing 6,166
strings extracted from Snort. The control structure of
P-AC is simple and elegant. The cost of the control
logic is very low. With the availability of dual-port
memories in FPGA devices, we can double the system
throughput by duplicating the control logic such that
the system can process two data streams concurrently.
Since our method is memory-based, incremental changes
to the signature set can be accommodated by updating
the look-up tables without reconfiguring the FPGA
circuitry.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "deterministic and nondeterministic finite automaton;
intrusion detection system; pipelined processing;
String-matching",
}
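For orientation, the automaton that P-AC pipelines in hardware takes only a
few dozen lines in software. The following is the standard textbook
Aho--Corasick construction (goto trie plus failure links and output sets),
included to make concrete what state the paper's forward-edge trie and
look-up tables must encode; it is our reference sketch, not the paper's
pipelined design.

  from collections import deque

  def build_ac(patterns):
      goto = [{}]      # forward (trie) edges per state
      fail = [0]       # failure links
      out = [set()]    # patterns accepted at each state
      for p in patterns:
          s = 0
          for ch in p:
              if ch not in goto[s]:
                  goto.append({}); fail.append(0); out.append(set())
                  goto[s][ch] = len(goto) - 1
              s = goto[s][ch]
          out[s].add(p)
      queue = deque(goto[0].values())    # depth-1 states fail to the root
      while queue:
          s = queue.popleft()
          for ch, t in goto[s].items():
              queue.append(t)
              f = fail[s]
              while f and ch not in goto[f]:
                  f = fail[f]
              fail[t] = goto[f].get(ch, 0) if goto[f].get(ch, 0) != t else 0
              out[t] |= out[fail[t]]
      return goto, fail, out

  def ac_match(text, goto, fail, out):
      s, hits = 0, []
      for i, ch in enumerate(text):
          while s and ch not in goto[s]:
              s = fail[s]
          s = goto[s].get(ch, 0)
          for p in sorted(out[s]):
              hits.append((i - len(p) + 1, p))
      return hits

  g, f, o = build_ac(["he", "she", "his", "hers"])
  assert ac_match("ushers", g, f, o) == [(2, "he"), (1, "she"), (2, "hers")]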
@Article{Yang:2010:ERS,
author = "Xuejun Yang and Ying Zhang and Xicheng Lu and Jingling
Xue and Ian Rogers and Gen Li and Guibin Wang and
Xudong Fang",
title = "Exploiting the reuse supplied by loop-dependent stream
references for stream processors",
journal = j-TACO,
volume = "7",
number = "2",
pages = "11:1--11:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839673",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory accesses limit the performance of stream
processors. By exploiting the reuse of data held in the
Stream Register File (SRF), an on-chip, software
controlled storage, the number of memory accesses can
be reduced. In current stream compilers, reuse
exploitation is only attempted for simple stream
references, those whose start and end are known.
Compiler analysis, from outside of stream processors,
does not directly enable the consideration of other
more complex stream references. In this article, we
propose a transformation to automatically optimize
stream programs to exploit the reuse supplied by
loop-dependent stream references. The transformation is
based on three results: lemmas identifying the reuse
supplied by stream references, a new abstract
representation called the Stream Reuse Graph (SRG)
depicting the identified reuse, and the optimization of
the SRG for our transformation. Both the reuse between
the whole sequences accessed by stream references and the
reuse between partial sequences are exploited in the article.
In particular, partial reuse and its treatment are
quite new and have never, to the best of our knowledge,
appeared in scalar and vector processing. At the same
time, reusing streams increases the pressure on the
SRF, and this presents a problem of which reuse should
be exploited within limited SRF capacity. We extend our
analysis to achieve this objective. Finally, we
implement our techniques based on the StreamC/KernelC
compiler that has been optimized with the best existing
compilation techniques for stream processors.
Experimental results show speedups of 1.14
to 2.54 times across a range of benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "stream professor; Stream programming model; stream
register file; stream reuse; streamc",
}
@Article{Reddi:2010:EVE,
author = "Vijay Janapa Reddi and Simone Campanoni and Meeta S.
Gupta and Michael D. Smith and Gu-Yeon Wei and David
Brooks and Kim Hazelwood",
title = "Eliminating voltage emergencies via software-guided
code transformations",
journal = j-TACO,
volume = "7",
number = "2",
pages = "12:1--12:??",
month = sep,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1839667.1839674",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 2 18:05:46 MDT 2010",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In recent years, circuit reliability in modern
high-performance processors has become increasingly
important. Shrinking feature sizes and diminishing
supply voltages have made circuits more sensitive to
microprocessor supply voltage fluctuations. These
fluctuations result from the natural variation of
processor activity as workloads execute, but when left
unattended, these voltage fluctuations can lead to
timing violations or even transistor lifetime issues.
In this article, we present a hardware--software
collaborative approach to mitigate voltage
fluctuations. A checkpoint-recovery mechanism rectifies
errors when voltage violates maximum tolerance
settings, while a runtime software layer reschedules
the program's instruction stream to prevent recurring
violations at the same program location. The runtime
layer, combined with the proposed code-rescheduling
algorithm, removes 60\% of all violations with minimal
overhead, thereby significantly improving overall
performance. Our solution is a radical departure from
the ongoing industry-standard approach to circumvent
the issue altogether by optimizing for the worst-case
voltage flux, which compromises power and performance
efficiency severely, especially looking ahead to future
technology generations. Existing conservative
approaches will have severe implications on the ability
to deliver efficient microprocessors. The proposed
technique recasts a traditional reliability problem
as a runtime performance optimization problem, thus
allowing us to design processors for typical case
operation by building intelligent algorithms that can
prevent recurring violations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
keywords = "dI/dt; inductive noise; voltage emergencies; Voltage
noise",
}
@Article{Zhao:2010:PPP,
author = "Qin Zhao and Ioana Cutcutache and Weng-Fai Wong",
title = "{PiPA}: {Pipelined} profiling and analysis on
multicore systems",
journal = j-TACO,
volume = "7",
number = "3",
pages = "13:1--13:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880038",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Profiling and online analysis are important tasks in
program understanding and feedback-directed
optimization. However, fine-grained profiling and
online analysis tend to seriously slow down the
application. To cope with the slowdown, one may have to
terminate the process early or resort to sampling. The
former tends to distort the result because of warm-up
effects. The latter runs the risk of missing important
effects because sampling was turned off during the time
that these effects appeared. A promising approach is to
make use of the parallel processing capabilities of the
now ubiquitous multicore processors to speed up the
profiling and analysis process.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Guo:2010:QSS,
author = "Fei Guo and Yan Solihin and Li Zhao and Ravishankar
Iyer",
title = "Quality of service shared cache management in chip
multiprocessor architecture",
journal = j-TACO,
volume = "7",
number = "3",
pages = "14:1--14:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880039",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The trends in enterprise IT toward service-oriented
computing, server consolidation, and virtual computing
point to a future in which workloads are becoming
increasingly diverse in terms of performance,
reliability, and availability requirements. It can be
expected that more and more applications with diverse
requirements will run on a Chip Multi-Processor (CMP)
and share platform resources such as the lowest level
cache and off-chip bandwidth. In this environment, it
is desirable to have microarchitecture and software
support that can provide a guarantee of a certain level
of performance, which we refer to as performance
Quality of Service. In this article, we investigate the
framework that would be needed to manage the shared cache
resource to fully provide QoS in a CMP.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2010:DEH,
author = "Xiaoxia Wu and Jian Li and Lixin Zhang and Evan
Speight and Ram Rajamony and Yuan Xie",
title = "Design exploration of hybrid caches with disparate
memory technologies",
journal = j-TACO,
volume = "7",
number = "3",
pages = "15:1--15:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880040",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Traditional multilevel SRAM-based cache hierarchies,
especially in the context of chip multiprocessors
(CMPs), present many challenges in area requirements,
core--to--cache balance, power consumption, and design
complexity. New advancements in technology enable
caches to be built from other technologies, such as
Embedded DRAM (EDRAM), Magnetic RAM (MRAM), and
Phase-change RAM (PRAM), in both 2D and 3D stacked
chips. Caches fabricated in these technologies offer
dramatically different power-performance
characteristics when compared with SRAM-based caches,
particularly in the areas of access latency, cell
density, and overall power consumption. In this
article, we propose to take advantage of the best
characteristics that each technology has to offer
through the use of Hybrid Cache Architecture (HCA)
designs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kourtis:2010:ECO,
author = "Kornilios Kourtis and Georgios Goumas and Nectarios
Koziris",
title = "Exploiting compression opportunities to improve
{SpMxV} performance on shared memory systems",
journal = j-TACO,
volume = "7",
number = "3",
pages = "16:1--16:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880037.1880041",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The Sparse Matrix-Vector Multiplication (SpMxV) kernel
exhibits poor scaling on shared memory systems, due to
the streaming nature of its data access pattern. To
decrease memory contention and improve kernel
performance we propose two compression schemes: CSR-DU,
that targets the reduction of the matrix structural
data by applying coarse-grained delta-encoding, and
CSR-VI, that targets the reduction of the values using
indirect indexing, applicable to matrices with a small
number of unique values.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
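The index-compression idea is simple to show for CSR-DU's core ingredient,
delta encoding of column indices: most deltas within a row are small, so they
can be stored in a few bits each, shrinking the index stream that saturates
memory bandwidth during SpMxV. The sketch below is our own toy illustration
(plain Python lists, per-row deltas, no bit packing); it is not the authors'
CSR-DU or CSR-VI code.

  def spmv_csr(rowptr, colind, vals, x):
      # baseline CSR kernel: y = A * x
      y = [0.0] * (len(rowptr) - 1)
      for i in range(len(y)):
          for k in range(rowptr[i], rowptr[i + 1]):
              y[i] += vals[k] * x[colind[k]]
      return y

  def delta_encode(rowptr, colind):
      # per-row deltas; the first index of each row is stored absolutely
      deltas = []
      for i in range(len(rowptr) - 1):
          prev = 0
          for k in range(rowptr[i], rowptr[i + 1]):
              deltas.append(colind[k] - prev)
              prev = colind[k]
      return deltas              # mostly small values: fewer bits each

  def spmv_csr_delta(rowptr, deltas, vals, x):
      y = [0.0] * (len(rowptr) - 1)
      k = 0
      for i in range(len(y)):
          col = 0
          for _ in range(rowptr[i + 1] - rowptr[i]):
              col += deltas[k]   # reconstruct the column index on the fly
              y[i] += vals[k] * x[col]
              k += 1
      return y

  rowptr, colind, vals = [0, 2, 3], [0, 3, 2], [1.0, 2.0, 3.0]
  x = [1.0, 1.0, 1.0, 2.0]
  assert spmv_csr_delta(rowptr, delta_encode(rowptr, colind), vals, x) \
      == spmv_csr(rowptr, colind, vals, x)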
@Article{Buyukkurt:2010:IHL,
author = "Betul Buyukkurt and John Cortes and Jason Villarreal
and Walid A. Najjar",
title = "Impact of high-level transformations within the
{ROCCC} framework",
journal = j-TACO,
volume = "7",
number = "4",
pages = "17:1--17:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880044",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hwang:2010:DCR,
author = "Yuan-Shin Hwang and Tzong-Yen Lin and Rong-Guey
Chang",
title = "{DisIRer}: {Converting} a retargetable compiler into a
multiplatform binary translator",
journal = j-TACO,
volume = "7",
number = "4",
pages = "18:1--18:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880045",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Boyer:2010:FBP,
author = "Michael Boyer and David Tarjan and Kevin Skadron",
title = "Federation: {Boosting} per-thread performance of
throughput-oriented manycore architectures",
journal = j-TACO,
volume = "7",
number = "4",
pages = "19:1--19:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880046",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fursin:2010:COP,
author = "Grigori Fursin and Olivier Temam",
title = "Collective optimization: a practical collaborative
approach",
journal = j-TACO,
volume = "7",
number = "4",
pages = "20:1--20:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880047",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2010:UBI,
author = "Fang Liu and Yan Solihin",
title = "Understanding the behavior and implications of context
switch misses",
journal = j-TACO,
volume = "7",
number = "4",
pages = "21:1--21:??",
month = dec,
year = "2010",
CODEN = "????",
DOI = "https://doi.org/10.1145/1880043.1880048",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 10 09:37:16 MST 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Eyerman:2011:FGD,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Fine-grained {DVFS} using on-chip regulators",
journal = j-TACO,
volume = "8",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1952999",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Limit studies on Dynamic Voltage and Frequency Scaling
(DVFS) provide apparently contradictory conclusions. On
the one hand early limit studies report that DVFS is
effective at large timescales (on the order of
million(s) of cycles) with large scaling overheads (on
the order of tens of microseconds), and they conclude
that there is no need for small overhead DVFS at small
timescales. Recent work on the other hand --- motivated
by the surge of on-chip voltage regulator research ---
explores the potential of fine-grained DVFS and reports
substantial energy savings at timescales of hundreds of
cycles (while assuming no scaling overhead). This
article unifies these apparently contradictory
conclusions through a DVFS limit study that
simultaneously explores timescale and scaling speed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
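The first-order scaling relations underneath any DVFS limit study are worth
restating; these are textbook CMOS identities, not equations quoted from the
article. With dynamic power $P = \alpha C V^{2} f$ and supply voltage and
frequency scaled together by a factor $\sigma < 1$,

  $$E = P\,t = \alpha C V^{2} f\,t, \qquad
    \frac{E'}{E} = \sigma^{3} \, \frac{t'}{t},$$

so a compute-bound phase ($t' \approx t/\sigma$) saves energy as
$E'/E \approx \sigma^{2}$, while a memory-bound phase ($t' \approx t$) saves
as $\sigma^{3}$. Fine-grained DVFS is attractive precisely because
memory-bound phases are often only hundreds of cycles long, which pays off
only if the regulator's transition overhead is smaller still.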
@Article{Cher:2011:EEC,
author = "Chen-Yong Cher and Eren Kursun",
title = "Exploring the effects of on-chip thermal variation on
high-performance multicore architectures",
journal = j-TACO,
volume = "8",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953000",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Inherent temperature variation among cores in a
multicore architecture can be caused by a number of
factors including process variation, cooling and
packaging imperfections, and even placement of the chip
in the module. Current dynamic thermal management
techniques assume identical heating profiles for
homogeneous multicore architectures. Our experimental
results indicate that inherent thermal variation is
very common in existing multicores. While most
multicore chips accommodate multiple thermal sensors,
the dynamic power/thermal management schemes are
oblivious of the inherent heating tendencies. Hence, in
the case of variation, the chip repeatedly develops
hotspots on such cores. In this article, we
propose a technique that leverages the on-chip sensor
infrastructure as well as the capabilities of
power/thermal management to effectively reduce the
heating and minimize local hotspots.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wu:2011:ATR,
author = "Carole-Jean Wu and Margaret Martonosi",
title = "Adaptive timekeeping replacement: Fine-grained
capacity management for shared {CMP} caches",
journal = j-TACO,
volume = "8",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953001",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In chip multiprocessors (CMPs), several
high-performance cores typically compete for capacity
in a shared last-level cache. This causes degraded and
unpredictable memory performance for multiprogrammed
and parallel workloads. In response, recent schemes
apportion cache bandwidth and capacity in ways that
offer better aggregate performance for the workloads.
These schemes, however, focus primarily on relatively
coarse-grained capacity management without concern for
operating system process priority levels. In this work,
we explore capacity management approaches that are both
temporally and spatially more fine-grained than prior
work. We also consider operating system priority levels
as part of capacity management. We propose a capacity
management mechanism based on timekeeping techniques
that track the time interval since the last access to
cached data.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Vespa:2011:DFA,
author = "Lucas Vespa and Ning Weng",
title = "Deterministic finite automata characterization and
optimization for scalable pattern matching",
journal = j-TACO,
volume = "8",
number = "1",
pages = "4:1--4:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953002",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory-based Deterministic Finite Automata (DFA) are
ideal for pattern matching in network intrusion
detection systems due to their deterministic
performance and ease of updating new patterns; however,
severe DFA memory requirements make it impractical to
implement thousands of patterns. This article aims to
understand the basic relationship between DFA
characteristics and memory requirements, and to design
a practical memory-based pattern matching engine. We
present a methodology that consists of theoretical DFA
characterization, encoding optimization, and
implementation architecture. Results show the validity
of the characterization metrics, effectiveness of the
encoding techniques, and efficiency of the memory-based
pattern engines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bhattacharjee:2011:PLC,
author = "Abhishek Bhattacharjee and Gilberto Contreras and
Margaret Martonosi",
title = "Parallelization libraries: Characterizing and reducing
overheads",
journal = j-TACO,
volume = "8",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1952998.1953003",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Apr 27 07:54:03 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Creating efficient, scalable dynamic parallel runtime
systems for chip multiprocessors (CMPs) requires
understanding the overheads that manifest at high core
counts and small task sizes. In this article, we assess
these overheads on Intel's Threading Building Blocks
(TBB) and OpenMP. First, we use real hardware and
simulations to detail various scheduler and
synchronization overheads. We find that these can
amount to 47\% of TBB benchmark runtime and 80\% of
OpenMP benchmark runtime. Second, we propose load
balancing techniques such as occupancy-based and
criticality-guided task stealing, to boost performance.
Overall, our study provides valuable insights for
creating robust, scalable runtime libraries.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dong:2011:HCU,
author = "Xiangyu Dong and Yuan Xie and Naveen Muralimanohar and
Norman P. Jouppi",
title = "Hybrid checkpointing using emerging nonvolatile
memories for future exascale systems",
journal = j-TACO,
volume = "8",
number = "2",
pages = "6:1--6:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970387",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The scalability of future Massively Parallel
Processing (MPP) systems is being severely challenged
by high failure rates. Current centralized Hard Disk
Drive (HDD) checkpointing results in overhead of 25\%
or more at petascale. Since systems become more
vulnerable as the node count keeps increasing, novel
techniques that enable fast and frequent checkpointing
are critical to the future exascale system
implementation. In this work, we first introduce one of
the emerging nonvolatile memory technologies,
Phase-Change Random Access Memory (PCRAM), as a suitable
candidate for a fast checkpointing device. After a
thorough analysis of MPP systems, failure rates and
failure sources, we propose a PCRAM-based hybrid
local/global checkpointing mechanism which not only
provides a faster checkpoint storage, but also boosts
the effectiveness of other orthogonal techniques such
as incremental checkpointing and background
checkpointing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2011:EEM,
author = "Jianjun Li and Chenggang Wu and Wei-Chung Hsu",
title = "Efficient and effective misaligned data access
handling in a dynamic binary translation system",
journal = j-TACO,
volume = "8",
number = "2",
pages = "7:1--7:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970388",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Binary Translation (BT) has been commonly used to
migrate application software across Instruction Set
Architectures (ISAs). Some architectures, such as X86,
allow Misaligned Data Accesses (MDAs), while most
modern architectures require natural data alignments.
In a binary translation system, where the source ISA
allows MDA and the target ISA does not, memory
operations must be carefully translated. Naive
translation may cause frequent misaligned data access
traps to occur at runtime on the target machine and
severely slow down the migrated application. This
article evaluates different approaches in handling MDA
in a binary translation system including how to
identify MDA candidates and how to translate such
memory instructions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Venkataramani:2011:DDS,
author = "Guru Venkataramani and Christopher J. Hughes and
Sanjeev Kumar and Milos Prvulovic",
title = "{DeFT}: Design space exploration for on-the-fly
detection of coherence misses",
journal = j-TACO,
volume = "8",
number = "2",
pages = "8:1--8:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970389",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "While multicore processors promise large performance
benefits for parallel applications, writing these
applications is notoriously difficult. Tuning a
parallel application to achieve good performance, also
known as performance debugging, is often more
challenging than debugging the application for
correctness. Parallel programs have many
performance-related issues that are not seen in
sequential programs. An increase in cache misses is one
of the biggest challenges that programmers face. To
minimize these misses, programmers must not only
identify the source of the extra misses, but also
perform the tricky task of determining if the misses
are caused by interthread communication (i.e.,
coherence misses) and if so, whether they are caused by
true or false sharing (since the solutions for these
two are quite different).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hiser:2011:EIB,
author = "Jason D. Hiser and Daniel W. Williams and Wei Hu and
Jack W. Davidson and Jason Mars and Bruce R. Childers",
title = "Evaluating indirect branch handling mechanisms in
software dynamic translation systems",
journal = j-TACO,
volume = "8",
number = "2",
pages = "9:1--9:??",
month = jul,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/1970386.1970390",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jun 17 18:32:40 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Software Dynamic Translation (SDT) is used for
instrumentation, optimization, security, and many other
uses. A major source of SDT overhead is the execution
of code to translate an indirect branch's target
address into the translated destination block's
address. This article discusses sources of Indirect
Branch (IB) overhead in SDT systems and evaluates
techniques for overhead reduction. Measurements using
SPEC CPU2000 show that the appropriate choice and
configuration of IB translation mechanisms can
significantly reduce the overhead. Further,
cross-architecture evaluation of these mechanisms
reveals that the most efficient implementation and
configuration can be highly dependent on the
architecture implementation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2011:HAM,
author = "Xi E. Chen and Tor M. Aamodt",
title = "Hybrid analytical modeling of pending cache hits, data
prefetching, and {MSHRs}",
journal = j-TACO,
volume = "8",
number = "3",
pages = "10:1--10:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019609",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes techniques to predict the
performance impact of pending cache hits, hardware
prefetching, and miss status holding register resources
on superscalar microprocessors using hybrid analytical
models. The proposed models focus on timeliness of
pending hits and prefetches and account for a limited
number of MSHRs. They improve modeling accuracy of
pending hits by 3.9{\times} and when modeling data
prefetching, a limited number of MSHRs, or both, these
techniques result in average errors of 9.5\% to 17.8\%.
The impact of non-uniform DRAM memory latency is shown
to be approximated well by using a moving average of
memory access latency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kleanthous:2011:CMD,
author = "Marios Kleanthous and Yiannakis Sazeides",
title = "{CATCH}: a mechanism for dynamically detecting
cache-content-duplication in instruction caches",
journal = j-TACO,
volume = "8",
number = "3",
pages = "11:1--11:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019610",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Cache-content-duplication (CCD) occurs when there is a
miss for a block in a cache and the entire content of
the missed block is already in the cache in a block
with a different tag. Caches aware of
content-duplication can have lower miss penalty by
fetching, on a miss to a duplicate block, directly from
the cache instead of accessing lower in the memory
hierarchy, and can have lower miss rates by allowing
only blocks with unique content to enter a cache. This
work examines the potential of CCD for instruction
caches. We show that CCD is a frequent phenomenon and
that an idealized duplication-detection mechanism for
instruction caches has the potential to increase
performance of an out-of-order processor, with a 16KB,
8-way, 8 instructions per block instruction cache,
often by more than 10\% and up to 36\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
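The detection side of CCD is easy to model: index resident blocks by their
content so that a fill whose content is already cached becomes an alias to
the existing block. The toy Python model below is our illustration; a real
design would hash block contents and handle eviction and invalidation, which
this sketch omits. It shows the two behaviors the abstract describes: lower
miss rates by admitting only unique content, and duplicate hits served
without going down the hierarchy.

  class DedupInstructionCache:
      """Toy model: only unique block contents enter the cache; a miss
      whose content is already resident is served as a duplicate hit."""

      def __init__(self):
          self.blocks = {}    # tag -> block content
          self.content = {}   # content -> canonical tag holding it
          self.alias = {}     # tag -> canonical tag with identical content

      def fill(self, tag, data):
          if data in self.content:          # duplication detected on fill
              self.alias[tag] = self.content[data]
          else:
              self.blocks[tag] = data
              self.content[data] = tag

      def lookup(self, tag):
          if tag in self.blocks:
              return self.blocks[tag], "hit"
          if tag in self.alias:             # fetch from the duplicate block
              return self.blocks[self.alias[tag]], "duplicate-hit"
          return None, "miss"

  c = DedupInstructionCache()
  c.fill(0x1000, b"\x90" * 32)              # a block of NOPs
  c.fill(0x2000, b"\x90" * 32)              # same content, different tag
  assert c.lookup(0x2000) == (b"\x90" * 32, "duplicate-hit")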
@Article{Vandierendonck:2011:MSR,
author = "Hans Vandierendonck and Andr{\'e} Seznec",
title = "Managing {SMT} resource usage through speculative
instruction window weighting",
journal = j-TACO,
volume = "8",
number = "3",
pages = "12:1--12:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019611",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Simultaneous multithreading processors dynamically
share processor resources between multiple threads. In
general, shared SMT resources may be managed
explicitly, for instance, by dynamically setting queue
occupation bounds for each thread as in the DCRA and
Hill-Climbing policies. Alternatively, resources may be
managed implicitly; that is, resource usage is
controlled by placing the desired instruction mix in
the resources. In this case, the main resource
management tool is the instruction fetch policy which
must predict the behavior of each thread (branch
mispredictions, long-latency loads, etc.) as it fetches
instructions. In this article, we present the use of
Speculative Instruction Window Weighting (SIWW) to
bridge the gap between implicit and explicit SMT fetch
policies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2011:PGS,
author = "Po-Han Wang and Chia-Lin Yang and Yen-Ming Chen and
Yu-Jung Cheng",
title = "Power gating strategies on {GPUs}",
journal = j-TACO,
volume = "8",
number = "3",
pages = "13:1--13:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019612",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As technology continues to shrink, reducing leakage is
critical to achieving energy efficiency. Previous
studies on low-power GPUs (Graphics Processing Units)
focused on techniques for dynamic power reduction, such
as DVFS (Dynamic Voltage and Frequency Scaling) and
clock gating. In this paper, we explore the potential
of adopting architecture-level power gating techniques
for leakage reduction on GPUs. We propose three
strategies for applying power gating on different
modules in GPUs. The Predictive Shader Shutdown
technique exploits workload variation across frames to
eliminate leakage in shader clusters. Deferred Geometry
Pipeline seeks to minimize leakage in fixed-function
geometry units by utilizing an imbalance between
geometry and fragment computation across batches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Feng:2011:DAD,
author = "Min Feng and Chen Tian and Changhui Lin and Rajiv
Gupta",
title = "Dynamic access distance driven cache replacement",
journal = j-TACO,
volume = "8",
number = "3",
pages = "14:1--14:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019613",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we propose a new cache replacement
policy that makes the replacement decision based on the
reuse information of the cache lines and the requested
data. We present the architectural support and evaluate
the performance of our approach using SPEC benchmarks.
We also develop two reuse information predictors: a
profile-based static predictor and a runtime predictor.
The applicability of each predictor is discussed in
this paper. We further extend our reuse information
predictors so that the cache can adaptively choose
between the reuse information based replacement policy
and an approximation of the LRU policy. According to the
experimental results, our adaptive reuse information
based replacement policy performs either better than or
close to the LRU policy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
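The replacement decision itself reduces to a one-line policy once each cached
line carries a reuse prediction: evict the line whose next reuse is predicted
furthest away, falling back to recency on ties. The snippet below is a
schematic of that decision only; the field names are ours, and the
predicted_reuse values would come from the paper's profile-based or runtime
predictor, which is not modeled here.

  def choose_victim(set_lines, now):
      # evict the line with the furthest predicted next reuse;
      # break ties by LRU age (oldest last access wins)
      return max(set_lines,
                 key=lambda l: (l["predicted_reuse"], now - l["last_access"]))

  lines = [
      {"tag": 0xA, "predicted_reuse": 120, "last_access": 90},
      {"tag": 0xB, "predicted_reuse": 15,  "last_access": 50},
      {"tag": 0xC, "predicted_reuse": 120, "last_access": 40},
  ]
  assert choose_victim(lines, now=100)["tag"] == 0xC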
@Article{Samih:2011:EPP,
author = "Ahmad Samih and Yan Solihin and Anil Krishna",
title = "Evaluating placement policies for managing capacity
sharing in {CMP} architectures with private caches",
journal = j-TACO,
volume = "8",
number = "3",
pages = "15:1--15:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019614",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Chip Multiprocessors (CMP) with distributed L2 caches
suffer from a cache fragmentation problem; some caches
may be overutilized while others may be underutilized.
To avoid such fragmentation, researchers have proposed
capacity sharing mechanisms where applications that
need additional cache space can place their victim
blocks in remote caches. However, we found that only
allowing victim blocks to be placed on remote caches
tends to cause a high number of remote cache hits
relative to local cache hits. In this article, we show
that many of the remote cache hits can be converted
into local cache hits if we allow newly fetched blocks
to be selectively placed directly in a remote cache,
rather than in the local cache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yeh:2011:MPP,
author = "Chang-Ching Yeh and Kuei-Chung Chang and Tien-Fu Chen
and Chingwei Yeh",
title = "Maintaining performance on power gating of
microprocessor functional units by using a predictive
pre-wakeup strategy",
journal = j-TACO,
volume = "8",
number = "3",
pages = "16:1--16:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019615",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Power gating is an effective technique for reducing
leakage power in deep submicron CMOS technology.
Microarchitectural techniques for power gating of
functional units have been developed by detecting
suitable idle regions and turning them off to reduce
leakage energy consumption; however, wakeup of
functional units is needed when instructions are ready
for execution, so that wakeup overhead is naturally
incurred. This study presents time-based power gating
with reference pre-wakeup (PGRP), a novel predictive
strategy that detects suitable idle periods for power
gating and then enables pre-wakeup of needed functional
units for avoiding wakeup overhead. The key insight is
that most wakeups are repeated due to program
locality.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2011:DDE,
author = "Hyunjin Lee and Sangyeun Cho and Bruce R. Childers",
title = "{DEFCAM}: a design and evaluation framework for
defect-tolerant cache memories",
journal = j-TACO,
volume = "8",
number = "3",
pages = "17:1--17:??",
month = oct,
year = "2011",
CODEN = "????",
DOI = "https://doi.org/10.1145/2019608.2019616",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Oct 22 09:15:12 MDT 2011",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Advances in deep submicron technology call for a
careful review of existing cache designs and design
practices in terms of yield, area, and performance.
This article presents a Design and Evaluation Framework
for defect-tolerant Cache Memories (DEFCAM), which
enables processor architects to consider yield, area,
and performance together in a unified framework. Since
there is a complex, changing trade-off among these
metrics depending on the technology, the cache
organization, and the yield enhancement scheme
employed, such a design flow is invaluable to processor
architects when they assess a design and explore the
design space quickly at an early stage. We develop a
complete framework supporting the proposed DEFCAM
design flow, from injecting defects into a wafer to
evaluating program performance of individual processors
on the wafer.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stenstrom:2012:ISI,
author = "Per Stenstr{\"o}m and Koen {De Bosschere}",
title = "Introduction to the special issue on high-performance
and embedded architectures and compilers",
journal = j-TACO,
volume = "8",
number = "4",
pages = "18:1--18:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086697",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Albericio:2012:ALC,
author = "Jorge Albericio and Rub{\'e}n Gran and Pablo
Ib{\'a}{\~n}ez and V{\'\i}ctor Vi{\~n}als and Jose
Mar{\'\i}a Llaber{\'\i}a",
title = "{ABS}: a low-cost adaptive controller for prefetching
in a banked shared last-level cache",
journal = j-TACO,
volume = "8",
number = "4",
pages = "19:1--19:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086698",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hardware data prefetch is a very well known technique
for hiding memory latencies. However, in a multicore
system fitted with a shared Last-Level Cache (LLC),
prefetch induced by a core consumes common resources
such as shared cache space and main memory bandwidth.
This may degrade the performance of other cores and
even the overall system performance unless the prefetch
aggressiveness of each core is controlled from a system
standpoint. On the other hand, LLCs in commercial chip
multiprocessors are more and more frequently organized
in independent banks. In this contribution, we target
for the first time prefetch in a banked LLC
organization and propose ABS, a low-cost controller
with a hill-climbing approach that runs stand-alone at
each LLC bank without requiring inter-bank
communication.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
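A hill-climbing aggressiveness controller of the kind the abstract describes
fits in a few lines: after each measurement epoch, keep stepping the prefetch
degree in the current direction while the observed utility improves, and
reverse direction when it degrades. The Python sketch below is our
illustration only; the degree ladder and the utility metric are assumptions,
not ABS's actual hardware state machine.

  class HillClimbingController:
      DEGREES = [0, 1, 2, 4, 8]        # assumed prefetch degrees per bank

      def __init__(self):
          self.i = 2                   # start in the middle of the ladder
          self.step = 1                # current search direction
          self.prev = None             # utility observed in the last epoch

      def end_of_epoch(self, utility):
          # utility could be, e.g., useful prefetches minus a pollution cost
          if self.prev is not None and utility < self.prev:
              self.step = -self.step   # the last move hurt: back off
          self.prev = utility
          self.i = min(max(self.i + self.step, 0), len(self.DEGREES) - 1)
          return self.DEGREES[self.i]

  ctrl = HillClimbingController()
  ctrl.end_of_epoch(utility=100)              # first epoch: step up to 4
  assert ctrl.end_of_epoch(utility=80) == 2   # utility fell: reverse to 2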
@Article{Bayrak:2012:AII,
author = "Ali Galip Bayrak and Nikola Velickovic and Paolo Ienne
and Wayne Burleson",
title = "An architecture-independent instruction shuffler to
protect against side-channel attacks",
journal = j-TACO,
volume = "8",
number = "4",
pages = "20:1--20:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086699",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Embedded cryptographic systems, such as smart cards,
require secure implementations that are robust to a
variety of low-level attacks. Side-Channel Attacks
(SCA) exploit the information such as power
consumption, electromagnetic radiation and acoustic
leaking through the device to uncover the secret
information. Attackers can mount successful attacks
with very modest resources in a short time period.
Therefore, many methods have been proposed to increase
the security against SCA. Randomizing the execution
order of the instructions that are independent, i.e.,
random shuffling, is one of the most popular among
them. Implementing instruction shuffling in software is
either implementation specific or has a significant
performance or code size overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Demme:2012:AGC,
author = "John Demme and Simha Sethumadhavan",
title = "Approximate graph clustering for program
characterization",
journal = j-TACO,
volume = "8",
number = "4",
pages = "21:1--21:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086700",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "An important aspect of system optimization research is
the discovery of program traits or behaviors. In this
paper, we present an automated method of program
characterization which is able to examine and cluster
program graphs, i.e., dynamic data graphs or control
flow graphs. Our novel approximate graph clustering
technology allows users to find groups of program
fragments which contain similar code idioms or patterns
in data reuse, control flow, and context. Patterns of
this nature have several potential applications
including development of new static or dynamic
optimizations to be implemented in software or in
hardware. For the SPEC CPU 2006 suite of benchmarks,
our results show that approximate graph clustering is
effective at grouping behaviorally similar functions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pricopi:2012:BPH,
author = "Mihai Pricopi and Tulika Mitra",
title = "{Bahurupi}: a polymorphic heterogeneous multi-core
architecture",
journal = j-TACO,
volume = "8",
number = "4",
pages = "22:1--22:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086701",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Computing systems have made an irreversible transition
towards parallel architectures with the emergence of
multi-cores. Moreover, power and thermal limits in
embedded systems mandate the deployment of many simpler
cores rather than a few complex cores on chip. Consumer
electronic devices, on the other hand, need to support
an ever-changing set of diverse applications with
varying performance demands. While some applications
can benefit from thread-level parallelism offered by
multi-core solutions, there still exist a large number
of applications with substantial amount of sequential
code. The sequential programs suffer from limited
exploitation of instruction-level parallelism in simple
cores. We propose a reconfigurable multi-core
architecture, called Bahurupi, that can successfully
reconcile the conflicting demands of instruction-level
and thread-level parallelism.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cleemput:2012:CMT,
author = "Jeroen V. Cleemput and Bart Coppens and Bjorn {De
Sutter}",
title = "Compiler mitigations for time attacks on modern x86
processors",
journal = j-TACO,
volume = "8",
number = "4",
pages = "23:1--23:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086702",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This paper studies and evaluates the extent to which
automated compiler techniques can defend against
timing-based side channel attacks on modern x86
processors. We study how modern x86 processors can leak
timing information through side channels that relate to
data flow. We study the efficiency, effectiveness,
portability, predictability and sensitivity of several
mitigating code transformations that eliminate or
minimize key-dependent execution time variations.
Furthermore, we discuss the extent to which compiler
backends are a suitable tool to provide automated
support for the proposed mitigations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mccandless:2012:CTI,
author = "Jason Mccandless and David Gregg",
title = "Compiler techniques to improve dynamic branch
prediction for indirect jump and call instructions",
journal = j-TACO,
volume = "8",
number = "4",
pages = "24:1--24:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086703",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Indirect jump instructions are used to implement
multiway branch statements and virtual function calls
in object-oriented languages. Branch behavior can have
significant impact on program performance, but
fortunately hardware predictors can alleviate much of
the risk. Modern processors include indirect branch
predictors which use part of the target address to
update a global history. We present a code generation
technique to maximize the branch history information
available to the predictor. We implement our
optimization as an assembly language transformation,
and evaluate it for SPEC benchmarks and interpreters
using simulated and real hardware, showing indirect
branch misprediction decreases.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Garcia-Guirado:2012:DDA,
author = "Antonio Garc{\'\i}a-Guirado and Ricardo
Fern{\'a}ndez-Pascual and Alberto Ros and Jos{\'e} M.
Garc{\'\i}a",
title = "{DAPSCO}: Distance-aware partially shared cache
organization",
journal = j-TACO,
volume = "8",
number = "4",
pages = "25:1--25:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086704",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many-core tiled CMP proposals often assume a partially
shared last level cache (LLC) since this provides a
good compromise between access latency and cache
utilization. In this paper, we propose a novel way to
map memory addresses to LLC banks that takes into
account the average distance between the banks and the
tiles that access them. Contrary to traditional
approaches, our mapping does not group the tiles in
clusters within which all the cores access the same
bank for the same addresses. Instead, two neighboring
cores access different sets of banks minimizing the
average distance travelled by the cache requests.
Results for a 64-core CMP show that our proposal
improves both execution time and the energy consumed by
the network by 13\% when compared to a traditional
mapping.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2012:FSS,
author = "Zhenjiang Wang and Chenggang Wu and Pen-Chung Yew and
Jianjun Li and Di Xu",
title = "On-the-fly structure splitting for heap objects",
journal = j-TACO,
volume = "8",
number = "4",
pages = "26:1--26:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086705",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the advent of multicore systems, the gap between
processor speed and memory latency has grown worse
because of their complex interconnect. Sophisticated
techniques are needed more than ever to improve an
application's spatial and temporal locality. This paper
describes an optimization that aims to improve heap
data layout by structure-splitting. It also provides
runtime address checking by piggybacking on the
existing page protection mechanism to guarantee the
correctness of such optimization that has eluded many
previous attempts due to safety concerns. The technique
can be applied to both sequential and parallel programs
at either compile time or runtime. However, we focus
primarily on sequential programs (i.e., single-threaded
programs) at runtime in this paper.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Das:2012:ELC,
author = "Dibyendu Das and B. Dupont {De Dinechin} and
Ramakrishna Upadrasta",
title = "Efficient liveness computation using merge sets and
{DJ}-graphs",
journal = j-TACO,
volume = "8",
number = "4",
pages = "27:1--27:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086706",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this work we devise an efficient algorithm that
computes the liveness information of program variables.
The algorithm employs SSA form and DJ-graphs as
representation to build Merge sets. The Merge set of
node n, M(n) is based on the structure of the Control
Flow Graph (CFG) and consists of all nodes where a
{\phi}-function needs to be placed, if a definition of
a variable appears in n. The merge sets of a CFG can be
computed using DJ-graphs without prior knowledge of how
the variables are used and defined. Later, we can
answer the liveness query (as a part of other
optimization or analysis phase) by utilizing the
knowledge of the use/def of variables, the dominator
tree and the pre-computed merge sets.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Patsilaras:2012:EEM,
author = "George Patsilaras and Niket K. Choudhary and James
Tuck",
title = "Efficiently exploiting memory level parallelism on
asymmetric coupled cores in the dark silicon era",
journal = j-TACO,
volume = "8",
number = "4",
pages = "28:1--28:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086707",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Extracting high memory-level parallelism (MLP) is
essential for speeding up single-threaded applications
which are memory bound. At the same time, the projected
amount of dark silicon (the fraction of the chip
powered off) on a chip is growing. Hence, Asymmetric
Multicore Processors (AMP) offer a unique opportunity
to integrate many types of cores, each powered at
different times, in order to optimize for different
regions of execution. In this work, we quantify the
potential for exploiting core customization to speedup
programs during regions of high MLP. Based on a careful
design space exploration, we discover that an AMP that
includes a narrow and fast specialized core has the
potential to efficiently exploit MLP.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Malits:2012:ELG,
author = "Roman Malits and Evgeny Bolotin and Avinoam Kolodny
and Avi Mendelson",
title = "Exploring the limits of {GPGPU} scheduling in control
flow bound applications",
journal = j-TACO,
volume = "8",
number = "4",
pages = "29:1--29:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086708",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPGPUs are optimized for graphics, for that reason the
hardware is optimized for massively data parallel
applications characterized by predictable memory access
patterns and little control flow. For such
applications' e.g., matrix multiplication, GPGPU based
system can achieve very high performance. However, many
general purpose data parallel applications are
characterized as having intensive control flow and
unpredictable memory access patterns. Optimizing the
code in such problems for current hardware is often
ineffective and even impractical since it exhibits low
hardware utilization leading to relatively low
performance. This work tracks the root causes of
execution inefficacies when running control flow
intensive CUDA applications on NVIDIA GPGPU hardware.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Orosa:2012:FIF,
author = "Lois Orosa and Elisardo Antelo and Javier D.
Bruguera",
title = "{FlexSig}: {Implementing} flexible hardware
signatures",
journal = j-TACO,
volume = "8",
number = "4",
pages = "30:1--30:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086709",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the advent of chip multiprocessors, new
techniques have been developed to make parallel
programming easier and more reliable. New parallel
programming paradigms and new methods of making the
execution of programs more efficient and more reliable
have been developed. Usually, these improvements
require hardware support to avoid a system slowdown.
Signatures based on Bloom filters are widely used as
hardware support for parallel programming in chip
multiprocessors. Signatures are used in Transactional
Memory, thread-level speculation, parallel debugging,
deterministic replay and other tools and applications.
The main limitation of hardware signatures is the lack
of flexibility: if signatures are designed with a given
configuration, tailored to the requirements of a
specific tool or application, it is likely that they do
not fit well for other different requirements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Titos-Gil:2012:HTM,
author = "Ruben Titos-Gil and Manuel E. Acacio and Jose M.
Garcia and Tim Harris and Adrian Cristal and Osman
Unsal and Ibrahim Hur and Mateo Valero",
title = "Hardware transactional memory with software-defined
conflicts",
journal = j-TACO,
volume = "8",
number = "4",
pages = "31:1--31:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086710",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this paper we investigate the benefits of turning
the concept of transactional conflict from its
traditionally fixed definition into a variable one that
can be dynamically controlled in software. We propose
the extension of the atomic language construct with an
attribute that specifies the definition of conflict, so
that programmers can write code which adjusts what
kinds of conflicts are to be detected, relaxing or
tightening the conditions according to the forms of
interference that can be tolerated by a particular
algorithm. Using this performance-motivated construct,
specific conflict information can be associated with
portions of code, as each transaction is provided with
a local definition that applies while it executes.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kim:2012:IPN,
author = "Yongjoo Kim and Jongeun Lee and Toan X. Mai and
Yunheung Paek",
title = "Improving performance of nested loops on
reconfigurable array processors",
journal = j-TACO,
volume = "8",
number = "4",
pages = "32:1--32:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086711",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Pipelining algorithms are typically concerned with
improving only the steady-state performance, or the
kernel time. The pipeline setup time happens only once
and therefore can be negligible compared to the kernel
time. However, for Coarse-Grained Reconfigurable
Architectures (CGRAs) used as a coprocessor to a main
processor, pipeline setup can take much longer due to
the communication delay between the two processors, and
can become significant if it is repeated in an outer
loop of a loop nest. In this paper we evaluate the
overhead of such non-kernel execution times when
mapping nested loops for CGRAs, and propose a novel
architecture-compiler cooperative scheme to reduce the
overhead, while also minimizing the number of extra
configurations required.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Purnaprajna:2012:MWI,
author = "Madhura Purnaprajna and Paolo Ienne",
title = "Making wide-issue {VLIW} processors viable on
{FPGAs}",
journal = j-TACO,
volume = "8",
number = "4",
pages = "33:1--33:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086712",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Soft and highly-customized processors are emerging as
a common way to efficiently control large amount of
computing resources available on FPGAs. Yet, some
processor architectures of choice for DSP and media
applications, such as wide-issue VLIW processors,
remain impractical: the multi-ported register file
makes a very inefficient use of the resources in the
FPGA fabric. This paper proposes modifications to
existing FPGAs to make soft-VLIW processor viable. We
introduce an embedded multi-ported RAM that can be
customized to match the issue-width of VLIW processors.
To ascertain the benefits of this approach, we map an
extensible VLIW processor onto a standard FPGA from
Xilinx.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Radojkovic:2012:EIS,
author = "Petar Radojkovi{\'c} and Sylvain Girbal and Arnaud
Grasset and Eduardo Qui{\~n}ones and Sami Yehia and
Francisco J. Cazorla",
title = "On the evaluation of the impact of shared resources in
multithreaded {COTS} processors in time-critical
environments",
journal = j-TACO,
volume = "8",
number = "4",
pages = "34:1--34:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086713",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Commercial Off-The-Shelf (COTS) processors are now
commonly used in real-time embedded systems. The
characteristics of these processors fulfill system
requirements in terms of time-to-market, low cost, and
high performance-per-watt ratio. However, multithreaded
(MT) processors are still not widely used in real-time
systems because the timing analysis is too complex. In
MT processors, simultaneously-running tasks share and
compete for processor resources, so the timing analysis
has to estimate the possible impact that the inter-task
interferences have on the execution time of the
applications. In this paper, we propose a method that
quantifies the slowdown that simultaneously-running
tasks may experience due to collision in shared
processor resources.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Domnitser:2012:NMC,
author = "Leonid Domnitser and Aamer Jaleel and Jason Loew and
Nael Abu-Ghazaleh and Dmitry Ponomarev",
title = "Non-monopolizable caches: Low-complexity mitigation of
cache side channel attacks",
journal = j-TACO,
volume = "8",
number = "4",
pages = "35:1--35:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086714",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose a flexibly-partitioned cache design that
either drastically weakens or completely eliminates
cache-based side channel attacks. The proposed
Non-Monopolizable (NoMo) cache dynamically reserves
cache lines for active threads and prevents other
co-executing threads from evicting reserved lines.
Unreserved lines remain available for dynamic sharing
among threads. NoMo requires only simple modifications
to the cache replacement logic, making it
straightforward to adopt. It requires no software
support enabling it to automatically protect
pre-existing binaries. NoMo results in performance
degradation of about 1\% on average. We demonstrate
that NoMo can provide strong security guarantees for
the AES and Blowfish encryption algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rico:2012:SLS,
author = "Alejandro Rico and Felipe Cabarcas and Carlos
Villavieja and Milan Pavlovic and Augusto Vega and Yoav
Etsion and Alex Ramirez and Mateo Valero",
title = "On the simulation of large-scale architectures using
multiple application abstraction levels",
journal = j-TACO,
volume = "8",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086715",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Simulation is a key tool for computer architecture
research. In particular, cycle-accurate simulators are
extremely important for microarchitecture exploration
and detailed design decisions, but they are slow and,
so, not suitable for simulating large-scale
architectures, nor are they meant for this. Moreover,
microarchitecture design decisions are irrelevant, or
even misleading, for early processor design stages and
high-level explorations. This allows one to raise the
abstraction level of the simulated architecture, and
also the application abstraction level, as it does not
necessarily have to be represented as an instruction
stream. In this paper we introduce a definition of
different application abstraction levels, and how these
are employed in TaskSim, a multi-core architecture
simulator, to provide several architecture modeling
abstractions, and simulate large-scale architectures
with hundreds of cores.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Saidi:2012:OED,
author = "Selma Saidi and Pranav Tendulkar and Thierry Lepley
and Oded Maler",
title = "Optimizing explicit data transfers for data parallel
applications on the {Cell} architecture",
journal = j-TACO,
volume = "8",
number = "4",
pages = "37:1--37:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086716",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this paper we investigate a general approach to
automate some deployment decisions for a certain class
of applications on multi-core computers. We consider
data-parallelizable programs that use the well-known
double buffering technique to bring the data from the
off-chip slow memory to the local memory of the cores
via a DMA (direct memory access) mechanism. Based on
the computation time and size of elementary data items
as well as DMA characteristics, we derive optimal and
near optimal values for the number of blocks that
should be clustered in a single DMA command. We then
extend the results to the case where a computation for
one data item needs some data in its neighborhood.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Feng:2012:PPL,
author = "Min Feng and Changhui Lin and Rajiv Gupta",
title = "{PLDS}: Partitioning linked data structures for
parallelism",
journal = j-TACO,
volume = "8",
number = "4",
pages = "38:1--38:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086717",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recently, parallelization of computations in the
presence of dynamic data structures has shown promising
potential. In this paper, we present PLDS, a system for
easily expressing and efficiently exploiting
parallelism in computations that are based on dynamic
linked data structures. PLDS improves the execution
efficiency by providing support for data partitioning
and then distributing computation across threads based
on the partitioning. Such computations often require
the use of speculation to exploit dynamic parallelism.
PLDS supports a conditional speculation mechanism that
reduces the cost of speculation. PLDS can be employed
in the context of different forms of parallelism, which
to cover a wide range of parallel applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pradelle:2012:PPB,
author = "Benoit Pradelle and Alain Ketterlin and Philippe
Clauss",
title = "Polyhedral parallelization of binary code",
journal = j-TACO,
volume = "8",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086718",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many automatic software parallelization systems have
been proposed in the past decades, but most of them are
dedicated to source-to-source transformations. This
paper shows that parallelizing executable programs is
feasible, even if they require complex transformations,
and in effect decouples parallelization from
compilation, for example, for closed-source or legacy
software, where binary code is the only available
representation. We propose an automatic parallelizer,
which is able to perform advanced parallelization on
binary code. It first parses the binary code and
extracts high-level information. From this information,
a C program is generated. This program captures only a
subset of the program semantics, namely, loops and
memory accesses.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dong:2012:RAE,
author = "Yaozu Dong and Yu Chen and Zhenhao Pan and Jinquan Dai
and Yunhong Jiang",
title = "{ReNIC}: Architectural extension to {SR-IOV} {I/O}
virtualization for efficient replication",
journal = j-TACO,
volume = "8",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086719",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Virtualization is gaining popularity in cloud
computing and has become the key enabling technology in
cloud infrastructure. By replicating the virtual server
state to multiple independent platforms, virtualization
improves the reliability and availability of cloud
systems. Unfortunately, existing Virtual Machine (VM)
replication solutions were designed only for software
virtualized I/O, which suffers from large performance
and scalability overheads. Although hardware-assisted
I/O virtualization (such as SR-IOV) can achieve close
to native performance and very good scalability, they
cannot be properly replicated across different physical
machines due to architectural limitations (such as lack
of efficient device state read/write, buffering
outbound packets, etc.) .",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bruintjes:2012:SLA,
author = "Tom M. Bruintjes and Karel H. G. Walters and Sabih H.
Gerez and Bert Molenkamp and Gerard J. M. Smit",
title = "{Sabrewing}: a lightweight architecture for combined
floating-point and integer arithmetic",
journal = j-TACO,
volume = "8",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086720",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In spite of the fact that floating-point arithmetic is
costly in terms of silicon area, the joint design of
hardware for floating-point and integer arithmetic is
seldom considered. While components like multipliers
and adders can potentially be shared, floating-point
and integer units in contemporary processors are
practically disjoint. This work presents a new
architecture which tightly integrates floating-point
and integer arithmetic in a single datapath. It is
mainly intended for use in low-power embedded digital
signal processors and therefore the following design
constraints were important: limited use of pipelining
for the convenience of the compiler; maintaining
compatibility with existing technology; minimal area
and power consumption for applicability in embedded
systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kicherer:2012:SPA,
author = "Mario Kicherer and Fabian Nowak and Rainer Buchty and
Wolfgang Karl",
title = "Seamlessly portable applications: Managing the
diversity of modern heterogeneous systems",
journal = j-TACO,
volume = "8",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086721",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Nowadays, many possible configurations of
heterogeneous systems exist, posing several new
challenges to application development: different types
of processing units usually require individual
programming models with dedicated runtime systems and
accompanying libraries. If these are absent on an
end-user system, e.g. because the respective hardware
is not present, an application linked against these
will break. This handicaps portability of applications
being developed on one system and executed on other,
differently configured heterogeneous systems. Moreover,
the individual profit of different processing units is
normally not known in advance. In this work, we propose
a technique to effectively decouple applications from
their accelerator-specific parts, respectively code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Premillieu:2012:SSR,
author = "Nathanael Premillieu and Andre Seznec",
title = "{SYRANT}: {SYmmetric Resource Allocation on Not-taken
and Taken} paths",
journal = j-TACO,
volume = "8",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086722",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the multicore era, achieving ultimate single
process performance is still an issue e.g. for single
process workload or for sequential sections in parallel
applications. Unfortunately, despite tremendous
research effort on branch prediction, substantial
performance potential is still wasted due to branch
mispredictions. On a branch misprediction resolution,
instruction treatment on the wrong path is essentially
thrown away. However, in most cases after a conditional
branch, the taken and the not-taken paths of execution
merge after a few instructions. Instructions that
follow the reconvergence point are executed whatever
the branch outcome is. We present SYRANT (SYmmetric
Resource Allocation on Not-taken and Taken paths), a
new technique for exploiting control independence.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hasenplaugh:2012:GBC,
author = "William Hasenplaugh and Pritpal S. Ahuja and Aamer
Jaleel and Simon {Steely, Jr.} and Joel Emer",
title = "The gradient-based cache partitioning algorithm",
journal = j-TACO,
volume = "8",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086723",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This paper addresses the problem of partitioning a
cache between multiple concurrent threads and in the
presence of hardware prefetching. Cache replacement
designed to preserve temporal locality (e.g., LRU) will
allocate cache resources proportional to the miss-rate
of each competing thread irrespective of whether the
cache space will be utilized [Qureshi and Patt 2006].
This is clearly suboptimal as applications vary
dramatically in their use of recently accessed data. We
address this problem by partitioning a shared cache
such that a global goodness metric is optimized. This
paper introduces the Gradient-based Cache Partitioning
Algorithm (GPA), whose variants optimize either
hitrate, total instructions per cycle (IPC) or a
weighted IPC metric designed to enforce Quality of
Service (QoS) [Iyer 2004].",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lira:2012:MPA,
author = "Javier Lira and Timothy M. Jones and Carlos Molina and
Antonio Gonz{\'a}lez",
title = "The migration prefetcher: Anticipating data promotion
in dynamic {NUCA} caches",
journal = j-TACO,
volume = "8",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086724",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The exponential increase in multicore processor (CMP)
cache sizes accompanied by growing on-chip wire delays
make it difficult to implement traditional caches with
a single, uniform access latency. Non-Uniform Cache
Architecture (NUCA) designs have been proposed to
address this problem. A NUCA divides the whole cache
memory into smaller banks and allows banks nearer a
processor core to have lower access latencies than
those further away, thus mitigating the effects of the
cache's internal wires. Determining the best placement
for data in the NUCA cache at any particular moment
during program execution is crucial for exploiting the
benefits that this architecture provides.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pusukuri:2012:TTD,
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
Bhuyan",
title = "Thread Tranquilizer: Dynamically reducing performance
variation",
journal = j-TACO,
volume = "8",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086725",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To realize the performance potential of multicore
systems, we must effectively manage the interactions
between memory reference behavior and the operating
system policies for thread scheduling and migration
decisions. We observe that these interactions lead to
significant variations in the performance of a given
application, from one execution to the next, even when
the program input remains unchanged and no other
applications are being run on the system. Our
experiments with multithreaded programs, including the
TATP database application, SPECjbb2005, and a subset of
PARSEC and SPEC OMP programs, on a 24-core Dell
PowerEdge R905 server running OpenSolaris confirms the
above observation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhang:2012:TPB,
author = "Dongsong Zhang and Deke Guo and Fangyuan Chen and Fei
Wu and Tong Wu and Ting Cao and Shiyao Jin",
title = "{TL}-plane-based multi-core energy-efficient real-time
scheduling algorithm for sporadic tasks",
journal = j-TACO,
volume = "8",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086726",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As the energy consumption of multi-core systems
becomes increasingly prominent, it's a challenge to
design an energy-efficient real-time scheduling
algorithm in multi-core systems for reducing the system
energy consumption while guaranteeing the feasibility
of real-time tasks. In this paper, we focus on
multi-core processors, with the global Dynamic Voltage
Frequency Scaling (DVFS) and Dynamic Power Management
(DPM) technologies. In this setting, we propose an
energy-efficient real-time scheduling algorithm, the
Time Local remaining execution plane based Dynamic
Voltage Frequency Scaling (TL-DVFS). TL-DVFS utilizes
the concept of Time Local remaining execution (TL)
plane to dynamically scale the voltage and frequency of
a processor at the initial time of each TL plane as
well as at the release time of a sporadic task in each
TL plane.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lyons:2012:ASS,
author = "Michael J. Lyons and Mark Hempstead and Gu-Yeon Wei
and David Brooks",
title = "The accelerator store: a shared memory framework for
accelerator-based systems",
journal = j-TACO,
volume = "8",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086727",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This paper presents the many-accelerator architecture,
a design approach combining the scalability of
homogeneous multi-core architectures and
system-on-chip's high performance and power-efficient
hardware accelerators. In preparation for systems
containing tens or hundreds of accelerators, we
characterize a diverse pool of accelerators and find
each contains significant amounts of SRAM memory (up to
90\% of their area). We take advantage of this
discovery and introduce the accelerator store, a
scalable architectural component to minimize
accelerator area by sharing its memories between
accelerators. We evaluate the accelerator store for two
applications and find significant system area
reductions (30\%) in exchange for small overheads (2\%
performance, 0\%--8\% energy).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Orozco:2012:THT,
author = "Daniel Orozco and Elkin Garcia and Rishi Khan and
Kelly Livingston and Guang R. Gao",
title = "Toward high-throughput algorithms on many-core
architectures",
journal = j-TACO,
volume = "8",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086728",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Advanced many-core CPU chips already have a few
hundreds of processing cores (e.g., 160 cores in an IBM
Cyclops-64 chip) and more and more processing cores
become available as computer architecture progresses.
The underlying runtime systems of such architectures
need to efficiently serve hundreds of processors at the
same time, requiring all basic data structures within
the runtime to maintain unprecedented throughput. In
this paper, we analyze the throughput requirements that
must be met by algorithms in runtime systems to be able
to handle hundreds of simultaneous operations in real
time. We reach a surprising conclusion: Many
traditional algorithm techniques are poorly suited for
highly parallel computing environments because of their
low throughput.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Stock:2012:UML,
author = "Kevin Stock and Louis-No{\"e}l Pouchet and P.
Sadayappan",
title = "Using machine learning to improve automatic
vectorization",
journal = j-TACO,
volume = "8",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086729",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Automatic vectorization is critical to enhancing
performance of compute-intensive programs on modern
processors. However, there is much room for improvement
over the auto-vectorization capabilities of current
production compilers through careful vector-code
synthesis that utilizes a variety of loop
transformations (e.g., unroll-and-jam, interchange,
etc.) . As the set of transformations considered is
increased, the selection of the most effective
combination of transformations becomes a significant
challenge: Currently used cost models in vectorizing
compilers are often unable to identify the best
choices. In this paper, we address this problem using
machine learning models to predict the performance of
SIMD codes. In contrast to existing approaches that
have used high-level features of the program, we
develop machine learning models based on features
extracted from the generated assembly code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Therdsteerasukdi:2012:URI,
author = "Kanit Therdsteerasukdi and Gyungsu Byun and Jason Cong
and M. Frank Chang and Glenn Reinman",
title = "Utilizing {RF-I} and intelligent scheduling for better
throughput\slash watt in a mobile {GPU} memory system",
journal = j-TACO,
volume = "8",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086730",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Smartphones and tablets are becoming more and more
powerful, replacing desktops and laptops as the users'
main computing system. As these systems support higher
and higher resolutions with more complex 3D graphics, a
high-throughput and low-power memory system is
essential for the mobile GPU. In this article, we
propose to improve throughput/watt in a mobile GPU
memory system by using intelligent scheduling to reduce
power and multi-band radio frequency interconnect
(MRF-I) to offset any throughput degradation caused by
our intelligent scheduling.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ryckbosch:2012:VSM,
author = "Frederick Ryckbosch and Stijn Polfliet and Lieven
Eeckhout",
title = "{VSim}: Simulating multi-server setups at near native
hardware speed",
journal = j-TACO,
volume = "8",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086731",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Simulating contemporary computer systems is a
challenging endeavor, especially when it comes to
simulating high-end setups involving multiple servers.
The simulation environment needs to run complete
software stacks, including operating systems,
middleware, and application software, and it needs to
simulate network and disk activity next to CPU
performance. In addition, it needs the ability to scale
out to a large number of server nodes while attaining
good accuracy and reasonable simulation speeds. This
paper presents VSim, a novel simulation methodology for
multi-server systems. VSim leverages virtualization
technology for simulating a target system on a host
system. VSim controls CPU, network and disk performance
on the host, and it gives the illusion to the software
stack to run on a target system through time
dilation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2012:WAP,
author = "Miao Zhou and Yu Du and Bruce Childers and Rami Melhem
and Daniel Moss{\'e}",
title = "Writeback-aware partitioning and replacement for
last-level caches in phase change main memory systems",
journal = j-TACO,
volume = "8",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086732",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Phase-Change Memory (PCM) has emerged as a promising
low-power main memory candidate to replace DRAM. The
main problems of PCM are that writes are much slower
and more power hungry than reads, write bandwidth is
much lower than read bandwidth, and limited write
endurance. Adding an extra layer of cache, which is
logically the last-level cache (LLC), can mitigate the
drawbacks of PCM. However, writebacks from the LLC
might (a) overwhelm the limited PCM write bandwidth and
stall the application, (b) shorten lifetime, and (c)
increase energy consumption. Cache partitioning and
replacement schemes are important to achieve high
throughput for multi-core systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2012:TMA,
author = "Qingping Wang and Sameer Kulkarni and John Cavazos and
Michael Spear",
title = "A transactional memory with automatic performance
tuning",
journal = j-TACO,
volume = "8",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086733",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A significant obstacle to the acceptance of
transactional memory (TM) in real-world parallel
programs is the abundance of substantially different TM
algorithms. Each TM algorithm appears well-suited to
certain workload characteristics, but the best choice
of algorithm is sensitive to program inputs, available
cores, and program phases. Furthermore, operating
system and hardware characteristics can affect which
algorithm is best, with tradeoffs changing across
iterations of a single ISA. This paper introduces
methods for constructing policies to dynamically select
the most appropriate TM algorithm based on static and
dynamic information. We leverage intraprocedural static
analysis to create a static profile of the
application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bogdanski:2012:SFC,
author = "Bartosz Bogdanski and Sven-Arne Reinemo and Frank Olaf
Sem-Jacobsen and Ernst Gunnar Gran",
title = "{sFtree}: a fully connected and deadlock-free
switch-to-switch routing algorithm for fat-trees",
journal = j-TACO,
volume = "8",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2086696.2086734",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Sat Jan 21 07:49:49 MST 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Existing fat-tree routing algorithms fully exploit the
path diversity of a fat-tree topology in the context of
compute node traffic, but they lack support for
deadlock-free and fully connected switch-to-switch
communication. Such support is crucial for efficient
system management, for example, in InfiniBand (IB)
systems. With the general increase in system management
capabilities found in modern InfiniBand switches, the
lack of deadlock-free switch-to-switch communication is
a problem for fat-tree-based IB installations because
management traffic might cause routing deadlocks that
bring the whole system down. This lack of deadlock-free
communication affects all system management and
diagnostic tools using LID routing. In this paper, we
propose the sFtree routing algorithm that guarantees
deadlock-free and fully connected switch-to-switch
communication in fat-trees while maintaining the
properties of the current fat-tree algorithm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ghandour:2012:LSB,
author = "Walid J. Ghandour and Haitham Akkary and Wes Masri",
title = "Leveraging Strength-Based Dynamic Information Flow
Analysis to Enhance Data Value Prediction",
journal = j-TACO,
volume = "9",
number = "1",
pages = "1:1--1:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133383",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Value prediction is a technique to increase
parallelism by attempting to overcome serialization
constraints caused by true data dependences. By
predicting the outcome of an instruction before it
executes, value prediction allows data dependent
instructions to issue and execute speculatively, hence
increasing parallelism when the prediction is correct.
In case of a misprediction, the execution is redone
with the corrected value. If the benefit from increased
parallelism outweighs the misprediction recovery
penalty, overall performance could be improved.
Enhancing performance with value prediction therefore
requires highly accurate prediction methods. Most
existing general value prediction techniques are local,
that is, future outputs of an instruction are predicted
based on outputs from previous executions of the same
instruction. In this article, we investigate leveraging
strength-based dynamic information flow analysis to
enhance data value prediction. We use dynamic
information flow analysis (DIFA) to determine when a
specific value predictor can perform well and even
outperform other predictors. We apply information
theory to mathematically prove the validity and
benefits of correlating value predictors. We also
introduce the concept of the linear value predictors, a
new technique that predicts a new value from another
one using a linear relation. We finally present a
variant of stride predictor that we call update stride.
We then conduct an empirical analysis using Pin, a
dynamic binary instrumentation tool, and DynFlow, a
dynamic information flow analysis tool, that we apply
to programs from the SPECjvm2008 and Siemens
benchmarks. Our empirical measurements support our
mathematical theory and allow us to make important
observations on the relation between predictability of
data values and information flow. Our analysis and
empirical results show that the values of a set of
selected variables can be predicted with a very high
accuracy, up to 100\%. Such prediction is based on the
previous history and/or the values of one or more other
source variables that have strong information flow into
the predicted variable. Using our selection criteria,
we show that a DIFA-directed predictor outperforms
hardware value prediction for all subject programs, and
sometimes by a significant margin. This was observed
even when using an ideal tagged hardware value
prediction table that does not suffer from aliasing or
capacity misses.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2012:WPW,
author = "Jaekyu Lee and Hyesoon Kim and Richard Vuduc",
title = "When Prefetching Works, When It Doesn't, and Why",
journal = j-TACO,
volume = "9",
number = "1",
pages = "2:1--2:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133384",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In emerging and future high-end processor systems,
tolerating increasing cache miss latency and properly
managing memory bandwidth will be critical to achieving
high performance. Prefetching, in both hardware and
software, is among our most important available
techniques for doing so; yet, we claim that prefetching
is perhaps also the least well-understood. Thus, the
goal of this study is to develop a novel, foundational
understanding of both the benefits and limitations of
hardware and software prefetching. Our study includes:
source code-level analysis, to help in understanding
the practical strengths and weaknesses of compiler- and
software-based prefetching; a study of the synergistic
and antagonistic effects between software and hardware
prefetching; and an evaluation of hardware prefetching
training policies in the presence of software
prefetching requests. We use both simulation and
measurement on real systems. We find, for instance,
that although there are many opportunities for
compilers to prefetch much more aggressively than they
currently do, there is also a tangible risk of
interference with training existing hardware
prefetching mechanisms. Taken together, our
observations suggest new research directions for
cooperative hardware/software prefetching.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mazloom:2012:DTI,
author = "Bita Mazloom and Shashidhar Mysore and Mohit Tiwari
and Banit Agrawal and Tim Sherwood",
title = "Dataflow Tomography: Information Flow Tracking For
Understanding and Visualizing Full Systems",
journal = j-TACO,
volume = "9",
number = "1",
pages = "3:1--3:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133385",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "It is not uncommon for modern systems to be composed
of a variety of interacting services, running across
multiple machines in such a way that most developers do
not really understand the whole system. As abstraction
is layered atop abstraction, developers gain the
ability to compose systems of extraordinary complexity
with relative ease. However, many software properties,
especially those that cut across abstraction layers,
become very difficult to understand in such
compositions. The communication patterns involved, the
privacy of critical data, and the provenance of
information, can be difficult to find and understand,
even with access to all of the source code. The goal of
Dataflow Tomography is to use the inherent information
flow of such systems to help visualize the interactions
between complex and interwoven components across
multiple layers of abstraction. In the same way that
the injection of short-lived radioactive isotopes helps
doctors trace problems in the cardiovascular system,
the use of ``data tagging'' can help developers slice
through the extraneous layers of software and pinpoint
those portions of the system interacting with the data
of interest. To demonstrate the feasibility of this
approach we have developed a prototype system in which
tags are tracked both through the machine and in
between machines over the network, and from which novel
visualizations of the whole system can be derived. We
describe the system-level challenges in creating a
working system tomography tool and we qualitatively
evaluate our system by examining several example real
world scenarios.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ahn:2012:ISE,
author = "Jung Ho Ahn and Norman P. Jouppi and Christos
Kozyrakis and Jacob Leverich and Robert S. Schreiber",
title = "Improving System Energy Efficiency with Memory Rank
Subsetting",
journal = j-TACO,
volume = "9",
number = "1",
pages = "4:1--4:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133386",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "VLSI process technology scaling has enabled dramatic
improvements in the capacity and peak bandwidth of DRAM
devices. However, current standard DDRx DIMM memory
interfaces are not well tailored to achieve high energy
efficiency and performance in modern
chip-multiprocessor-based computer systems. Their
suboptimal performance and energy inefficiency can have
a significant impact on system-wide efficiency since
much of the system power dissipation is due to memory
power. New memory interfaces, better suited for future
many-core systems, are needed. In response, there are
recent proposals to enhance the energy efficiency of
main-memory systems by dividing a memory rank into
subsets, and making a subset rather than a whole rank
serve a memory request. We holistically assess the
effectiveness of rank subsetting from system-wide
performance, energy-efficiency, and reliability
perspectives. We identify the impact of rank subsetting
on memory power and processor performance analytically,
compare two promising rank-subsetting proposals,
Multicore DIMM and mini-rank, and verify our analysis
by simulating a chip-multiprocessor system using
multithreaded and consolidated workloads. We extend the
design of Multicore DIMM for high-reliability systems
and show that compared with conventional chipkill
approaches, rank subsetting can lead to much higher
system-level energy efficiency and performance at the
cost of additional DRAM devices. This holistic
assessment shows that rank subsetting offers compelling
alternatives to existing processor-memory interfaces
for future DDR systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yang:2012:CGC,
author = "Xuejun Yang and Li Wang and Jingling Xue and Qingbo
Wu",
title = "Comparability Graph Coloring for Optimizing
Utilization of Software-Managed Stream Register Files
for Stream Processors",
journal = j-TACO,
volume = "9",
number = "1",
pages = "5:1--5:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133387",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The stream processors represent a promising
alternative to traditional cache-based general-purpose
processors in achieving high performance in stream
applications (media and some scientific applications).
In a stream programming model for stream processors, an
application is decomposed into a sequence of kernels
operating on streams of data. During the execution of a
kernel on a stream processor, all streams accessed must
be communicated through a nonbypassing software-managed
on-chip memory, the SRF (Stream Register File).
Optimizing utilization of the scarce on-chip memory is
crucial for good performance. The key insight is that
the interference graphs (IGs) formed by the streams in
stream applications tend to be comparability graphs or
decomposable into a set of comparability graphs. We
present a compiler algorithm for finding optimal or
near-optimal colorings, that is, SRF allocations in
stream IGs, by computing a maximum spanning forest of
the sub-IG formed by long live ranges, if necessary.
Our experimental results validate the optimality and
near-optimality of our algorithm by comparing it with
an ILP solver, and show that our algorithm yields
improved SRF utilization over the First-Fit bin-packing
algorithm, the best in the literature.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Majumdar:2012:MPE,
author = "Abhinandan Majumdar and Srihari Cadambi and Michela
Becchi and Srimat T. Chakradhar and Hans Peter Graf",
title = "A Massively Parallel, Energy Efficient Programmable
Accelerator for Learning and Classification",
journal = j-TACO,
volume = "9",
number = "1",
pages = "6:1--6:??",
month = mar,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2133382.2133388",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 30 17:45:35 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Applications that use learning and classification
algorithms operate on large amounts of unstructured
data, and have stringent performance constraints. For
such applications, the performance of general purpose
processors scales poorly with data size because of
their limited support for fine-grained parallelism and
absence of software-managed caches. The large
intermediate data in these applications also limits
achievable performance on many-core processors such as
GPUs. To accelerate such learning applications, we
present a programmable accelerator that can execute
multiple learning and classification algorithms. To
architect such an accelerator, we profile five
representative workloads, and find that their
computationally intensive portions can be formulated as
matrix or vector operations generating large amounts of
intermediate data, which are then reduced by a
secondary operation such as array ranking, finding
max/min and aggregation. Our proposed accelerator,
called MAPLE, has hundreds of simple processing
elements (PEs) laid out in a two-dimensional grid, with
two key features. First, it uses dynamic in-memory
processing where on-chip memory blocks perform the
secondary reduction operations. Second, MAPLE uses
banked off-chip memory, and organizes its PEs into
independent groups each with its own off-chip memory
bank. These two features allow MAPLE to scale its
performance with data size. We also present an
Atom-based energy-efficient heterogeneous system with MAPLE
as the accelerator that satisfies the application's
performance requirements at a lower system power. This
article describes the MAPLE architecture, explores its
design space with a simulator, illustrates how to
automatically map application kernels to the hardware,
and presents its performance improvement and energy
benefits over classic server-based implementations. We
implement a 512-PE FPGA prototype of MAPLE and find
that it is 1.5--10x faster than a 2.5 GHz quad-core Xeon
processor despite running at a modest 125 MHz clock
rate. With MAPLE connected to a 1.6 GHz dual-core Atom,
we show an energy improvement of 38--84\% over the Xeon
server coupled to a 1.3 GHz 240 core Tesla GPU.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Eyerman:2012:PMJ,
author = "Stijn Eyerman and Lieven Eeckhout",
title = "Probabilistic modeling for job symbiosis scheduling on
{SMT} processors",
journal = j-TACO,
volume = "9",
number = "2",
pages = "7:1--7:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207223",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Symbiotic job scheduling improves simultaneous
multithreading (SMT) processor performance by
coscheduling jobs that have ``compatible'' demands on
the processor's shared resources. Existing approaches
however require a sampling phase, evaluate a limited
number of possible coschedules, use heuristics to gauge
symbiosis, are rigid in their optimization target, and
do not preserve system-level priorities/shares. This
article proposes probabilistic job symbiosis modeling,
which predicts whether jobs will create positive or
negative symbiosis when coscheduled without requiring
the coschedule to be evaluated. The model, which uses
per-thread cycle stacks computed through a previously
proposed cycle accounting architecture, is simple
enough to be used in system software. Probabilistic job
symbiosis modeling provides six key innovations over
prior work in symbiotic job scheduling: (i) it does not
require a sampling phase, (ii) it readjusts the job
coschedule continuously, (iii) it evaluates a large
number of possible coschedules at very low overhead,
(iv) it is not driven by heuristics, (v) it can
optimize a performance target of interest (e.g., system
throughput or job turnaround time), and (vi) it
preserves system-level priorities/shares. These
innovations make symbiotic job scheduling both
practical and effective. Our experimental evaluation,
which assumes a realistic scenario in which jobs come
and go, reports an average 16\% (and up to 35\%)
reduction in job turnaround time compared to the
previously proposed SOS (sample, optimize, symbios)
approach for a two-thread SMT processor, and an average
19\% (and up to 45\%) reduction in job turnaround time
for a four-thread SMT processor.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Seghir:2012:IAT,
author = "Rachid Seghir and Vincent Loechner and Beno{\^\i}t
Meister",
title = "Integer affine transformations of parametric
{$Z$}-polytopes and applications to loop nest
optimization",
journal = j-TACO,
volume = "9",
number = "2",
pages = "8:1--8:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207224",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The polyhedral model is a well-known compiler
optimization framework for the analysis and
transformation of affine loop nests. We present a new
method to solve a difficult geometric operation that is
raised by this model: the integer affine transformation
of parametric $Z$-polytopes. The result of such a
transformation is given by a worst-case exponential
union of $Z$-polytopes. We also propose a polynomial
algorithm (for fixed dimension) to count points in
arbitrary unions of a fixed number of parametric
$Z$-polytopes. We implemented these algorithms and
compared them to other existing algorithms, for a set
of applications to loop nest analysis and
optimization.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yang:2012:UOC,
author = "Yi Yang and Ping Xiang and Jingfei Kong and Mike
Mantor and Huiyang Zhou",
title = "A unified optimizing compiler framework for different
{GPGPU} architectures",
journal = j-TACO,
volume = "9",
number = "2",
pages = "9:1--9:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207225",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents a novel optimizing compiler for
general purpose computation on graphics processing
units (GPGPU). It addresses two major challenges of
developing high performance GPGPU programs: effective
utilization of GPU memory hierarchy and judicious
management of parallelism. The input to our compiler is
a na{\"\i}ve GPU kernel function, which is functionally
correct but without any consideration for performance
optimization. The compiler generates two kernels, one
optimized for global memories and the other for texture
memories. The proposed compilation process is effective
for both AMD/ATI and NVIDIA GPUs. The experiments show
that our optimized code achieves very high performance,
either superior or very close to highly fine-tuned
libraries.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jang:2012:ACO,
author = "Choonki Jang and Jaejin Lee and Bernhard Egger and
Soojung Ryu",
title = "Automatic code overlay generation and partially
redundant code fetch elimination",
journal = j-TACO,
volume = "9",
number = "2",
pages = "10:1--10:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207226",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "There is an increasing interest in explicitly managed
memory hierarchies, where a hierarchy of distinct
memories is exposed to the programmer and managed
explicitly in software. These hierarchies can be found
in typical embedded systems and an emerging class of
multicore architectures. To run an application that
requires more code memory than the available
higher-level memory, typically an overlay structure is
needed. The overlay structure is generated manually by
the programmer or automatically by a specialized
linker. Manual code overlaying requires the programmer
to deeply understand the program structure for maximum
memory savings as well as minimum performance
degradation. Although the linker can automatically
generate the code overlay structure, its memory savings
are limited and it can even bring significant performance
degradation because traditional techniques do not
consider the program context. In this article, we
propose an automatic code overlay generation technique
that overcomes the limitations of traditional automatic
code overlaying techniques. We are dealing with a
system context that imposes two distinct constraints:
(1) no hardware support for address translation and (2)
a spatially and temporally coarse-grained faulting
mechanism at the function level. Our approach addresses
those two constraints as efficiently as possible. Our
technique statically computes the Worst-Case Number of
Conflict misses (WCNC) between two different code
segments using path expressions. Then, it constructs a
static temporal relationship graph with the WCNCs and
emits an overlay structure for a given higher-level
memory size. We also propose an inter-procedural
partial redundancy elimination technique that minimizes
redundant code copying caused by the generated overlay
structure. Experimental results show that our approach
is promising.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abbasi:2012:TSW,
author = "Zahra Abbasi and Georgios Varsamopoulos and Sandeep K.
S. Gupta",
title = "{TACOMA}: Server and workload management in {Internet}
data centers considering cooling-computing power
trade-off and energy proportionality",
journal = j-TACO,
volume = "9",
number = "2",
pages = "11:1--11:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207227",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A two-tier Internet data center management scheme,
TACOMA, with thermal-aware server provisioning (TASP)
in one tier, and thermal-aware workload distribution
(TAWD) in the other is proposed. TASP and TAWD
coordinate to maximize the energy savings by leveraging
the workload dynamics at coarse and fine time scales,
respectively. TACOMA is aware of the QoS constraints,
the energy proportionality of servers, and the
potential trade-off between cooling and computing
power. The obtained energy savings come from a combination of
suspending idle servers, using servers at their peak
efficiency, and avoiding heat recirculation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lankes:2012:BSP,
author = "Andreas Lankes and Thomas Wild and Stefan Wallentowitz
and Andreas Herkersdorf",
title = "Benefits of selective packet discard in
networks-on-chip",
journal = j-TACO,
volume = "9",
number = "2",
pages = "12:1--12:??",
month = jun,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2207222.2207228",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed Jun 13 17:20:51 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Today, Network on Chip concepts principally assume
inherent lossless operation. Considering that future
nanometer CMOS technologies will witness increased
sensitivity to all forms of manufacturing and
environmental variations (e.g., IR drop, soft errors
due to radiation, transient temperature induced timing
problems, device aging), efforts to cope with data
corruption or packet loss will be unavoidable. Possible
counter measures against packet loss are the extension
of flits with ECC or the introduction of error
detection with retransmission. We propose to make use
of the perceived deficiency of packet loss as a
feature. By selectively discarding stuck packets in the
NoC, a proven practice in computer networks, all types
of deadlocks can be resolved. This is especially
advantageous for solving the problem of
message-dependent deadlocks, which otherwise leads to
high costs either in terms of throughput or chip area.
Strict ordering, the most popular approach to this
problem, results in a significant buffer overhead and a
more complex router architecture. In addition, we will
show that eliminating local network congestions by
selectively discarding individual packets also can
improve the effective throughput of the network. The
end-to-end retransmission mechanism required for
reliable communication then also provides lossless
communication for the cores.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2012:DDS,
author = "Yangchun Luo and Antonia Zhai",
title = "Dynamically dispatching speculative threads to improve
sequential execution",
journal = j-TACO,
volume = "9",
number = "3",
pages = "13:1--13:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355586",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Efficiently utilizing multicore processors to improve
their performance potentials demands extracting
thread-level parallelism from the applications. Various
novel and sophisticated execution models have been
proposed to extract thread-level parallelism from
sequential programs. One such execution model,
Thread-Level Speculation (TLS), allows potentially
dependent threads to execute speculatively in parallel.
However, TLS execution is inherently unpredictable, and
consequently incorrect speculation could degrade
performance on multicore systems. Existing
approaches have focused on using the compilers to
select sequential program regions to apply TLS. Our
research shows that even the state-of-the-art compiler
makes suboptimal decisions, due to the unpredictability
of TLS execution. Thus, we propose to dynamically
optimize TLS performance. This article describes the
design, implementation, and evaluation of a runtime
thread dispatching mechanism that adjusts the behaviors
of speculative threads based on their efficiency. In
the proposed system, speculative threads are monitored
by hardware-based performance counters and their
performance impact is evaluated with a novel
methodology that takes into account various unique TLS
characteristics. Thread dispatching policies are
devised to adjust the behaviors of speculative threads
accordingly. This runtime evaluation better determines
where and how to create speculative threads. Evaluated
with all the SPEC CPU2000
benchmark programs written in C, the dynamic
dispatching system outperforms the state-of-the-art
compiler-based thread management techniques by 9.4\% on
average. Compared to sequential execution, we achieve a
1.37x performance improvement on a four-core CMP-based
system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cui:2012:EPO,
author = "Huimin Cui and Jingling Xue and Lei Wang and Yang Yang
and Xiaobing Feng and Dongrui Fan",
title = "Extendable pattern-oriented optimization directives",
journal = j-TACO,
volume = "9",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355587",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Algorithm-specific, that is, semantic-specific
optimizations have been observed to bring significant
performance gains, especially for a diverse set of
multi/many-core architectures. However, current
programming models and compiler technologies for the
state-of-the-art architectures do not exploit these
performance opportunities well. In this article, we
propose a pattern-making methodology that enables
algorithm-specific optimizations to be encapsulated
into ``optimization patterns''. Such optimization
patterns are expressed in terms of preprocessor
directives so that simple annotations can result in
significant performance improvements. To validate this
new methodology, a framework, named EPOD, is developed
to map these directives into the underlying
optimization schemes for a particular architecture. It
is difficult to create an exact performance model to
determine an optimal or near-optimal optimization
scheme (including which optimizations to apply and in
which order) for a specific application, due to the
complexity of applications and architectures. However,
it is tractable to build individual optimization
components and let compiler developers synthesize an
optimization scheme from these components. Therefore,
our EPOD framework provides an Optimization Programming
Interface (OPI) for compiler developers to define new
optimization schemes. Thus, new patterns can be
integrated into EPOD in a flexible manner. We have
identified and implemented a number of optimization
patterns for three representative computer platforms.
Our experimental results show that a pattern-guided
compiler can outperform the state-of-the-art compilers
and even achieve performance as competitive as
hand-tuned code. Therefore, such a pattern-making
methodology represents an encouraging direction for
domain experts' experience and knowledge to be
integrated into general-purpose compilers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lewis:2012:REC,
author = "Adam Wade Lewis and Nian-Feng Tzeng and Soumik Ghosh",
title = "Runtime energy consumption estimation for server
workloads based on chaotic time-series approximation",
journal = j-TACO,
volume = "9",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355588",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes a runtime model that relates
server energy consumption to its overall thermal
envelope, using hardware performance counters and
experimental measurements. While previous studies have
attempted system-wide modeling of server power
consumption through subsystem models, our approach is
different in that it links system energy input to
subsystem energy consumption based on a small set of
tightly correlated parameters. The proposed model takes
into account processor power, bus activities, and
system ambient temperature for real-time prediction on
the power consumption of long running jobs. Using the
HyperTransport and QuickPath Link structures as case
studies and through electrical measurements on example
server subsystems, we develop a chaotic time-series
approximation for runtime power consumption, arriving
at the Chaotic Attractor Predictor (CAP). With
polynomial time complexity, CAP exhibits high
prediction accuracy, having the prediction errors
within 1.6\% (or 3.3\%) for servers based on the
HyperTransport bus (or the QuickPath Links), as
verified by a set of common processor benchmarks. As a
predictive mechanism, CAP is superior to existing
linear auto-regressive methods, which require expensive
and complex corrective steps to address the nonlinear
and chaotic aspects of the underlying physical
system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Valero:2012:CRI,
author = "Alejandro Valero and Julio Sahuquillo and Salvador
Petit and Pedro L{\'o}pez and Jos{\'e} Duato",
title = "Combining recency of information with selective random
and a victim cache in last-level caches",
journal = j-TACO,
volume = "9",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355589",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory latency has become an important performance
bottleneck in current microprocessors. This problem
worsens as the number of cores sharing the same
memory controller increases. To palliate this problem,
a common solution is to implement cache hierarchies
with large or huge Last-Level Cache (LLC)
organizations. LLC memories are implemented with a high
number of ways (e.g., 16) to reduce conflict misses.
Typically, caches have implemented the LRU algorithm to
exploit temporal locality, but its performance diverges
from the optimal as the number of ways increases.
In addition, the implementation of a strict LRU
algorithm is costly in terms of area and power. This
article focuses on a family of low-cost replacement
strategies, whose implementation scales with the number
of ways while maintaining the performance. The proposed
strategies track the access order for just a few
blocks, which cannot be replaced. The victim is
randomly selected among those blocks exhibiting poor
locality. Although, in general, the random policy helps
improve performance, in some applications the scheme
falls short of the LRU policy, leading to performance
degradation. This drawback can be overcome by adding a
small victim cache to the large LLC. Experimental
results show that, using the best version of the family
without a victim cache, the MPKI reduction falls
between 10\% and 11\% compared to a set of the most
representative state-of-the-art algorithms, and grows
to 22\% with respect to LRU. The proposal with a victim
cache achieves speedups, on average, of 4\% compared to
LRU. In addition, it reduces dynamic energy, on
average, by up to 8\%. Finally, compared to the studied
algorithms, hardware complexity is largely reduced by
the baseline algorithm of the family.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2012:DQM,
author = "Bin Li and Li-Shiuan Peh and Li Zhao and Ravi Iyer",
title = "Dynamic {QoS} management for chip multiprocessors",
journal = j-TACO,
volume = "9",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355590",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the continuing scaling of semiconductor
technologies, chip multiprocessor (CMP) has become the
de facto design for modern high performance computer
architectures. It is expected that more and more
applications with diverse requirements will run
simultaneously on the CMP platform. However, this will
exert contention on shared resources such as the last
level cache, network-on-chip bandwidth and off-chip
memory bandwidth, thus affecting the performance and
quality-of-service (QoS) significantly. In this
environment, efficient resource sharing and a guarantee
of a certain level of performance is highly desirable.
Researchers have proposed different frameworks for
providing QoS. Most of these frameworks focus on
individual resource for QoS management. Coordinated
management of multiple QoS-aware shared resources at
runtime remains an open problem. Recently, there has
been work that proposed a class-of-service-based
framework to jointly manage cache, NoC, and memory
resources simultaneously. However, that work allocates
shared resources statically at the beginning of
application runtime, and does not dynamically track,
manage, and share resources across applications.
In this article, we address this limitation by
proposing dynamic resource management policies that
monitor the resource usage of applications at runtime,
then steals resources from the high-priority
applications for lower-priority ones. The goal is to
maintain the targeted level of performance for
high-priority applications while improving the
performance of lower-priority applications. We use a PI
(Proportional-Integral gain) feedback controller based
technique to maintain stability in our framework. Our
evaluation results show that our policy can improve
performance for lower-priority applications
significantly while maintaining the performance for
high-priority application, thus demonstrating the
effectiveness of our dynamic QoS resource management
policy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Xekalakis:2012:MSM,
author = "Polychronis Xekalakis and Nikolas Ioannou and Marcelo
Cintra",
title = "Mixed speculative multithreaded execution models",
journal = j-TACO,
volume = "9",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355591",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The current trend toward multicore architectures has
placed great pressure on programmers and compilers to
generate thread-parallel programs. Improved execution
performance can no longer be obtained via traditional
single-thread instruction level parallelism (ILP), but,
instead, via multithreaded execution. One notable
technique that facilitates the extraction of parallel
threads from sequential applications is thread-level
speculation (TLS). This technique allows
programmers/compilers to generate threads without
checking for inter-thread data and control dependences,
which are then transparently enforced by the hardware.
Most prior work on TLS has concentrated on thread
selection and mechanisms to efficiently support the
main TLS operations, such as squashes, data versioning,
and commits. This article seeks to enhance TLS
functionality by combining it with other speculative
multithreaded execution models. The main idea is that
TLS already requires extensive hardware support, which
when slightly augmented can accommodate other
speculative multithreaded techniques. Recognizing that
for different applications, or even program phases, the
application bottlenecks may be different, it is
reasonable to assume that the more versatile a system
is, the more efficiently it will be able to execute the
given program. Toward this end, we first show
that mixed execution models that combine TLS with
Helper Threads (HT), RunAhead execution (RA) and
MultiPath execution (MP) perform better than any of the
models alone. Based on a simple model that we propose,
we show that benefits come from being able to extract
additional ILP without harming the TLP extracted by
TLS. We then show that by combining all the execution
models in a unified one that combines all these
speculative multithreaded models, ILP can be further
enhanced with only minimal additional cost in
hardware.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sharafeddine:2012:DOE,
author = "Mageda Sharafeddine and Komal Jothi and Haitham
Akkary",
title = "Disjoint out-of-order execution processor",
journal = j-TACO,
volume = "9",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355592",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High-performance superscalar architectures used to
exploit instruction level parallelism in single-thread
applications have become too complex and power-hungry
for the multicore processor era. We propose a new
architecture that uses multiple small latency-tolerant
out-of-order cores to improve single-thread
performance. Improving single-thread performance with
multiple small out-of-order cores allows designers to
place more of these cores on the same die.
Consequently, emerging highly parallel applications can
take full advantage of the multicore parallel hardware
without sacrificing performance of inherently serial
and hard to parallelize applications. Our architecture
combines speculative multithreading (SpMT) with
checkpoint recovery and continual flow pipeline
architectures. It splits single-thread program
execution into disjoint control and data threads that
execute concurrently on multiple cooperating small and
latency-tolerant out-of-order cores. Hence we call this
style of execution Disjoint Out-of-Order Execution
(DOE). DOE uses latency tolerance to overcome
performance issues of SpMT caused by interthread data
dependences. To evaluate this architecture, we have
developed a microarchitecture performance model of DOE
based on PTLSim, a simulation infrastructure of the x86
instruction set architecture. We evaluate the potential
performance of DOE processor architecture using a
simple heuristic to fork control independent threads in
hardware at the target addresses of future procedure
return instructions. Using applications from SpecInt
2000, we study DOE under ideal as well as realistic
architectural constraints. We discuss the performance
impact of key DOE architecture and application
variables such as number of cores, interthread data
dependences, intercore data communication delay,
buffer capacity, and branch mispredictions. Without
any DOE-specific compiler optimizations, our results
show that DOE outperforms conventional SpMT
architectures by 15\%, on average. We also show that
DOE with four small cores can perform on average
equally well to a large superscalar core, consuming
about the same power. Most importantly, DOE improves
throughput performance by a significant amount over a
large superscalar core, up to 2.5 times, when running
multitasking applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Andrade:2012:SAW,
author = "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n
Doallo",
title = "Static analysis of the worst-case memory performance
for irregular codes with indirections",
journal = j-TACO,
volume = "9",
number = "3",
pages = "20:1--20:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355593",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Real-time systems are subject to timing constraints,
whose upper bound is given by the Worst-Case Execution
Time (WCET). Cache memory behavior is difficult to
predict analytically and estimating a safe and precise
worst-case value is even more challenging. The
worst-case memory performance (WCMP) component of the
WCET can only be estimated with the precise knowledge
of the stream of data addresses accessed by the code,
which is determined by the access patterns and the base
addresses of the data structures accessed. The
regularity of strided access patterns simplifies their
analysis, as they are characterized by relatively few
parameters, which are often available at compile time.
Unfortunately, codes may exhibit irregular access
patterns, which are much more difficult to statically
analyze. As for the base addresses of the data
structures, they are not always available at
compile-time for many reasons: stack variables,
dynamically allocated memory, modules compiled
separately, etc. This article addresses these problems
by presenting a model that predicts a safe upper
bound of the data cache performance for codes both with
regular and irregular access patterns, which is valid
for any possible base addresses of the data structures.
The model analyzes irregular access patterns due to the
presence of indirections in the code and it can provide
two kinds of predictions: a safe hard boundary that is
suitable for hard real-time systems and a soft boundary
whose safeness is not guaranteed but which is valid
most of the times. In fact, in all our experiments the
number of misses was below the soft boundary predicted
by the model. This turns this soft boundary prediction
into a valuable tool, particularly for non and soft
real-time systems, which tolerate a percentage of the
runs exceeding their deadlines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2012:DIO,
author = "Yang Chen and Shuangde Fang and Yuanjie Huang and
Lieven Eeckhout and Grigori Fursin and Olivier Temam
and Chengyong Wu",
title = "Deconstructing iterative optimization",
journal = j-TACO,
volume = "9",
number = "3",
pages = "21:1--21:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355594",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Iterative optimization is a popular compiler
optimization approach that has been studied extensively
over the past decade. In this article, we deconstruct
iterative optimization by evaluating whether it works
across datasets and by analyzing why it works. Up to
now, most iterative optimization studies are based on a
premise which was never truly evaluated: that it is
possible to learn the best compiler optimizations
across datasets. In this article, we evaluate this
question for the first time with a very large number of
datasets. We therefore compose KDataSets, a dataset
suite with 1000 datasets for 32 programs, which we
release to the public. We characterize the diversity of
KDataSets, and subsequently use it to evaluate
iterative optimization. For all 32 programs, we find
that there exists at least one combination of compiler
optimizations that achieves at least 83\% or more of
the best possible speedup across all datasets on two
widely used compilers (Intel's ICC and GNU's GCC). This
optimal combination is program-specific and yields
speedups up to 3.75$ \times $ (averaged across datasets
of a program) over the highest optimization level of
the compilers (-O3 for GCC and -fast for ICC). This
finding suggests that optimizing programs across
datasets might be much easier than previously
anticipated. In addition, we evaluate the idea of
introducing compiler choice as part of iterative
optimization. We find that it can further improve the
performance of iterative optimization because different
programs favor different compilers. We also investigate
why iterative optimization works by analyzing the
optimal combinations. We find that only a handful of
optimizations yield most of the speedup. Finally, we
show that optimizations interact in a complex and
sometimes counterintuitive way through two case
studies, which confirms that iterative optimization is
an irreplaceable and important compiler strategy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Guha:2012:MOD,
author = "Apala Guha and Kim Hazelwood and Mary Lou Soffa",
title = "Memory optimization of dynamic binary translators for
embedded systems",
journal = j-TACO,
volume = "9",
number = "3",
pages = "22:1--22:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355595",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic binary translators (DBTs) are becoming
increasingly important because of their power and
flexibility. DBT-based services are valuable for all
types of platforms. However, the high memory demands of
DBTs present an obstacle for embedded systems. Most
research on DBT design has a performance focus, which
often drives up the DBT memory demand. In this article,
we present a memory-oriented approach to DBT design. We
consider the class of translation-based DBTs and their
sources of memory demand; cached translated code,
cached auxiliary code and DBT data structures. We
explore aspects of DBT design that impact these memory
demand sources and present strategies to mitigate
memory demand. We also explore performance
optimizations for DBTs that handle memory demand by
placing a limit on it and repeatedly flushing
translations to stay within the limit, thereby
replacing the memory demand problem with a performance
degradation problem. Our optimizations that mitigate
memory demand improve performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Geraci:2012:TFP,
author = "James R. Geraci and Sharon M. Sacco",
title = "A transpose-free in-place {SIMD} optimized {FFT}",
journal = j-TACO,
volume = "9",
number = "3",
pages = "23:1--23:??",
month = sep,
year = "2012",
CODEN = "????",
DOI = "https://doi.org/10.1145/2355585.2355596",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 22 10:48:53 MDT 2012",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A transpose-free in-place SIMD optimized algorithm for
the computation of large FFTs is introduced and
implemented on the Cell Broadband Engine. Six different
FFT implementations of the algorithm using six
different data movement methods are described. Their
relative performance is compared for input sizes from $
2^{17} $ to $ 2^{21} $ complex floating point samples.
Large differences in performance are observed among
even theoretically equivalent data movement patterns.
All six implementations compare favorably with FFTW and
other previous FFT implementations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Coppens:2013:FDB,
author = "Bart Coppens and Bjorn {De Sutter} and Jonas Maebe",
title = "Feedback-driven binary code diversification to the
special issue on high-performance embedded
architectures and compilers",
journal = j-TACO,
volume = "9",
number = "4",
pages = "24:1--24:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400683",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As described in many blog posts and in the scientific
literature, exploits for software vulnerabilities are
often engineered on the basis of patches. For example,
``Microsoft Patch Tuesday'' is often followed by
``Exploit Wednesday'' during which yet unpatched
systems become vulnerable to patch-based exploits. Part
of the patch engineering includes the identification of
the vulnerable binary code by means of
reverse-engineering tools and diffing add-ons. In this
article we present a feedback-driven compiler tool flow
that iteratively transforms code until diffing tools
become ineffective enough to close the ``Exploit
Wednesday'' window of opportunity. We demonstrate the
tool's effectiveness on a set of real-world patches and
against the latest version of BinDiff.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fowers:2013:PEC,
author = "Jeremy Fowers and Greg Brown and John Wernsing and
Greg Stitt",
title = "A performance and energy comparison of convolution on
{GPUs}, {FPGAs}, and multicore processors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "25:1--25:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400684",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent architectural trends have focused on increased
parallelism via multicore processors and increased
heterogeneity via accelerator devices (e.g.,
graphics-processing units, field-programmable gate
arrays). Although these architectures have significant
performance and energy potential, application designers
face many device-specific challenges when choosing an
appropriate accelerator or when customizing an
algorithm for an accelerator. To help address this
problem, in this article we thoroughly evaluate
convolution, one of the most common operations in
digital-signal processing, on multicores,
graphics-processing units, and field-programmable gate
arrays. Whereas many previous application studies
evaluate a specific usage of an application, this
article assists designers with design space exploration
for numerous use cases by analyzing effects of
different input sizes, different algorithms, and
different devices, while also determining
Pareto-optimal trade-offs between performance and
energy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rohou:2013:VTI,
author = "Erven Rohou and Kevin Williams and David Yuste",
title = "Vectorization technology to improve interpreter
performance",
journal = j-TACO,
volume = "9",
number = "4",
pages = "26:1--26:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400685",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the present computing landscape, interpreters are
in use in a wide range of systems. Recent trends in
consumer electronics have created a new category of
portable, lightweight software applications. Typically,
these applications have fast development cycles and
short life spans. They run on a wide range of systems
and are deployed in a target independent bytecode
format over Internet and cellular networks. Their
authors are untrusted third-party vendors, and they are
executed in secure managed runtimes or virtual
machines. Furthermore, due to security policies or
development time constraints, these virtual machines
often lack just-in-time compilers and rely on
interpreted execution. At the other end of the
spectrum, interpreters are also a reality in the field
of high-performance computing because of the
flexibility they provide. The main performance penalty
in interpreters arises from instruction dispatch. Each
bytecode requires a minimum number of machine
instructions to be executed. In this work, we introduce
a novel approach for interpreter optimization that
reduces instruction dispatch thanks to vectorization
technology. We extend the split compilation paradigm to
interpreters, thus guaranteeing that our approach
exhibits almost no overhead at runtime. We take
advantage of the vast research in vectorization and its
presence in modern compilers. Complex analyses are
performed ahead of time, and their results are conveyed
to the executable bytecode. At runtime, the interpreter
retrieves this additional information to build the SIMD
IR (intermediate representation) instructions that
carry the vector semantics. The bytecode language
remains unmodified, making this representation
compatible with legacy interpreters and previously
proposed JIT compilers. We show that this approach
drastically reduces the number of instructions to
interpret and decreases execution time of vectorizable
applications. Moreover, we map SIMD IR instructions to
hardware SIMD instructions when available, with a
substantial additional improvement. Finally, we finely
analyze the impact of our extension on the behavior of
the caches and branch predictors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cleary:2013:FAT,
author = "Jimmy Cleary and Owen Callanan and Mark Purcell and
David Gregg",
title = "Fast asymmetric thread synchronization",
journal = j-TACO,
volume = "9",
number = "4",
pages = "27:1--27:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400686",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "For most multi-threaded applications, data structures
must be shared between threads. Ensuring thread safety
on these data structures incurs overhead in the form of
locking and other synchronization mechanisms. Where
data is shared among multiple threads these costs are
unavoidable. However, a common access pattern is that
data is accessed primarily by one dominant thread, and
only very rarely by the other, non-dominant threads.
Previous research has proposed biased locks, which are
optimized for a single dominant thread, at the cost of
greater overheads for non-dominant threads. In this
article we propose a new family of biased
synchronization mechanisms that, using a modified
interface, push accesses to shared data from the
non-dominant threads to the dominant one, via a novel
set of message passing mechanisms. We present
mechanisms for protecting critical sections, for
queueing work, for caching shared data in registers
where it is safe to do so, and for asynchronous
critical section accesses. We present results for the
conventional Intel\reg{} Sandy Bridge processor and for
the emerging network-optimized many-core IBM\reg{}
PowerEN{\TM} processor. We find that our algorithms
compete well with existing biased locking algorithms,
and, in particular, perform better than existing
algorithms as accesses from non-dominant threads
increase.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2013:PTL,
author = "Yong Li and Rami Melhem and Alex K. Jones",
title = "{PS-TLB}: Leveraging page classification information
for fast, scalable and efficient translation for future
{CMPs}",
journal = j-TACO,
volume = "9",
number = "4",
pages = "28:1--28:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400687",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Traversing the page table during virtual to physical
address translation causes pipeline stalls when misses
occur in the translation-lookaside buffer (TLB).
State-of-the-art translation proposals typically
optimize a single aspect of translation performance
(e.g., translation sharing, context switch performance,
etc.) with potential trade-offs of additional hardware
complexity, increased translation latency, or reduced
scalability. In this article, we propose the partial
sharing TLB (PS-TLB), a fast and scalable solution that
reduces off-chip translation misses without sacrificing
the timing-critical requirement of on-chip translation.
We introduce the partial sharing buffer (PSB) which
leverages application page sharing characteristics
using minimal additional hardware resources. Compared
to the leading TLB proposal that leverages sharing,
PS-TLB provides a more than 45\% improvement in
translation latency with a 9\% application speedup
while using fewer storage resources. In addition, the
page classification and PS-TLB architecture provide
further optimizations including an over 30\% reduction
of interprocessor interrupts for coherence, and reduced
context switch misses with fewer resources compared
with existing methods.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{DuBois:2013:PTC,
author = "Kristof {Du Bois} and Stijn Eyerman and Lieven
Eeckhout",
title = "Per-thread cycle accounting in multicore processors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "29:1--29:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400688",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "While multicore processors improve overall chip
throughput and hardware utilization, resource sharing
among the cores leads to unpredictable performance for
the individual threads running on a multicore
processor. Unpredictable per-thread performance becomes
a problem when considered in the context of multicore
scheduling: system software assumes that all threads
make equal progress; however, this is not what the
hardware provides. This may lead to problems at the
system level such as missed deadlines, reduced
quality-of-service, non-satisfied service-level
agreements, unbalanced parallel performance, priority
inversion, unpredictable interactive performance, etc.
This article proposes a hardware-efficient per-thread
cycle accounting architecture for multicore processors.
The counter architecture tracks per-thread progress in
a multicore processor, detects how inter-thread
interference affects per-thread performance, and
predicts the execution time for each thread if run in
isolation. The counter architecture captures the
effects of additional conflict misses due to cache
sharing as well as increased latency for other memory
accesses due to resource and bandwidth contention in
the memory subsystem. The proposed method accounts for
74.3\% of the interference cycles, and estimates
per-thread progress within 14.2\% on average across a
large set of multi-program workloads. Hardware cost is
limited to 7.44 KB for an 8-core processor, a reduction
of almost $ 10 \times $ compared to prior work while
being 63.8\% more accurate. Making system software
progress-aware improves fairness by 22.5\% on average
over progress-agnostic scheduling.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wimmer:2013:MAV,
author = "Christian Wimmer and Michael Haupt and Michael L. {Van
De Vanter} and Mick Jordan and Laurent Dayn{\`e}s and
Douglas Simon",
title = "{Maxine}: an approachable virtual machine for, and in,
{Java}",
journal = j-TACO,
volume = "9",
number = "4",
pages = "30:1--30:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400689",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A highly productive platform accelerates the
production of research results. The design of a Virtual
Machine (VM) written in the Java{\TM} programming
language can be simplified through exploitation of
interfaces, type and memory safety, automated memory
management (garbage collection), exception handling,
and reflection. Moreover, modern Java IDEs offer
time-saving features such as refactoring,
auto-completion, and code navigation. Finally, Java
annotations enable compiler extensions for low-level
``systems programming'' while retaining IDE
compatibility. These techniques collectively make
complex system software more ``approachable'' than has
been typical in the past. The Maxine VM, a metacircular
Java VM implementation, has aggressively used these
features since its inception. A co-designed companion
tool, the Maxine Inspector, offers integrated debugging
and visualization of all aspects of the VM's runtime
state. The Inspector's implementation exploits advanced
Java language features, embodies intimate knowledge of
the VM's design, and even reuses a significant amount
of VM code directly. These characteristics make Maxine
a highly approachable VM research platform and a
productive basis for research and teaching.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Khan:2013:SBA,
author = "Malik Khan and Protonu Basu and Gabe Rudy and Mary
Hall and Chun Chen and Jacqueline Chame",
title = "A script-based autotuning compiler system to generate
high-performance {CUDA} code",
journal = j-TACO,
volume = "9",
number = "4",
pages = "31:1--31:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400690",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents a novel compiler framework for
CUDA code generation. The compiler structure is
designed to support autotuning, which employs empirical
techniques to evaluate a set of alternative mappings of
computation kernels and select the mapping that obtains
the best performance. This article introduces a
Transformation Strategy Generator, a meta-optimizer
that generates a set of transformation recipes, which
are descriptions of the mapping of the sequential code
to parallel CUDA code. These recipes comprise a search
space of possible implementations. This system achieves
performance comparable to, and sometimes better than,
manually tuned libraries, and exceeds the performance of
a state-of-the-art GPU compiler.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{VanCraeynest:2013:UFD,
author = "Kenzo {Van Craeynest} and Lieven Eeckhout",
title = "Understanding fundamental design choices in
single-{ISA} heterogeneous multicore architectures",
journal = j-TACO,
volume = "9",
number = "4",
pages = "32:1--32:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400691",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Single-ISA heterogeneous multicore processors have
gained substantial interest over the past few years
because of their power efficiency, as they offer the
potential for high overall chip throughput within a
given power budget. Prior work in heterogeneous
architectures has mainly focused on how heterogeneity
can improve overall system throughput. To what extent
heterogeneity affects per-program performance has
remained largely unanswered. In this article, we aim at
understanding how heterogeneity affects both chip
throughput and per-program performance; how
heterogeneous architectures compare to homogeneous
architectures under both performance metrics; and how
fundamental design choices, such as core type, cache
size, and off-chip bandwidth, affect performance. We
use analytical modeling to explore a large space of
single-ISA heterogeneous architectures. The analytical
model has linear-time complexity in the number of core
types and programs of interest, and offers a unique
opportunity for exploring the large space of both
homogeneous and heterogeneous multicore processors in
limited time. Our analysis provides several interesting
insights: While it is true that heterogeneity can
improve system throughput, it fundamentally trades
per-program performance for chip throughput; although
some heterogeneous configurations yield better
throughput and per-program performance than homogeneous
designs, some homogeneous configurations are optimal
for particular throughput versus per-program
performance trade-offs. Two core types provide most of
the benefits from heterogeneity and a larger number of
core types does not contribute much; job-to-core
mapping is both important and challenging for
heterogeneous multicore processors to achieve optimum
performance. Limited off-chip bandwidth does alter some
of the fundamental design choices in heterogeneous
multicore architectures, such as the need for large
on-chip caches for achieving high throughput, with
per-program performance degrading more than throughput
under constrained off-chip bandwidth.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Antao:2013:CFA,
author = "Samuel Ant{\~a}o and Leonel Sousa",
title = "The {CRNS} framework and its application to
programmable and reconfigurable cryptography",
journal = j-TACO,
volume = "9",
number = "4",
pages = "33:1--33:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400692",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article proposes the Computing with the
Residue Number System (CRNS) framework, which aims at
the design automation of accelerators for Modular
Arithmetic (MA). The framework provides a comprehensive
set of tools ranging from a programming language and
respective compiler to back-ends targeting parallel
computation platforms such as Graphical Processing
Units (GPUs) and reconfigurable hardware. Given an
input algorithm described with a high-level programming
language, the CRNS can be used to obtain in a few
seconds the corresponding optimized Parallel Thread
Execution (PTX) program ready to be run on GPUs or the
Hardware Description Language (HDL) specification of a
fully functional accelerator suitable for
reconfigurable hardware and embedded systems. The
resulting implementations benefit from the
parallelization properties of Residue Number System
(RNS) arithmetic in a fully automated way.
Designers do not need to be familiar with the
mathematical details concerning the employed
arithmetic, namely the RNS representation. In order to
thoroughly describe and evaluate the proposed
framework, experimental results obtained for the
supported back-ends (GPU and HDL) are presented
targeting the implementation of the modular
exponentiation used in the Rivest-Shamir-Adleman (RSA)
algorithm and Elliptic Curve (EC) point multiplication.
Results suggest competitive latency and throughput with
minimum design effort, while overcoming the
development issues that arise in the specification and
verification of dedicated solutions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Diouf:2013:DLM,
author = "Boubacar Diouf and Can Hantas and Albert Cohen and
{\"O}zcan {\"O}zturk and Jens Palsberg",
title = "A decoupled local memory allocator",
journal = j-TACO,
volume = "9",
number = "4",
pages = "34:1--34:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400693",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compilers use software-controlled local memories to
provide fast, predictable, and power-efficient access
to critical data. We show that the local memory
allocation for straight-line, or linearized, programs is
equivalent to a weighted interval-graph coloring
problem. This problem is new when allowing a color
interval to ``wrap around,'' and we call it the
submarine-building problem. This graph-theoretical
decision problem differs slightly from the classical
ship-building problem, and exhibits very interesting
and unusual complexity properties. We demonstrate that
the submarine-building problem is NP-complete, while it
is solvable in linear time for not-so-proper interval
graphs, an extension of the class of proper interval
graphs. We propose a clustering heuristic to
approximate any interval graph into a not-so-proper
interval graph, decoupling spill code generation from
local memory assignment. We apply this heuristic to a
large number of randomly generated interval graphs
reproducing the statistical features of standard local
memory allocation benchmarks, comparing with
state-of-the-art heuristics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cui:2013:LOC,
author = "Huimin Cui and Qing Yi and Jingling Xue and Xiaobing
Feng",
title = "Layout-oblivious compiler optimization for matrix
computations",
journal = j-TACO,
volume = "9",
number = "4",
pages = "35:1--35:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400694",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Most scientific computations serve to apply
mathematical operations to a set of preconceived data
structures, e.g., matrices, vectors, and grids. In this
article, we use a number of widely used matrix
computations from the LINPACK library to demonstrate
that complex internal organizations of data structures
can severely degrade the effectiveness of compiler
optimizations. We then present a data-layout-oblivious
optimization methodology, where by isolating an
abstract representation of the computations from
complex implementation details of their data, we enable
these computations to be much more accurately analyzed
and optimized through varying state-of-the-art compiler
technologies. We evaluated our approach on an Intel
8-core platform using two source-to-source compiler
infrastructures, Pluto and EPOD. Our results show that
while the efficiency of a computational kernel differs
when using different data layouts, the alternative
implementations typically benefit from a common set of
optimizations on the operations. Therefore, separately
optimizing the operations and the data layout of a
computation could dramatically enhance the
effectiveness of compiler optimizations compared with
the conventional approaches of using a unified
representation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dolan:2013:CSL,
author = "Stephen Dolan and Servesh Muralidharan and David
Gregg",
title = "Compiler support for lightweight context switching",
journal = j-TACO,
volume = "9",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400695",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose a new language-neutral primitive for the
LLVM compiler, which provides efficient context
switching and message passing between lightweight
threads of control. The primitive, called Swapstack,
can be used by any language implementation based on
LLVM to build higher-level language structures such as
continuations, coroutines, and lightweight threads. As
part of adding the primitives to LLVM, we have also
added compiler support for passing parameters across
context switches. Our modified LLVM compiler produces
highly efficient code through a combination of exposing
the context switching code to existing compiler
optimizations, and adding novel compiler optimizations
to further reduce the cost of context switches. To
demonstrate the generality and efficiency of our
primitives, we add one-shot continuations to C++, and
provide a simple fiber library that allows millions of
fibers to run on multiple cores, with a work-stealing
scheduler and fast inter-fiber synchronization. We argue
that compiler-supported lightweight context switching
can be significantly faster than using a library to
switch between contexts, and provide experimental
evidence to support the position.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Abad:2013:LLE,
author = "Pablo Abad and Valentin Puente and Jose-Angel
Gregorio",
title = "{LIGERO}: a light but efficient router conceived for
cache-coherent chip multiprocessors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "37:1--37:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400696",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Although abstraction is the best approach to deal with
computing system complexity, sometimes implementation
details should be considered. For on-chip
interconnection networks in particular, underestimating
the underlying system specificity could have a
nonnegligible impact on performance, cost, or
correctness. This article presents a very efficient
router that has been devised to deal with
cache-coherent chip multiprocessor particularities in a
balanced way. Employing the same principles of packet
rotation structures as in the rotary router, we present
a router configuration with the following novel
features: (1) reduced buffering requirements, (2)
optimized pipeline under contentionless conditions, (3)
more efficient deadlock avoidance mechanism, and (4)
optimized in-order delivery guarantee. Putting it all
together, our proposal provides a set of features that
no other router, to the best of our knowledge, has
achieved previously. These are: (1') low implementation
cost, (2') low pass-through latency under low load,
(3') improved resource utilization through adaptive
routing and a buffering scheme free of head-of-line
blocking, (4') guarantee of coherence protocol
correctness via end-to-end deadlock avoidance and
in-order delivery, and (5') improvement of coherence
protocol responsiveness through adaptive in-network
multicast support. We conduct a thorough evaluation
that includes hardware cost estimation and performance
evaluation under a wide spectrum of realistic workloads
and coherence protocols. Compared with VCTM, an
optimized state-of-the-art wormhole router, our proposal
requires 50\% less area, reduces on-chip cache hierarchy
energy-delay product by 20\% on average, and improves
cache-coherent chip multiprocessor performance under
realistic working conditions by up to 20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Albericio:2013:ERL,
author = "Jorge Albericio and Pablo Ib{\'a}{\~n}ez and
V{\'\i}ctor Vi{\~n}als and Jose Mar{\'\i}a
Llaber{\'\i}a",
title = "Exploiting reuse locality on inclusive shared
last-level caches",
journal = j-TACO,
volume = "9",
number = "4",
pages = "38:1--38:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400697",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Optimization of the replacement policy used for Shared
Last-Level Cache (SLLC) management in a
Chip-MultiProcessor (CMP) is critical for avoiding
off-chip accesses. Temporal locality, while being
exploited by the first levels of private cache memories, is
only slightly exhibited by the stream of references
arriving at the SLLC. Thus, traditional replacement
algorithms based on recency are bad choices for
governing SLLC replacement. Recent proposals involve
SLLC replacement policies that attempt to exploit reuse
either by segmenting the replacement list or improving
the rereference interval prediction. On the other hand,
inclusive SLLCs are commonplace in the CMP market, but
the interaction between replacement policy and the
enforcement of inclusion has barely been discussed.
After analyzing that interaction, this article
introduces two simple replacement policies exploiting
reuse locality and targeting inclusive SLLCs: Least
Recently Reused (LRR) and Not Recently Reused (NRR).
NRR has the same implementation cost as NRU, and LRR
only adds one bit per line to the LRU cost. After
considering reuse locality and its interaction with the
invalidations induced by inclusion, the proposals are
evaluated by simulating multiprogrammed workloads in an
8-core system with two private cache levels and an
SLLC. LRR outperforms LRU by 4.5\% (performing better
in 97 out of 100 mixes) and NRR outperforms NRU by
4.2\% (performing better in 99 out of 100 mixes). We
also show that our mechanisms outperform rereference
interval prediction, a recently proposed SLLC
replacement policy, and that similar conclusions can be
drawn by varying the associativity or the SLLC size.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yiapanis:2013:OSR,
author = "Paraskevas Yiapanis and Demian Rosas-Ham and Gavin
Brown and Mikel Luj{\'a}n",
title = "Optimizing software runtime systems for speculative
parallelization",
journal = j-TACO,
volume = "9",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400698",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Thread-Level Speculation (TLS) overcomes limitations
intrinsic with conservative compile-time
auto-parallelizing tools by extracting parallel threads
optimistically and only ensuring absence of data
dependence violations at runtime. A significant barrier
for adopting TLS (implemented in software) is the
overheads associated with maintaining speculative
state. Based on previous TLS limit studies, we observe
that on future multicore systems we will likely have
more cores idle than those which traditional TLS would
be able to harness. This implies that a TLS system
should focus on optimizing for a small number of cores
and find efficient ways to take advantage of the idle
cores. Furthermore, research on optimistic systems has
covered two important implementation design points:
eager vs. lazy version management. With this knowledge,
we propose new simple and effective techniques to
reduce the execution time overheads for both of these
design points. This article describes a novel compact
version management data structure optimized for space
overhead when using a small number of TLS threads.
Furthermore, we describe two novel software runtime
parallelization systems that utilize this compact data
structure. The first software TLS system, MiniTLS,
relies on eager memory data management (in-place
updates) and, thus, when a misspeculation occurs a
rollback process is required. MiniTLS takes advantage
of the novel compact version management representation
to parallelize the rollback process and is able to
recover from misspeculation faster than existing
software eager TLS systems. The second one, Lector
(Lazy inspECTOR) is based on lazy version management.
Since we have idle cores, the question is whether we
can create ``helper'' tasks to determine whether
speculation is actually needed without stopping or
damaging the speculative execution. In Lector, each
conventional TLS thread running speculatively with lazy
version management has an associated lightweight
inspector. The inspector threads execute
alongside to verify quickly whether data dependencies
will occur. Inspector threads are generated by standard
techniques for inspector/executor parallelization. We
have applied both TLS systems to seven Java sequential
benchmarks, including three benchmarks from
SPECjvm2008. Two out of the seven benchmarks exhibit
misspeculations. The MiniTLS experiments report average
speedups of 1.8x on 4 threads, increasing to nearly 7x
on 32 threads. Facilitated by our novel
compact representation, MiniTLS reduces the space
overhead over state-of-the-art software TLS systems by
between 96\% on 2 threads and 40\% on 32 threads. The
Lector experiments report average speedups of 1.7x on 2
threads (that is, 1 TLS + 1 inspector thread),
increasing to nearly 8.2x on 32 threads (16 + 16
threads). Compared to a well-established software TLS
baseline, Lector performs on average 1.7x faster on 32
threads, and in no case (x TLS + x inspector threads)
does Lector deliver worse performance than the baseline
TLS with either the equivalent number of TLS threads
(i.e., x TLS threads) or double that number (i.e., x + x
TLS threads).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nugteren:2013:ASC,
author = "Cedric Nugteren and Pieter Custers and Henk
Corporaal",
title = "Algorithmic species: a classification of affine loop
nests for parallel programming",
journal = j-TACO,
volume = "9",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400699",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Code generation and programming have become ever more
challenging over the last decade due to the shift
towards parallel processing. Emerging processor
architectures such as multi-cores and GPUs increasingly
exploit parallelism, requiring programmers and
compilers to deal with aspects such as threading,
concurrency, synchronization, and complex memory
partitioning. We advocate that programmers and
compilers can greatly benefit from a structured
classification of program code. Such a classification
can help programmers to find opportunities for
parallelization, reason about their code, and interact
with other programmers. Similarly, parallelizing
compilers and source-to-source compilers can take
threading and optimization decisions based on the same
classification. In this work, we introduce algorithmic
species, a classification of affine loop nests based on
the polyhedral model and targeted for both automatic
and manual use. Individual classes capture information
such as the structure of parallelism and the data
reuse. To make the classification applicable for manual
use, a basic vocabulary forms the basis for the creation
of a set of intuitive classes. To demonstrate the use
of algorithmic species, we identify 115 classes in a
benchmark set. Additionally, we demonstrate the
suitability of algorithmic species for automated uses
by showing a tool to automatically extract species from
program code, a species-based source-to-source
compiler, and a species-based performance prediction
model.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gerards:2013:ODD,
author = "Marco E. T. Gerards and Jan Kuper",
title = "Optimal {DPM} and {DVFS} for frame-based real-time
systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400700",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic Power Management (DPM) and Dynamic Voltage and
Frequency Scaling (DVFS) are popular techniques for
reducing energy consumption. Algorithms for optimal
DVFS exist, but optimal DPM and the optimal combination
of DVFS and DPM are not yet solved. In this article we
use well-established models of DPM and DVFS for
frame-based systems. We show that it is not
sufficient, as some authors argue, to consider only
individual invocations of a task. We define a schedule
that also takes interactions between invocations into
account and prove, in a theoretical fashion, that this
schedule is optimal.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yan:2013:IPA,
author = "Zhichao Yan and Hong Jiang and Yujuan Tan and Dan
Feng",
title = "An integrated pseudo-associativity and relaxed-order
approach to hardware transactional memory",
journal = j-TACO,
volume = "9",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400701",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Our experimental study and analysis reveal that the
bottlenecks of existing hardware transactional memory
systems are largely rooted in the extra data movements
in version management and in the inefficient scheduling
of conflicting transactions in conflict management,
particularly in the presence of high-contention and
coarse-grained applications. In order to address this
problem, we propose an integrated Pseudo-Associativity
and Relaxed-Order approach to hardware Transactional
Memory, called PARO-TM. It exploits the extra
pseudo-associative space in the data cache to hold the
new value of each transactional modification, and
maintains the mappings between the old and new versions
via an implicit pseudo-associative hash algorithm
(i.e., by inverting a specific bit of the set index).
PARO-TM can branch out the speculative version from the
old version upon each transactional modification on
demand without a dedicated hardware component to hold
the uncommitted data. This means that it is able to
automatically access the proper version upon the
transaction's commit or abort. Moreover, PARO-TM
augments multi-version support in a chained directory
to schedule conflicting transactions in a relaxed-order
manner to further reduce their overheads. We compare
PARO-TM with the state-of-the-art LogTM-SE, TCC, DynTM,
and SUV-TM systems and find that PARO-TM consistently
outperforms these four representative HTMs. This
performance advantage of PARO-TM is far more pronounced
under the high-contention and coarse-grained
applications in the STAMP benchmark suite, for which
PARO-TM is motivated and designed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:PGF,
author = "Doris Chen and Deshanand Singh",
title = "Profile-guided floating- to fixed-point conversion for
hybrid {FPGA}-processor applications",
journal = j-TACO,
volume = "9",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400702",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The key to enabling widespread use of FPGAs for
algorithm acceleration is to allow programmers to
create efficient designs without the time-consuming
hardware design process. Programmers are used to
developing scientific and mathematical algorithms in
high-level languages (C/C++) using floating point data
types. Although easy to implement, the dynamic range
provided by floating point is not necessary in many
applications; more efficient implementations can be
realized using fixed point arithmetic. While this topic
has been studied previously [Han et al. 2006; Olson et
al. 1999; Gaffar et al. 2004; Aamodt and Chow 1999],
the degree of full automation has always been lacking.
We present a novel design flow for cases where FPGAs
are used to offload computations from a microprocessor.
Our LLVM-based algorithm inserts value profiling code
into an unmodified C/C++ application to guide its
automatic conversion to fixed point. This allows for
fast and accurate design space exploration on a host
microprocessor before any accelerators are mapped to
the FPGA. Through experimental results, we demonstrate
that fixed-point conversion can yield resource savings
of up to 2x--3x. Embedded RAM usage is
minimized, and 13\%--22\% higher $ F_{\rm max} $ than
the original floating-point implementation is observed.
In a case study, we show that 17\% reduction in logic
and 24\% reduction in register usage can be realized by
using our algorithm in conjunction with a High-Level
Synthesis (HLS) tool.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cui:2013:LCA,
author = "Yan Cui and Yingxin Wang and Yu Chen and Yuanchun
Shi",
title = "Lock-contention-aware scheduler: a scalable and
energy-efficient method for addressing scalability
collapse on multicore systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400703",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In response to the increasing ubiquity of multicore
processors, there has been widespread development of
multithreaded applications that strive to realize their
full potential. Unfortunately, lock contention within
operating systems can limit the scalability of
multicore systems so severely that an increase in the
number of cores can actually lead to reduced
performance (i.e., scalability collapse). Existing
efforts of solving scalability collapse mainly focus on
making critical sections of kernel code fine-grained or
designing new synchronization primitives. However,
these methods have disadvantages in scalability or
energy efficiency. In this article, we observe that the
percentage of lock-waiting time over the total
execution time for a lock-intensive task has a
significant correlation with the occurrence of
scalability collapse. Based on this observation, a
lock-contention-aware scheduler is proposed.
Specifically, each task in the scheduler monitors its
percentage of lock-waiting time continuously. If the
percentage exceeds a predefined threshold, this task is
considered lock-intensive and migrated to a Special
Set of Cores (i.e., SSC). In this way, the number of
concurrently running lock-intensive tasks is limited to
the number of cores in the SSC, and therefore, the
degree of lock contention is controlled. A central
challenge of using this scheme is how many cores should
be allocated in the SSC to handle lock-intensive tasks.
In our scheduler, the optimal number of cores is
determined online by a model-driven search. The
proposed scheduler is implemented in the recent Linux
kernel and evaluated using micro- and macrobenchmarks
on AMD and Intel 32-core systems. Experimental results
suggest that our proposal is able to remove scalability
collapse completely and sustains the maximal throughput
of the spin-lock-based system for most applications.
Furthermore, the percentage of lock-waiting time can be
reduced by up to 84\%. When compared with scalability
collapse reduction methods such as requester-based
locking scheme and sleeping-based synchronization
primitives, our scheme exhibits significant advantages
in scalability, power consumption, and energy
efficiency.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pusukuri:2013:AFC,
author = "Kishore Kumar Pusukuri and Rajiv Gupta and Laxmi N.
Bhuyan",
title = "{ADAPT}: a framework for coscheduling multithreaded
programs",
journal = j-TACO,
volume = "9",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400704",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Since multicore systems offer greater performance via
parallelism, future computing is progressing towards
use of multicore machines with a large number of cores.
However, the performance of emerging multithreaded
programs often does not scale to fully utilize the
available cores. Therefore, simultaneously running
multiple multithreaded applications becomes inevitable
to fully exploit the computing potential of such
machines. However, maximizing the performance and
throughput on multicore machines in the presence of
multiple multithreaded programs is a challenge for the
OS. We have observed that the state-of-the-art
contention management algorithms fail to effectively
coschedule multithreaded programs on multicore
machines. To address the above challenge, we present
ADAPT, a scheduling framework that continuously
monitors the resource usage of multithreaded programs
and adaptively coschedules them such that they
interfere with each other's performance as little as
possible. In addition, ADAPT selects appropriate memory
allocation and scheduling policies according to the
workload characteristics. We have implemented ADAPT on
a 64-core Supermicro server running Solaris 11 and
evaluated it using 26 multithreaded programs including
the TATP database application, SPECjbb2005, and
programs from Phoenix, PARSEC, and SPEC OMP suites. The
experimental results show that ADAPT substantially
improves total turnaround time and system utilization
relative to the default Solaris 11 scheduler.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tartara:2013:CLC,
author = "Michele Tartara and Stefano Crespi Reghizzi",
title = "Continuous learning of compiler heuristics",
journal = j-TACO,
volume = "9",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400705",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Optimizing programs to exploit the underlying hardware
architecture is an important task. Much research has
been done on enabling compilers to find the best set of
code optimizations that can build the fastest and least
resource-hungry executable for a given program. A
common approach is iterative compilation, sometimes
enriched by machine learning techniques. This provides
good results, but requires extremely long compilation
times and an initial training phase lasting for days or
even weeks. We present long-term learning, a new
algorithm that allows the compiler user to improve the
performance of compiled programs with reduced
compilation times with respect to iterative
compilation, and without an initial training phase. Our
algorithm does not just build good programs: it
acquires knowledge every time a program is compiled and
it uses such knowledge to learn compiler heuristics,
without the need for an expert to manually define them.
The heuristics are evolved during every compilation, by
evaluating their effect on the generated programs. We
present implementations of long-term learning on top of
two different compilers, and experimental data gathered
on multiple hardware configurations showing its
effectiveness.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chrysos:2013:HCP,
author = "Grigorios Chrysos and Panagiotis Dagritzikos and
Ioannis Papaefstathiou and Apostolos Dollas",
title = "{HC-CART}: a parallel system implementation of data
mining classification and regression tree {(CART)}
algorithm on a multi-{FPGA} system",
journal = j-TACO,
volume = "9",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400706",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Data mining is a new field of computer science with a
wide range of applications. Its goal is to extract
knowledge from massive datasets in a
human-understandable structure, for example, decision
trees. In this article we present an
innovative, high-performance, system-level architecture
for the Classification And Regression Tree (CART)
algorithm, one of the most important and widely used
algorithms in the data mining area. Our proposed
architecture exploits parallelism at the decision
variable level, and was fully implemented and evaluated
on a modern high-performance reconfigurable platform,
the Convey HC-1 server, that features four FPGAs and a
multicore processor. Our FPGA-based implementation was
integrated with the widely used ``rpart'' software
library of the R project in order to provide the first
fully functional reconfigurable system that can handle
large real-world databases. The proposed system, named
HC-CART, achieves a performance speedup of up to
two orders of magnitude compared to well-known
single-threaded data mining software platforms, such as
WEKA and the R platform. It also outperforms similar
hardware systems which implement parts of the complete
application by an order of magnitude. Finally, we show
that the HC-CART system offers higher speedups
than some other proposed parallel software
implementations of decision tree construction
algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2013:DCD,
author = "Jongwon Lee and Yohan Ko and Kyoungwoo Lee and Jonghee
M. Youn and Yunheung Paek",
title = "Dynamic code duplication with vulnerability awareness
for soft error detection on {VLIW} architectures",
journal = j-TACO,
volume = "9",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400707",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Soft errors are becoming a critical concern in
embedded system designs. Code duplication techniques
have been proposed to increase the reliability in
multi-issue embedded systems such as VLIW by exploiting
empty slots for duplicated instructions. However, they
increase code size, another important concern, and
ignore vulnerability differences among instructions,
causing unnecessary or inefficient protection when
selecting instructions to be duplicated under
constraints. In this article, we propose a
compiler-assisted dynamic code duplication method to
minimize the code size overhead, and present
vulnerability-aware duplication algorithms to maximize
the effectiveness of instruction duplication with least
overheads for VLIW architecture. Our experimental
results with SoarGen and Synopsys simulation
environments demonstrate that our proposals can reduce
the code size by up to 40\% and detect up to 82\% more
soft errors in fault-injection experiments over
benchmarks from DSPstone and Livermore Loops as
compared to the previously proposed instruction
duplication technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Coelho:2013:ACI,
author = "Fabien Coelho and Fran{\c{c}}ois Irigoin",
title = "{API} compilation for image hardware accelerators",
journal = j-TACO,
volume = "9",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400708",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present an API-based compilation strategy to
optimize image applications, developed using a
high-level image processing library, for three
different image processing hardware accelerators. We
demonstrate that such a strategy is profitable for both
development cost and overall performance, especially as
it takes advantage of optimization opportunities across
library calls otherwise beyond reach. The library API
provides the semantics of the image computations. The
three image accelerator targets are quite distinct: the
first one uses a vector architecture; the second one
presents an SIMD architecture; the last one runs both
on GPGPU and multicores through OpenCL. We have adapted
standard compilation techniques to perform these
compilation and code generation tasks automatically.
Our strategy is implemented in PIPS, a source-to-source
compiler which greatly reduces the development cost as
standard phases are reused and parameterized. We
carried out experiments with applications on hardware
functional simulators and GPUs. Our contributions
include: (1) a general low-cost compilation strategy
for image processing applications, based on the
semantics provided by library calls, which improves
locality by an order of magnitude; (2) specific
heuristics to minimize execution time on the target
accelerators; (3) numerous experiments that show the
effectiveness of our strategies. We also discuss the
conditions required to extend this approach to other
application domains.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luque:2013:FCT,
author = "Carlos Luque and Miquel Moreto and Francisco J.
Cazorla and Mateo Valero",
title = "Fair {CPU} time accounting in {CMP+SMT} processors",
journal = j-TACO,
volume = "9",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400709",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Processor architectures combining several paradigms of
Thread-Level Parallelism (TLP), such as CMP processors
in which each core is SMT, are becoming more and more
popular as a way to improve performance at a moderate
cost. However, the complex interaction between running
tasks in hardware shared resources in multi-TLP
architectures introduces complexities when accounting
CPU time (or CPU utilization) to tasks. The CPU
utilization accounted to a task depends on both the
time it runs in the processor and the amount of
processor hardware resources it receives. Deploying
systems with accurate CPU accounting mechanisms is
necessary to increase fairness. Moreover, it will allow
users to be fairly charged on a shared data center,
facilitating server consolidation in future systems. In
this article we analyze the accuracy and hardware cost
of previous CPU accounting mechanisms for pure-CMP and
pure-SMT processors and we show that they are not
adequate for CMP+SMT processors. Consequently, we
propose a new accounting mechanism for CMP+SMT
processors which: (1) increases the accuracy of
accounted CPU utilization; (2) provides much more
stable results over a wide range of processor setups;
and (3) does not require tracking all hardware shared
resources, significantly reducing its implementation
cost. In particular, previous proposals lead to
inaccuracies between 21\% and 79\% when measuring CPU
utilization in an 8-core 2-way SMT processor, while our
proposal reduces this inaccuracy to less than 5.0\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mattheakis:2013:SRM,
author = "Pavlos M. Mattheakis and Ioannis Papaefstathiou",
title = "Significantly reducing {MPI} intercommunication
latency and power overhead in both embedded and {HPC}
systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400710",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Highly parallel systems are becoming mainstream in a
wide range of sectors ranging from their traditional
stronghold high-performance computing, to data centers
and even embedded systems. However, despite the quantum
leaps of improvements in cost and performance of
individual components over the last decade (e.g.,
processor speeds, memory/interconnection bandwidth,
etc.), system manufacturers are still struggling to
deliver low-latency, highly scalable solutions. One of
the main reasons is that the intercommunication latency
grows significantly with the number of processor nodes.
This article presents a novel way to reduce this
intercommunication delay by implementing, in custom
hardware, certain communication tasks. In particular,
the proposed novel device implements the two most
widely used procedures of the most popular
communication protocol in parallel systems the Message
Passing Interface (MPI). Our novel approach has
initially been simulated within a pioneering parallel
systems simulation framework and then synthesized
directly from a high-level description language (i.e.,
SystemC) using a state-of-the-art synthesis tool. To
the best of our knowledge, this is the first article
presenting the complete hardware implementation of such
a system. The proposed novel approach achieves a
speedup of one to four orders of magnitude when
compared with conventional software-based solutions, and
of one to three orders of magnitude when compared
with a sophisticated software-based approach. Moreover,
the performance of our system is one to two orders
of magnitude higher than the simulated performance of a
similar but relatively simpler hardware architecture;
at the same time the power consumption of our device is
about two orders of magnitude lower than that of a
low-power CPU when executing the exact same
intercommunication tasks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Baghdadi:2013:ILT,
author = "Riyadh Baghdadi and Albert Cohen and Sven Verdoolaege
and Konrad Trifunovi{\'c}",
title = "Improved loop tiling based on the removal of spurious
false dependences",
journal = j-TACO,
volume = "9",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400711",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To preserve the validity of loop nest transformations
and parallelization, data dependences need to be
analyzed. Memory dependences come in two varieties:
true dependences or false dependences. While true
dependences must be satisfied in order to preserve the
correct order of computations, false dependences are
induced by the reuse of a single memory location to
store multiple values. False dependences reduce the
degrees of freedom for loop transformations. In
particular, loop tiling is severely limited in the
presence of these dependences. While array expansion
removes all false dependences, the overhead on memory
and the detrimental impact on register-level reuse can
be catastrophic. We propose and evaluate a compilation
technique to safely ignore a large number of false
dependences in order to enable loop nest tiling in the
polyhedral model. It is based on the precise
characterization of interferences between live range
intervals, and it does not incur any scalar or array
expansion. Our algorithms have been implemented in the
Pluto polyhedral compiler, and evaluated on the
PolyBench suite.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Pop:2013:OED,
author = "Antoniu Pop and Albert Cohen",
title = "{OpenStream}: Expressiveness and data-flow compilation
of {OpenMP} streaming programs",
journal = j-TACO,
volume = "9",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400712",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present OpenStream, a data-flow extension of OpenMP
to express dynamic dependent tasks. The language
supports nested task creation, modular composition,
variable and unbounded sets of producers/consumers, and
first-class streams. These features, enabled by our
original compilation flow, allow translating high-level
parallel programming patterns, like dependences arising
from StarSs' array regions, or universal low-level
primitives like futures. In particular, these dynamic
features can be embedded efficiently and naturally into
an unmanaged imperative language, avoiding the
complexity and overhead of a concurrent garbage
collector. We demonstrate the performance advantages of
a data-flow execution model compared to more restricted
task and barrier models. We also demonstrate the
efficiency of our compilation and runtime algorithms
for the support of complex dependence patterns arising
from StarSs benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Verdoolaege:2013:PPC,
author = "Sven Verdoolaege and Juan Carlos Juega and Albert
Cohen and Jos{\'e} Ignacio G{\'o}mez and Christian
Tenllado and Francky Catthoor",
title = "Polyhedral parallel code generation for {CUDA}",
journal = j-TACO,
volume = "9",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400713",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article addresses the compilation of a sequential
program for parallel execution on a modern GPU. To this
end, we present a novel source-to-source compiler
called PPCG. PPCG stands out for its ability to
accelerate computations from any static control loop
nest, generating multiple CUDA kernels when necessary.
We introduce a multilevel tiling strategy and a code
generation scheme for the parallelization and locality
optimization of imperfectly nested loops, managing
memory and exposing concurrency according to the
constraints of modern GPUs. We evaluate our algorithms
and tool on the entire PolyBench suite.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
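For readers unfamiliar with the transformation PPCG automates, one level of loop tiling looks roughly like the following sketch (ours, in Python; PPCG itself emits CUDA, where tile loops become thread blocks):

    # Hypothetical sketch of one tiling level: the (i, j) iteration
    # space is walked tile by tile; in PPCG-generated CUDA the tile
    # loops map to blocks and the point loops to threads.
    def matvec_tiled(A, x, tile=4):
        n = len(A)
        y = [0.0] * n
        for ii in range(0, n, tile):                      # tile loops
            for jj in range(0, n, tile):
                for i in range(ii, min(ii + tile, n)):    # point loops
                    for j in range(jj, min(jj + tile, n)):
                        y[i] += A[i][j] * x[j]
        return y

    print(matvec_tiled([[1, 2], [3, 4]], [1.0, 1.0], tile=1))  # [3.0, 7.0]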
@Article{Du:2013:DCC,
author = "Yu Du and Miao Zhou and Bruce Childers and Rami Melhem
and Daniel Moss{\'e}",
title = "Delta-compressed caching for overcoming the write
bandwidth limitation of hybrid main memory",
journal = j-TACO,
volume = "9",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400714",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Limited PCM write bandwidth is a critical obstacle to
achieve good performance from hybrid DRAM/PCM memory
systems. The write bandwidth is severely restricted in
PCM devices, which harms application performance.
Indeed, as we show, it is more important to reduce PCM
write traffic than to reduce PCM read latency for
application performance. To reduce the number of PCM
writes, we propose a DRAM cache organization that
employs compression. A new delta compression technique
for modified data is used to achieve a large
compression ratio. Our approach can selectively and
predictively apply compression to improve its
efficiency and performance. Our approach is designed to
facilitate adoption in existing main memory compression
frameworks. We describe an instance of how to
incorporate delta compression in IBM's MXT memory
compression architecture when used for DRAM cache in a
hybrid main memory. For fourteen representative
memory-intensive workloads, on average, our delta
compression technique reduces the number of PCM writes
by 54.3\%, and improves IPC performance by 24.4\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
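The delta idea fits in a few lines; the sketch below (our own simplification, not IBM's MXT) encodes a dirty line as the byte positions that differ from the copy already in PCM, falling back to a full write when that does not pay:

    # Hypothetical sketch: encode a dirty cache line as (offset, byte)
    # pairs relative to the old PCM copy, so a mostly unchanged line
    # costs a few bytes of write traffic instead of a full line.
    def delta_encode(old, new):
        delta = [(i, b) for i, (a, b) in enumerate(zip(old, new)) if a != b]
        # assume ~2 bytes per pair; write the full line if cheaper
        return ("delta", delta) if 2 * len(delta) < len(new) else ("full", new)

    def delta_decode(old, enc):
        kind, payload = enc
        if kind == "full":
            return payload
        line = bytearray(old)
        for i, b in payload:
            line[i] = b
        return bytes(line)

    old = bytes(64)
    new = bytearray(old); new[3] = 0xFF
    enc = delta_encode(old, bytes(new))
    assert delta_decode(old, enc) == bytes(new)   # round-trips, tiny write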
@Article{Purini:2013:FGO,
author = "Suresh Purini and Lakshya Jain",
title = "Finding good optimization sequences covering program
space",
journal = j-TACO,
volume = "9",
number = "4",
pages = "56:1--56:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400715",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The compiler optimizations we enable and the order in
which we apply them on a program have a substantial
impact on the program execution time. Compilers provide
default optimization sequences which can give good
program speedup. As the default sequences have to
optimize programs with different characteristics, they
embed in them multiple subsequences which can optimize
different classes of programs. These multiple
subsequences may falsely interact with each other and
affect the potential program speedup achievable.
Instead of searching for a single universally optimal
sequence, we can construct a small set of good
sequences such that for every program class there
exists a near-optimal optimization sequence in the good
sequences set. If we can construct such a good
sequences set that covers all the program classes in
the program space, then we can choose the best sequence
for a program by trying all the sequences in the good
sequences set. This approach completely circumvents the
need to solve the program classification problem. Using
a sequence set size of around 10, we obtained average
speedups of up to 14\% on PolyBench programs and up to
12\% on MiBench programs. Our approach is quite different
from either the iterative compilation or
machine-learning-based prediction modeling techniques
proposed in the literature so far. We use different
training and test datasets for cross-validation, as
opposed to the Leave-One-Out cross-validation technique.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
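The selection step described above reduces to a small exhaustive search once the good-sequences set is fixed; a sketch with placeholder names (compile_and_time stands in for the authors' measurement harness):

    # Hypothetical sketch: with ~10 precomputed good sequences, pick
    # the best one for a new program by trying them all -- no program
    # classification is needed.
    def best_sequence(program, good_sequences, compile_and_time):
        return min(good_sequences,
                   key=lambda seq: compile_and_time(program, seq))

    timings = {"O2": 1.00, "O3 unroll": 0.88, "O3 vectorize": 0.91}
    fake_timer = lambda prog, seq: timings[seq]   # toy stand-in
    print(best_sequence("mm.c", list(timings), fake_timer))  # O3 unroll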
@Article{Belviranli:2013:DSS,
author = "Mehmet E. Belviranli and Laxmi N. Bhuyan and Rajiv
Gupta",
title = "A dynamic self-scheduling scheme for heterogeneous
multiprocessor architectures",
journal = j-TACO,
volume = "9",
number = "4",
pages = "57:1--57:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400716",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Today's heterogeneous architectures bring together
multiple general-purpose CPUs and multiple
domain-specific GPUs and FPGAs to provide dramatic
speedup for many applications. However, the challenge
lies in utilizing these heterogeneous processors to
optimize overall application performance by minimizing
workload completion time. Operating system and
application development for these systems are still in
their infancy. In this article, we propose a new scheduling
and workload balancing scheme, HDSS, for execution of
loops having dependent or independent iterations on
heterogeneous multiprocessor systems. The new algorithm
dynamically learns the computational power of each
processor during an adaptive phase and then schedules
the remainder of the workload using a weighted
self-scheduling scheme during the completion phase.
Different from previous studies, our scheme uniquely
considers the runtime effects of block sizes on the
performance for heterogeneous multiprocessors. It finds
the right trade-off between large and small block sizes
to maintain balanced workload while keeping the
accelerator utilization at maximum. Our algorithm does
not require offline training or architecture-specific
parameters. We have evaluated our scheme on two
different heterogeneous architectures: AMD 64-core
Bulldozer system with nVidia Fermi C2050 GPU and Intel
Xeon 32-core SGI Altix 4700 supercomputer with Xilinx
Virtex 4 FPGAs. The experimental results show that our
new scheduling algorithm can achieve performance
improvements up to over 200\% when compared to the
closest existing load balancing scheme. Our algorithm
also achieves full processor utilization with all
processors completing at nearly the same time, which is
significantly better than current alternative
approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
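The completion phase described above amounts to a proportional split of the remaining iterations; a simplified sketch (ours; the real HDSS also reasons about block sizes):

    # Hypothetical sketch: after an adaptive phase measures each
    # processor's rate (iterations/second), split the remaining
    # iterations in proportion to those weights.
    def weighted_split(remaining, measured_rates):
        total = sum(measured_rates.values())
        shares = {dev: int(remaining * r / total)
                  for dev, r in measured_rates.items()}
        fastest = max(measured_rates, key=measured_rates.get)
        shares[fastest] += remaining - sum(shares.values())  # rounding
        return shares

    print(weighted_split(10_000, {"cpu": 1_200.0, "gpu": 8_800.0}))
    # {'cpu': 1200, 'gpu': 8800}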
@Article{Negi:2013:SCF,
author = "Anurag Negi and Ruben Titos-Gil",
title = "{SCIN-cache}: Fast speculative versioning in
multithreaded cores",
journal = j-TACO,
volume = "9",
number = "4",
pages = "58:1--58:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400717",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article describes cache designs for efficiently
supporting speculative techniques like transactional
memory on chip multiprocessors with multithreaded
cores. On-demand allocation and prompt freeing of
speculative cache space in the design reduce the
burden on nonspeculative execution. Quick access to
both clean and speculative versions of data for
multiple contexts provides flexibility and greater
design freedom to HTM architects. Performance analysis
shows the designs stand up well against other HTM
design proposals, with potential performance gains in
high contention applications with small transactions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lutz:2013:PAF,
author = "Thibaut Lutz and Christian Fensch and Murray Cole",
title = "{PARTANS}: an autotuning framework for stencil
computation on multi-{GPU} systems",
journal = j-TACO,
volume = "9",
number = "4",
pages = "59:1--59:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400718",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPGPUs are a powerful and energy-efficient solution
for many problems. For higher performance or larger
problems, it is necessary to distribute the problem
across multiple GPUs, increasing the already high
programming complexity. In this article, we focus on
abstracting the complexity of multi-GPU programming for
stencil computation. We show that the best strategy
depends not only on the stencil operator, problem size,
and GPU, but also on the PCI Express layout. This adds
nonuniform characteristics to a seemingly homogeneous
setup, causing up to 23\% performance loss. We address
this issue with an autotuner that optimizes the
distribution across multiple GPUs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
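Because the PCI Express topology effects resist simple modeling, the tuner measures candidate partitions directly; in outline (our sketch, with run_stencil as a placeholder for a timed multi-GPU run):

    # Hypothetical sketch: try candidate row splits across GPUs and
    # keep the fastest, absorbing nonuniform PCIe behavior without
    # modeling it explicitly.
    def autotune_split(run_stencil, n_rows, candidates):
        best, best_time = None, float("inf")
        for split in candidates:                 # fractions per GPU
            rows = [int(n_rows * f) for f in split]
            rows[-1] += n_rows - sum(rows)       # fix rounding
            t = run_stencil(rows)                # measured wall time
            if t < best_time:
                best, best_time = rows, t
        return best

    fake_run = lambda rows: max(rows) * 1e-3     # toy cost model
    print(autotune_split(fake_run, 1000, [(0.5, 0.5), (0.6, 0.4)]))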
@Article{Xiao:2013:SAT,
author = "Chunhua Xiao and M-C. Frank Chang and Jason Cong and
Michael Gill and Zhangqin Huang and Chunyue Liu and
Glenn Reinman and Hao Wu",
title = "Stream arbitration: Towards efficient bandwidth
utilization for emerging on-chip interconnects",
journal = j-TACO,
volume = "9",
number = "4",
pages = "60:1--60:??",
month = jan,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2400682.2400719",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Jan 18 10:57:16 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Alternative interconnects are attractive for scaling
on-chip communication bandwidth in a power-efficient
manner. However, efficient utilization of the bandwidth
provided by these emerging interconnects still remains
an open problem due to the spatial and temporal
communication heterogeneity. In this article, a Stream
Arbitration scheme is proposed, where at runtime any
source can compete for any communication channel of the
interconnect to talk to any destination. We apply
stream arbitration to radio frequency interconnect
(RF-I). Experimental results show that compared to the
representative token arbitration scheme, stream
arbitration can provide an average 20\% performance
improvement and 12\% power reduction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:DRU,
author = "Yunji Chen and Tianshi Chen and Ling Li and Ruiyang Wu
and Daofu Liu and Weiwu Hu",
title = "Deterministic Replay Using Global Clock",
journal = j-TACO,
volume = "10",
number = "1",
pages = "1:1--1:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445573",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Debugging parallel programs is a well-known difficult
problem. A promising method to facilitate debugging
parallel programs is using hardware support to achieve
deterministic replay on a Chip Multi-Processor (CMP).
As a Design-For-Debug (DFD) feature, a practical
hardware-assisted deterministic replay scheme should
have low design and verification costs, as well as a
small log size. To achieve these goals, we propose a
novel and succinct hardware-assisted deterministic
replay scheme named LReplay. The key innovation of
LReplay is that instead of recording the logical time
orders between instructions or instruction blocks as
previous investigations, LReplay is built upon
recording the pending period information infused by the
global clock. With the recorded pending period
information, about 99\% of execution orders are
inferrable, implying that LReplay only needs to
directly record the residual 1\% of noninferrable
execution orders in a production run. These noninferrable orders
can be addressed by a simple yet cost-effective
direction prediction technique, which further reduces
the log size of LReplay. Benefiting from the preceding
innovations, the overall log size of LReplay over
SPLASH-2 benchmarks is about 0.17B/K-Inst (byte per
k-instruction) for the sequential consistency, and
0.57B/K-Inst for the Godson-3 consistency. Such log
sizes are an order of magnitude smaller than those of
previous deterministic replay schemes that incur no
performance loss. Furthermore, LReplay consumes only
about 0.5\% of the area of the Godson-3 CMP, since it requires
only trivial modifications to existing components of
Godson-3. The features of LReplay demonstrate the
potential of integrating hardware support for
deterministic replay into future industrial
processors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lustig:2013:TIC,
author = "Daniel Lustig and Abhishek Bhattacharjee and Margaret
Martonosi",
title = "{TLB} Improvements for Chip Multiprocessors:
Inter-Core Cooperative Prefetchers and Shared
Last-Level {TLBs}",
journal = j-TACO,
volume = "10",
number = "1",
pages = "2:1--2:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445574",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Translation Lookaside Buffers (TLBs) are critical to
overall system performance. Much past research has
addressed uniprocessor TLBs, lowering access times and
miss rates. However, as Chip MultiProcessors (CMPs)
become ubiquitous, TLB design and performance must be
reevaluated. Our article begins by performing a
thorough TLB performance evaluation of sequential and
parallel benchmarks running on a real-world, modern CMP
system using hardware performance counters. This
analysis demonstrates the need for further improvement
of TLB hit rates for both classes of application, and
it also points out that the data TLB has a
significantly higher miss rate than the instruction TLB
in both cases. In response to the characterization
data, we propose and evaluate both Inter-Core
Cooperative (ICC) TLB prefetchers and Shared Last-Level
(SLL) TLBs as alternatives to the commercial norm of
private, per-core L2 TLBs. ICC prefetchers eliminate
19\% to 90\% of Data TLB (D-TLB) misses across parallel
workloads while requiring only modest changes in
hardware. SLL TLBs eliminate 7\% to 79\% of D-TLB
misses for parallel workloads and 35\% to 95\% of D-TLB
misses for multiprogrammed sequential workloads. This
corresponds to 27\% and 21\% increases in hit rates as
compared to private, per-core L2 TLBs, respectively,
and this is achieved using even more modest hardware
requirements. Because of their benefits for parallel
applications, their applicability to sequential
workloads, and their readily implementable hardware,
SLL TLBs and ICC TLB prefetchers hold great promise for
CMPs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
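An SLL TLB is structurally just a second-level TLB probed by all cores, so a translation filled by one core can later hit for its siblings; a toy model (ours, with FIFO replacement for brevity):

    # Hypothetical sketch of a shared last-level TLB: on a private-L1
    # miss, every core probes the same structure; fills are visible
    # to all cores, which is what helps parallel workloads.
    class SharedLastLevelTLB:
        def __init__(self, capacity):
            self.capacity = capacity
            self.entries = {}          # vpn -> pfn
            self.order = []            # FIFO eviction, for brevity

        def lookup(self, vpn, walk_page_table):
            if vpn in self.entries:
                return self.entries[vpn], True   # SLL hit
            pfn = walk_page_table(vpn)           # miss: walk, then fill
            if len(self.order) >= self.capacity:
                self.entries.pop(self.order.pop(0))
            self.entries[vpn] = pfn
            self.order.append(vpn)
            return pfn, False

    tlb = SharedLastLevelTLB(capacity=512)
    walk = lambda vpn: vpn + 0x1000              # stand-in page walk
    print(tlb.lookup(0x42, walk))  # miss on first touch
    print(tlb.lookup(0x42, walk))  # hit, even from another core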
@Article{Chen:2013:TME,
author = "Rong Chen and Haibo Chen",
title = "{Tiled-MapReduce}: Efficient and Flexible {MapReduce}
Processing on Multicore with Tiling",
journal = j-TACO,
volume = "10",
number = "1",
pages = "3:1--3:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445575",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The prevalence of chip multiprocessors opens
opportunities of running data-parallel applications
originally in clusters on a single machine with many
cores. MapReduce, a simple and elegant programming
model to program large-scale clusters, has recently
been shown a promising alternative to harness the
multicore platform. The differences such as memory
hierarchy and communication patterns between clusters
and multicore platforms raise new challenges to design
and implement an efficient MapReduce system on
multicore. This article argues that, on shared-memory
multicore platforms, it is more efficient for MapReduce
to iteratively process small chunks of data in turn
than to process one large chunk of data at a time.
Based on the argument, we extend the general MapReduce
programming model with a ``tiling strategy'', called
Tiled-MapReduce (TMR). TMR partitions a large
MapReduce job into a number of small subjobs and
iteratively processes one subjob at a time with
efficient use of resources; TMR finally merges the
results of all subjobs for output. Based on
Tiled-MapReduce, we design and implement several
optimizing techniques targeting multicore, including
the reuse of the input buffer among subjobs, a
NUCA/NUMA-aware scheduler, and pipelining a subjob's
reduce phase with the successive subjob's map phase, to
optimize the memory, cache, and CPU resources
accordingly. Further, we demonstrate that
Tiled-MapReduce supports fine-grained fault tolerance
and enables several usage scenarios such as online and
incremental computing on multicore machines.
Performance evaluation with our prototype system called
Ostrich on a 48-core machine shows that Ostrich saves
up to 87.6\% memory, causes less cache misses, and
makes more efficient use of CPU cores, resulting in a
speedup ranging from 1.86x to 3.07x over Phoenix.
Ostrich also efficiently supports fine-grained fault
tolerance, online, and incremental computing with small
performance penalty.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
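The tiling strategy itself is compact; the sketch below (ours, not the Ostrich code) processes one cache-sized subjob at a time and merges the partial reductions, which is the core of TMR:

    # Hypothetical sketch of Tiled-MapReduce: iterate over small
    # subjobs so each one's intermediate data stays cache resident,
    # then merge the per-subjob results.
    from collections import Counter

    def tiled_mapreduce(records, map_fn, tile_size):
        merged = Counter()
        for start in range(0, len(records), tile_size):
            tile = records[start:start + tile_size]   # one subjob
            partial = Counter()
            for rec in tile:                          # map + in-tile reduce
                for key, val in map_fn(rec):
                    partial[key] += val
            merged.update(partial)                    # cross-subjob merge
        return merged

    words = "a b a c b a".split()
    print(tiled_mapreduce(words, lambda w: [(w, 1)], tile_size=2))
    # Counter({'a': 3, 'b': 2, 'c': 1})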
@Article{Becchi:2013:DTS,
author = "Michela Becchi and Patrick Crowley",
title = "{A-DFA}: a Time- and Space-Efficient {DFA} Compression
Algorithm for Fast Regular Expression Evaluation",
journal = j-TACO,
volume = "10",
number = "1",
pages = "4:1--4:26",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445576",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern network intrusion detection systems need to
perform regular expression matching at line rate in
order to detect the occurrence of critical patterns in
packet payloads. While Deterministic Finite Automata
(DFAs) allow this operation to be performed in linear
time, they may exhibit prohibitive memory requirements.
Kumar et al. [2006a] have proposed Delayed Input DFAs
(D2FAs), which provide a trade-off between the memory
requirements of the compressed DFA and the number of
states visited for each character processed, which in
turn affects the memory bandwidth required to evaluate
regular expressions. In this article we introduce
Amortized time-bandwidth overhead DFAs (A-DFAs), a
general compression technique that results in at most
$N(k + 1)/k$ state traversals when processing a string
of length $N$, $k$ being a positive integer. In
comparison to the D2FA approach, our technique achieves
comparable levels of compression with lower provable
bounds on memory bandwidth (or greater compression for
a given bandwidth bound). Moreover, the A-DFA algorithm
has lower complexity, can be applied during DFA
creation, and is suitable for scenarios where a
compressed DFA needs to be dynamically built or
updated. Finally, we show how to combine A-DFA with
alphabet reduction and multistride DFAs, two techniques
aimed at reducing the memory space and bandwidth
requirement of DFAs, and discuss memory encoding
schemes suitable for A-DFAs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
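The compression trick A-DFAs share with D2FAs is to keep, per state, only a sparse set of labeled transitions plus a default transition that is chased when a character is missing; the amortized bound above limits how long those default chains can make a lookup. A minimal sketch (ours):

    # Hypothetical sketch of default-transition compression: unmatched
    # characters follow default edges, trading memory for extra state
    # traversals (A-DFA bounds these to N(k+1)/k for an N-char input).
    def run_dfa(labeled, default, start, text):
        state, traversals = start, 0
        for ch in text:
            while ch not in labeled[state]:
                state = default[state]        # chase default transition
                traversals += 1
            state = labeled[state][ch]
            traversals += 1
        return state, traversals

    labeled = {0: {"a": 1}, 1: {"b": 0}, 2: {"a": 1, "b": 0}}
    default = {0: 2, 1: 2, 2: 2}               # state 2 is fully specified
    print(run_dfa(labeled, default, 0, "ab"))  # (0, 2): no default taken
    print(run_dfa(labeled, default, 0, "bb"))  # (0, 4): defaults add cost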
@Article{Li:2013:MFM,
author = "Sheng Li and Jung Ho Ahn and Richard D. Strong and Jay
B. Brockman and Dean M. Tullsen and Norman P. Jouppi",
title = "The {McPAT} Framework for Multicore and Manycore
Architectures: Simultaneously Modeling Power, Area, and
Timing",
journal = j-TACO,
volume = "10",
number = "1",
pages = "5:1--5:??",
month = apr,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2445572.2445577",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Apr 5 18:36:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article introduces McPAT, an integrated power,
area, and timing modeling framework that supports
comprehensive design space exploration for multicore
and manycore processor configurations ranging from 90nm
to 22nm and beyond. At microarchitectural level, McPAT
includes models for the fundamental components of a
complete chip multiprocessor, including in-order and
out-of-order processor cores, networks-on-chip, shared
caches, and integrated system components such as memory
controllers and Ethernet controllers. At circuit level,
McPAT supports detailed modeling of critical-path
timing, area, and power. At technology level, McPAT
models timing, area, and power for the device types
forecast in the ITRS roadmap. McPAT has a flexible XML
interface to facilitate its use with many performance
simulators. Combined with a performance simulator,
McPAT enables architects to accurately quantify the
cost of new ideas and assess trade-offs of different
architectures using new metrics such as
Energy-Delay-Area$^2$ Product (EDA$^2$P) and
Energy-Delay-Area Product (EDAP). This article explores
the interconnect options of future manycore processors
by varying the degree of clustering over generations of
process technologies. Clustering will bring interesting
trade-offs between area and performance because the
interconnects needed to group cores into clusters incur
area overhead, but many applications can make good use
of them due to synergies from cache sharing. Combining
power, area, and timing results of McPAT with
performance simulation of PARSEC benchmarks for
manycore designs at the 22nm technology shows that
8-core clustering gives the best energy-delay product,
whereas when die area is taken into account, 4-core
clustering gives the best EDA$^2$P and EDAP.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
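The metrics quoted above are plain products of the three quantities McPAT models; a worked sketch with made-up numbers (ours, not McPAT output):

    # EDAP  = energy * delay * area       (Energy-Delay-Area Product)
    # EDA2P = energy * delay * area^2     (penalizes area more heavily)
    def edap(energy_j, delay_s, area_mm2):
        return energy_j * delay_s * area_mm2

    def eda2p(energy_j, delay_s, area_mm2):
        return energy_j * delay_s * area_mm2 ** 2

    # hypothetical numbers for two clusterings of one manycore design
    print(edap(1.0, 2.0e-9, 100.0), eda2p(1.0, 2.0e-9, 100.0))
    print(edap(0.9, 2.1e-9, 120.0), eda2p(0.9, 2.1e-9, 120.0))
    # lower is better for both metrics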
@Article{Kritikakou:2013:NOM,
author = "Angeliki Kritikakou and Francky Catthoor and George S.
Athanasiou and Vasilios Kelefouras and Costas Goutis",
title = "Near-Optimal Microprocessor and Accelerators Codesign
with Latency and Throughput Constraints",
journal = j-TACO,
volume = "10",
number = "2",
pages = "6:1--6:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459317",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A systematic methodology for near-optimal
software/hardware codesign mapping onto an FPGA
platform with microprocessor and HW accelerators is
proposed. The mapping steps deal with the
inter-organization, the foreground memory management,
and the datapath mapping. A step is described by
parameters and equations combined in a scalable
template. Mapping decisions are propagated as design
constraints to prune suboptimal options in next steps.
Several performance-area Pareto points are produced by
instantiating the parameters. To evaluate our
methodology we map a real-time bio-imaging application
and loop-dominated benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2013:HAC,
author = "Lei Jiang and Yu Du and Bo Zhao and Youtao Zhang and
Bruce R. Childers and Jun Yang",
title = "Hardware-Assisted Cooperative Integration of
Wear-Leveling and Salvaging for Phase Change Memory",
journal = j-TACO,
volume = "10",
number = "2",
pages = "7:1--7:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459318",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Phase Change Memory (PCM) has recently emerged as a
promising memory technology. However, PCM's limited
write endurance restricts its immediate use as a
replacement for DRAM. To extend the lifetime of PCM
chips, wear-leveling and salvaging techniques have been
proposed. Wear-leveling balances write operations
across different PCM regions while salvaging extends
the duty cycle and provides graceful degradation for a
nonnegligible number of failures. Current wear-leveling
and salvaging schemes have not been designed and
integrated to work cooperatively to achieve the best
PCM device lifetime. In particular, a noncontiguous PCM
space generated from salvaging complicates
wear-leveling and incurs large overhead. In this
article, we propose LLS, a Line-Level mapping and
Salvaging design. By allocating a dynamic portion of
total space in a PCM device as backup space, and
mapping failed lines to backup PCM, LLS constructs a
contiguous PCM space and masks lower-level failures
from the OS and applications. LLS integrates
wear-leveling and salvaging and copes well with modern
OSes. Our experimental results show that LLS achieves
a 31\% longer lifetime than the state of the art. It has
negligible hardware cost and performance overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
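The line-level mapping can be modeled as a small translation table in front of the PCM address space; a toy sketch (ours, ignoring the dynamic sizing of the backup region):

    # Hypothetical sketch of LLS-style salvaging: failed lines are
    # redirected into a backup region, so the OS keeps seeing a
    # contiguous PCM space.
    class LineRemapper:
        def __init__(self, backup_lines):
            self.remap = {}                        # failed -> backup line
            self.free_backup = list(range(backup_lines))

        def mark_failed(self, line):
            if not self.free_backup:
                raise RuntimeError("backup space exhausted")
            self.remap[line] = self.free_backup.pop()

        def translate(self, line):
            if line in self.remap:
                return ("backup", self.remap[line])
            return ("main", line)

    m = LineRemapper(backup_lines=16)
    m.mark_failed(7)
    print(m.translate(7), m.translate(8))  # ('backup', 15) ('main', 8)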
@Article{Han:2013:PEP,
author = "Kyuseung Han and Junwhan Ahn and Kiyoung Choi",
title = "Power-Efficient Predication Techniques for
Acceleration of Control Flow Execution on {CGRA}",
journal = j-TACO,
volume = "10",
number = "2",
pages = "8:1--8:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459319",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Coarse-grained reconfigurable architecture typically
has an array of processing elements which are
controlled by a centralized unit. This makes it
difficult to execute programs having control divergence
among PEs without predication. However, conventional
predication techniques have a negative impact on both
performance and power consumption due to longer
instruction words and unnecessary instruction fetching,
decoding, and nullifying steps. This article reveals
performance and power issues in predicated execution
which have not been well-addressed yet. Furthermore, it
proposes fast and power-efficient predication
mechanisms. Experiments conducted through gate-level
simulation show that our mechanism improves
energy-delay product by 11.9\% to 23.8\% on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2013:MTD,
author = "Chao Wang and Xi Li and Junneng Zhang and Xuehai Zhou
and Xiaoning Nie",
title = "{MP-Tomasulo}: a Dependency-Aware Automatic Parallel
Execution Engine for Sequential Programs",
journal = j-TACO,
volume = "10",
number = "2",
pages = "9:1--9:??",
month = may,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2459316.2459320",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Wed May 1 16:38:16 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents MP-Tomasulo, a dependency-aware
automatic parallel task execution engine for sequential
programs. Applying the instruction-level Tomasulo
algorithm to MPSoC environments, MP-Tomasulo detects
and eliminates Write-After-Write (WAW) and
Write-After-Read (WAR) inter-task dependencies in the
dataflow execution, thereby enabling out-of-order
task execution on heterogeneous units. We implemented
the prototype system within a single FPGA. Experimental
results on EEMBC applications demonstrate that
MP-Tomasulo can execute the tasks out-of-order to
achieve as high as 93.6\% to 97.6\% of ideal peak
speedup. A comparative study against a state-of-the-art
dataflow execution scheme is illustrated with a classic
JPEG application. The promising results show that
MP-Tomasulo enables programmers to uncover more
task-level parallelism on heterogeneous systems and
eases their programming burden.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
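Eliminating WAW/WAR hazards is classic Tomasulo-style renaming lifted to tasks; the essential step, sketched over plain variables (ours, far simpler than MP-Tomasulo's task arguments):

    # Hypothetical sketch: every write creates a fresh version of its
    # variable, so only true read-after-write dependences remain to
    # constrain out-of-order task issue.
    def rename(tasks):                 # task = (reads, writes)
        version = {}                   # variable -> latest version id
        renamed = []
        for reads, writes in tasks:
            r = [(v, version.get(v, 0)) for v in reads]
            w = []
            for v in writes:
                version[v] = version.get(v, 0) + 1    # fresh destination
                w.append((v, version[v]))
            renamed.append((r, w))
        return renamed

    # two writers of "x" no longer conflict after renaming
    print(rename([((), ("x",)), (("x",), ("y",)), ((), ("x",))]))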
@Article{Anonymous:2013:TR,
author = "Anonymous",
title = "{TACO} Reviewers 2012",
journal = j-TACO,
volume = "10",
number = "3",
pages = "9:1--9:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509421",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shifer:2013:LLA,
author = "Eran Shifer and Shlomo Weiss",
title = "Low-latency adaptive mode transitions and hierarchical
power management in asymmetric clustered cores",
journal = j-TACO,
volume = "10",
number = "3",
pages = "10:1--10:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2499901",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recently, engineering solutions that include
asymmetric multicores have been fabricated for low
form-factor computing devices, indicating a potential
direction for future evolution of processors. In this
article we propose an asymmetric clustered core
architecture, exhibiting low-latency switching between
modes relative to asymmetric multicores while, like the
asymmetric multicore architecture, covering a wide
dynamic range of the processor power-performance
characteristic.
Asymmetric clustered cores incur additional
microarchitectural complexity and area cost inside a
core but exhibit better chip-level integration
characteristics compared to asymmetric multicores.
Focusing on power efficiency of asymmetric clustered
cores, we describe: (1) a hierarchical power management
partitioning between the operating system and on-die
firmware for coarse-grain switch policies, and (2)
core-internal tracking hardware for fine-grain
switching. The mode switch policies of the core's
tracking hardware are dependent on higher-level
directives and hints from the operating system, on-die
firmware, and compiler or profiling software. We
further explore the potential power management benefits
of asymmetric clustered cores relative to asymmetric
multicores, demonstrating that the ability of
asymmetric clustered cores to use tight training
periods for adaptive behavior, with low overhead
switching between modes, results in a more efficient
utilization of power management directives.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{BenAsher:2013:HTL,
author = "Yosi {Ben Asher} and Nadav Rotem",
title = "Hybrid type legalization for a sparse {SIMD}
instruction set",
journal = j-TACO,
volume = "10",
number = "3",
pages = "11:1--11:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509422",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "SIMD vector units implement only a subset of the
operations used by vectorizing compilers, and there are
multiple conflicting techniques to legalize arbitrary
vector types into register-sized data types.
Traditionally, type legalization is performed using a
set of predefined rules, regardless of the operations
used in the program. This method is not suitable to
sparse SIMD instruction sets and often prevents the
vectorization of programs. In this work we introduce a
new technique for type legalization, namely vector
element promotion, as well as a hybrid method for
combining multiple techniques of type legalization. Our
hybrid type legalization method makes decisions based
on the knowledge of the available instruction set as
well as the operations used in the program. Our
experimental results demonstrate that program-dependent
hybrid type legalization improves the execution time of
vector programs, outperforms the existing legalization
method, and allows the vectorization of workloads which
were not vectorized before.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lei:2013:VCI,
author = "Yuanwu Lei and Yong Dou and Lei Guo and Jinbo Xu and
Jie Zhou and Yazhuo Dong and Hongjian Li",
title = "{VLIW} coprocessor for {IEEE-754} quadruple-precision
elementary functions",
journal = j-TACO,
volume = "10",
number = "3",
pages = "12:1--12:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512430",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/fparith.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, a unified VLIW coprocessor, based on
a common group of atomic operation units, for Quad
arithmetic and elementary functions (QP\_VELP) is
presented. The explicitly parallel scheme of VLIW
instruction and Estrin's evaluation scheme for
polynomials are used to improve the performance. A
two-level VLIW instruction RAM scheme is introduced to
achieve high scalability and customizability, even for
more complex key program kernels. Finally, the Quad
arithmetic accelerator (QAA) with the QP\_VELP array is
implemented on ASIC. Compared with a hyper-threaded
software implementation on an Intel Xeon E5620, QAA
with 8 QP\_VELP units achieves an improvement by a
factor of 18.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
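Estrin's evaluation scheme, mentioned above, restructures Horner's recurrence so the partial products are independent and can fill parallel VLIW slots; a scalar sketch (ours):

    # Sketch of Estrin's scheme: combine coefficients pairwise with x,
    # then pairs of pairs with x^2, x^4, ... -- each level's terms are
    # independent, unlike Horner's strictly serial chain.
    def estrin(coeffs, x):             # coeffs[i] multiplies x**i
        level = list(coeffs)
        power = x
        while len(level) > 1:
            if len(level) % 2:
                level.append(0.0)
            level = [level[i] + level[i + 1] * power
                     for i in range(0, len(level), 2)]
            power *= power
        return level[0]

    # p(x) = 1 + 2x + 3x^2 + 4x^3 at x = 2 -> 49
    print(estrin([1.0, 2.0, 3.0, 4.0], 2.0))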
@Article{Kawahito:2013:IRF,
author = "Motohiro Kawahito and Hideaki Komatsu and Takao
Moriyama and Hiroshi Inoue and Toshio Nakatani",
title = "Idiom recognition framework using topological
embedding",
journal = j-TACO,
volume = "10",
number = "3",
pages = "13:1--13:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512431",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Modern processors support hardware-assist instructions
(such as TRT and TROT instructions on the IBM System z)
to accelerate certain functions such as delimiter
search and character conversion. Such special
instructions are often used in high-performance
libraries, but their exploitation in optimizing
compilers has been limited. We devised a new idiom
recognition technique based on a topological embedding
algorithm to detect idiom patterns in the input
programs more aggressively than in previous approaches
using exact pattern matching. Our approach can detect a
pattern even if the code segment does not exactly match
the idiom. For example, we can detect a code segment
that includes additional code within the idiom pattern.
We also propose an instruction simplification for the
idiom recognition. This optimization analyzes all of
the usages of the output of the optimized code for a
specific idiom. If we find that we do not need an
actual value for the output but only a value in a
subrange, then we can assign a value in that subrange
as the output. The code generation can generate faster
code with this optimization. We implemented our new
idiom recognition approach based on the Java
Just-In-Time (JIT) compiler that is part of the J9 Java
Virtual Machine, and we supported several important
idioms for the special hardware-assist instructions on
the IBM System z and on some models of the IBM System
p. To demonstrate the effectiveness of our technique,
we performed two experiments. The first experiment was
to see how many more patterns we can detect compared to
the previous approach. The second experiment measured
the performance improvements over the previous
approaches. For the first experiment, we used the Java
Compatibility Kit (JCK) API tests. For the second
experiment we used the IBM XML parser, SPECjvm98, and
SPECjbb2000. In summary, relative to a baseline
implementation using exact pattern matching, our
algorithm converted 76\% more loops in JCK tests. On a
z9, we also observed significant average performance
improvement of the XML parser by 54\%, of SPECjvm98 by
1.9\%, and of SPECjbb2000 by 4.4\%. Finally, we
observed that the JIT compilation time increased by
only 0.32\% to 0.44\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shobaki:2013:PIS,
author = "Ghassan Shobaki and Maxim Shawabkeh and Najm Eldeen
Abu Rmaileh",
title = "Preallocation instruction scheduling with register
pressure minimization using a combinatorial
optimization approach",
journal = j-TACO,
volume = "10",
number = "3",
pages = "14:1--14:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512432",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Balancing Instruction-Level Parallelism (ILP) and
register pressure during preallocation instruction
scheduling is a fundamentally important problem in code
generation and optimization. The problem is known to be
NP-complete. Many heuristic techniques have been
proposed to solve this problem. However, due to the
inherently conflicting requirements of maximizing ILP
and minimizing register pressure, heuristic techniques
may produce poor schedules in many cases. If such cases
occur in hot code, significant performance degradation
may result. A few combinatorial optimization approaches
have also been proposed, but none of them has been
shown to solve large real-world instances within
reasonable time. This article presents the first
combinatorial algorithm that is efficient enough to
optimally solve large instances of this problem (basic
blocks with hundreds of instructions) within a few
seconds per instance. The proposed algorithm uses
branch-and-bound enumeration with a number of powerful
pruning techniques to efficiently search the solution
space. The search is based on a cost function that
incorporates schedule length and register pressure. An
implementation of the proposed scheduling algorithm has
been integrated into the LLVM Compiler and evaluated
using SPEC CPU 2006. On x86-64, with a time limit of
10ms per instruction, it optimally schedules 79\% of
the hot basic blocks in FP2006. Another 19\% of the
blocks are not optimally scheduled but are improved in
cost relative to LLVM's heuristic. This improves the
execution time of some benchmarks by up to 21\%, with a
geometric-mean improvement of 2.4\% across the entire
benchmark suite. With the use of precise latency
information, the geometric-mean improvement is
increased to 2.8\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{She:2013:EEM,
author = "Dongrui She and Yifan He and Henk Corporaal",
title = "An energy-efficient method of supporting flexible
special instructions in an embedded processor with
compact {ISA}",
journal = j-TACO,
volume = "10",
number = "3",
pages = "15:1--15:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509426",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In application-specific processor design, a common
approach to improve performance and efficiency is to
use special instructions that execute complex operation
patterns. However, in a generic embedded processor with
compact Instruction Set Architecture (ISA), these
special instructions may lead to large overhead such
as: (i) more bits are needed to encode the extra
opcodes and operands, resulting in wider instructions;
(ii) more Register File (RF) ports are required to
provide the extra operands to the function units. Such
overhead may increase energy consumption considerably.
In this article, we propose to support flexible
operation pair patterns in a processor with a compact
24-bit RISC-like ISA using: (i) a partially
reconfigurable decoder that exploits the pattern
locality to reduce opcode space requirements; (ii) a
software-controlled bypass network to reduce operand
encoding bits and RF port requirements. An energy-aware
compiler backend is designed for the proposed
architecture that performs pattern selection and
bypass-aware scheduling to generate energy-efficient
codes. Though the proposed design imposes extra
constraints on the operation patterns, the experimental
results show that for benchmark applications from
different domains, the average dynamic instruction
count is reduced by over 25\%, which is only about 2\%
less than the architecture without such constraints.
The proposed architecture reduces total energy by an
average of 15.8\% compared to the RISC baseline, while
the one without constraints achieves almost no
improvement due to its high overhead. When high
performance is required, the proposed architecture is
able to achieve a speedup of 13.8\% with 13.1\% energy
reduction compared to the baseline by introducing
multicycle SFU operations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nandivada:2013:IBA,
author = "V. Krishna Nandivada and Rajkishore Barik",
title = "Improved bitwidth-aware variable packing",
journal = j-TACO,
volume = "10",
number = "3",
pages = "16:1--16:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2509420.2509427",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Bitwidth-aware register allocation has caught the
attention of researchers aiming to effectively reduce
the number of variables spilled into memory. For
general-purpose processors, this improves the execution
time performance and reduces runtime memory
requirements (which in turn helps in the compilation of
programs targeted to systems with constrained memory).
Additionally, bitwidth-aware register allocation has
been effective in reducing power consumption in
embedded processors. One of the key components of
bitwidth-aware register allocation is the variable
packing algorithm that packs multiple narrow-width
variables into one physical register. Tallam and Gupta
[2003] have proved that optimal variable packing is an
NP-complete problem for arbitrary-width variables and
have proposed an approximate solution. In this article,
we analyze the complexity of the variable packing
problem and present three enhancements that improve the
overall packing of variables. In particular, the
improvements we describe are: (a) Width Static Single
Assignment (W-SSA) form representation that splits the
live range of a variable into several fixed-width live
ranges (W-SSA variables); (b) PoTR representation: the
use of powers-of-two representation for bitwidth
information for W-SSA variables. Our empirical results
have shown that the associated bit wastage resulting
from the overapproximation of the widths of variables
to the nearest next power of two is a small fraction
compared to the total number of bits in use ($ \approx
$ 13\%). The main advantage of this representation is
that it leads to optimal variable packing in polynomial
time; (c) Combined Packing and Coalescing: we
discuss the importance of coalescing (combining
variables whose live ranges do not interfere) in the
context of variable packing and present an iterative
algorithm to perform coalescing and packing of W-SSA
variables represented in PoTR. Our experimental results
show up to 76.00\% decrease in the number of variables
compared to the number of variables in the input
program in Single Static Assignment (SSA) form. This
reduction in the number of variables led to a
significant reduction in dynamic spilling, packing, and
unpacking instructions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
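The reason powers-of-two widths make packing tractable is that the slot sizes divide one another, so a simple first-fit-decreasing pass leaves no fragmentation; a sketch (ours):

    # Hypothetical sketch: with all widths rounded up to powers of
    # two, first-fit-decreasing packs W-SSA variables into registers
    # without fragmentation -- sub-register slots always align.
    def pack_pot(widths, reg_bits=32):
        regs = []                                # free bits per register
        placement = []                           # (width, register index)
        for w in sorted(widths, reverse=True):
            for i, free in enumerate(regs):
                if free >= w:
                    regs[i] = free - w
                    placement.append((w, i))
                    break
            else:
                regs.append(reg_bits - w)
                placement.append((w, len(regs) - 1))
        return len(regs), placement

    n, placed = pack_pot([16, 8, 8, 4, 2, 1, 1])
    print(n, placed)   # 2 registers: 40 bits cannot fit in one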
@Article{Ahn:2013:SHR,
author = "Jung Ho Ahn and Young Hoon Son and John Kim",
title = "Scalable high-radix router microarchitecture using a
network switch organization",
journal = j-TACO,
volume = "10",
number = "3",
pages = "17:1--17:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512433",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As the system size of supercomputers and datacenters
increases, cost-efficient networks become critical in
achieving good scalability on those systems. High
-radix routers reduce network cost by lowering the
network diameter while providing a high bisection
bandwidth and path diversity. The building blocks of
these large-scale networks are the routers or the
switches, and they need to scale with the increasing
port count and increasing pin bandwidth.
However, as the port count increases, the high-radix
router microarchitecture itself needs to scale
efficiently. Hierarchical crossbar switch organization
has been proposed where a single large crossbar used
for a router switch is partitioned into many small
crossbars and overcomes the limitations of conventional
router microarchitecture. Although the organization
provides high performance, it has limited scalability
due to excessive power and area overheads by the wires
and intermediate buffers. In this article, we propose
scalable router microarchitectures that leverage a
network within the switch design of the high-radix
routers themselves. These alternative designs lower the
wiring complexity and buffer requirements. For example,
when a folded-Clos switch is used instead of the
hierarchical crossbar switch for a radix-64 router, it
provides up to 73\%, 58\%, and 87\% reduction in area,
energy-delay product, and energy-delay-area product,
respectively. We also explore more efficient switch
designs by exploiting the traffic-pattern
characteristics of the global network and its impact on
the local network design within the switch for both
folded-Clos and flattened butterfly networks. In
particular, we propose a bilateral butterfly switch
organization that has fewer crossbars and global wires
compared to the topology-agnostic folded-Clos switch
while achieving better low-load latency and equivalent
saturation throughput.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huang:2013:ACM,
author = "Libo Huang and Zhiying Wang and Nong Xiao and Yongwen
Wang and Qiang Dou",
title = "Adaptive communication mechanism for accelerating
{MPI} functions in {NoC}-based multicore processors",
journal = j-TACO,
volume = "10",
number = "3",
pages = "18:1--18:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512434",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multicore designs have emerged as the dominant
organization for future high-performance
microprocessors. Communication in such designs is often
enabled by Networks-on-Chip (NoCs). A new trend in such
architectures is to fit a Message Passing Interface
(MPI) programming model on NoCs to achieve optimal
parallel application performance. A key issue in
designing MPI over NoCs is communication protocol,
which has not been explored in previous research. This
article advocates a hardware-supported communication
mechanism using a protocol-adaptive approach to adjust
to varying NoC configurations (e.g., number of buffers)
and workload behavior (e.g., number of messages). We
propose the ADaptive Communication Mechanism (ADCM), a
hybrid protocol whose behavior ranges from that of
buffered communication, when sufficient buffer space is
available in the receiver, to that of a synchronous
protocol, when buffers in the receiver are limited.
ADCM adapts dynamically by deciding the communication
protocol on a per-request basis using a
local estimate of recent buffer utilization. ADCM
attempts to combine both the advantages of buffered and
synchronous communication modes to achieve enhanced
throughput and performance. Simulations of various
workloads show that the proposed communication
mechanism can be effectively used in future NoC
designs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
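The per-request decision reduces to a threshold test against a locally maintained estimate of receiver buffer occupancy; in outline (our sketch, with an exponentially weighted estimate standing in for ADCM's utilization tracking):

    # Hypothetical sketch of ADCM's hybrid choice: send eagerly
    # (buffered mode) while the receiver is believed to have buffer
    # space, else fall back to a synchronous handshake.
    def choose_protocol(est_buffer_util, threshold=0.75):
        return "buffered" if est_buffer_util < threshold else "synchronous"

    class BufferEstimator:
        def __init__(self, alpha=0.7):
            self.util, self.alpha = 0.0, alpha
        def observe(self, sampled_util):         # recent local sample
            self.util += self.alpha * (sampled_util - self.util)
            return self.util

    est = BufferEstimator()
    for sample in (0.2, 0.5, 0.9, 0.95):
        print(choose_protocol(est.observe(sample)))
    # buffered, buffered, buffered, synchronous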
@Article{Malik:2013:OSG,
author = "Avinash Malik and David Gregg",
title = "Orchestrating stream graphs using model checking",
journal = j-TACO,
volume = "10",
number = "3",
pages = "19:1--19:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512435",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article we use model checking to statically
distribute and schedule Synchronous DataFlow (SDF)
graphs on heterogeneous execution architectures. We
show that model checking is capable of providing an
optimal solution and it arrives at these solutions
faster (in terms of algorithm runtime) than equivalent
ILP formulations. Furthermore, we also show how
different types of optimizations such as task
parallelism, data parallelism, and state sharing can be
included within our framework. Finally, comparison of
our approach with the current state-of-the-art
heuristic techniques shows the pitfalls of these
techniques and gives a glimpse of how these heuristic
techniques can be improved.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2013:UML,
author = "Zheng Wang and Michael F. P. O'Boyle",
title = "Using machine learning to partition streaming
programs",
journal = j-TACO,
volume = "10",
number = "3",
pages = "20:1--20:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512436",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Stream-based parallel languages are a popular way to
express parallelism in modern applications. The
efficient mapping of streaming parallelism to today's
multicore systems is, however, highly dependent on the
program and underlying architecture. We address this by
developing a portable and automatic compiler-based
approach to partitioning streaming programs using
machine learning. Our technique predicts the ideal
partition structure for a given streaming application
using prior knowledge learned offline. Using the
predictor we rapidly search the program space (without
executing any code) to generate and select a good
partition. We applied this technique to standard
StreamIt applications and compared against existing
approaches. On a 4-core platform, our approach achieves
60\% of the best performance found by iteratively
compiling and executing over 3000 different partitions
per program. We obtain, on average, a 1.90$ \times $
speedup over the already tuned partitioning scheme of
the StreamIt compiler. When compared against a
state-of-the-art analytical, model-based approach, we
achieve, on average, a 1.77$ \times $ performance
improvement. By porting our approach to an 8-core
platform, we are able to obtain 1.8$ \times $
improvement over the StreamIt default scheme,
demonstrating the portability of our approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bakhoda:2013:DCN,
author = "Ali Bakhoda and John Kim and Tor M. Aamodt",
title = "Designing on-chip networks for throughput
accelerators",
journal = j-TACO,
volume = "10",
number = "3",
pages = "21:1--21:??",
month = sep,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2512429",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Sep 16 17:20:12 MDT 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As the number of cores and threads in throughput
accelerators such as Graphics Processing Units (GPU)
increases, so does the importance of on-chip
interconnection network design. This article explores
throughput-effective Network-on-Chips (NoC) for future
compute accelerators that employ Bulk-Synchronous
Parallel (BSP) programming models such as CUDA and
OpenCL. A hardware optimization is ``throughput
effective'' if it improves parallel application-level
performance per unit chip area. We evaluate performance
of future looking workloads using detailed closed-loop
simulations modeling compute nodes, NoC, and the DRAM
memory system. We start from a mesh design with
bisection bandwidth balanced to off-chip demand.
Accelerator workloads tend to demand high off-chip
memory bandwidth, which results in a many-to-few traffic
pattern when coupled with expected technology
constraints of slow growth in pins-per-chip. Leveraging
these observations we reduce NoC area by proposing a
``checkerboard'' NoC which alternates between
conventional full routers and half routers with limited
connectivity. Next, we show that increasing network
terminal bandwidth at the nodes connected to DRAM
controllers alleviates a significant fraction of the
remaining imbalance resulting from the many-to-few
traffic pattern. Furthermore, we propose a ``double
checkerboard inverted'' NoC organization which takes
advantage of channel slicing to reduce area while
maintaining the performance improvements of the
aforementioned techniques. This organization also has a
simpler routing mechanism and improves average
application throughput per unit area by 24.3\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jantz:2013:ESM,
author = "Michael R. Jantz and Prasad A. Kulkarni",
title = "Exploring single and multilevel {JIT} compilation
policy for modern machines",
journal = j-TACO,
volume = "10",
number = "4",
pages = "22:1--22:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541229",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Dynamic or Just-in-Time (JIT) compilation is essential
to achieve high-performance emulation for programs
written in managed languages, such as Java and C\#. It
has been observed that a conservative JIT compilation
policy is most effective to obtain good runtime
performance without impeding application progress on
single-core machines. At the same time, it is often
suggested that a more aggressive dynamic compilation
strategy may perform best on modern machines that
provide abundant computing resources, especially with
virtual machines (VMs) that are also capable of
spawning multiple concurrent compiler threads. However,
comprehensive research on the best JIT compilation
policy for such modern processors and VMs is currently
lacking. The goal of this work is to explore the
properties of single-tier and multitier JIT compilation
policies that can enable existing and future VMs to
realize the best program performance on modern
machines. In this work, we design novel experiments and
implement new VM configurations to effectively control
the compiler aggressiveness and optimization levels (if
and when methods are compiled) in the
industry-standard Oracle HotSpot Java VM to achieve
this goal. We find that the best JIT compilation policy
is determined by the nature of the application and the
speed and effectiveness of the dynamic compilers. We
extend earlier results showing the suitability of
conservative JIT compilation on single-core machines
for VMs with multiple concurrent compiler threads. We
show that employing the free compilation resources
(compiler threads and hardware cores) to aggressively
compile more program methods quickly reaches a point of
diminishing returns. At the same time, we also find
that using the free resources to reduce compiler queue
backup (compile selected hot methods early)
significantly benefits program performance, especially
for slower (highly optimizing) JIT compilers. For such
compilers, we observe that accurately prioritizing JIT
method compiles is crucial to realize the most
performance benefit with the smallest hardware budget.
Finally, we show that a tiered compilation policy,
although complex to implement, greatly alleviates the
impact of more and early JIT compilation of programs on
modern machines.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dong:2013:CAC,
author = "Xiangyu Dong and Norman P. Jouppi and Yuan Xie",
title = "A circuit-architecture co-optimization framework for
exploring nonvolatile memory hierarchies",
journal = j-TACO,
volume = "10",
number = "4",
pages = "23:1--23:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541230",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many new memory technologies are available for
building future energy-efficient memory hierarchies. It
is necessary to have a framework that can quickly find
the optimal memory technology at each hierarchy level.
In this work, we first build a circuit-architecture
joint design space exploration framework by combining
RC circuit analysis and Artificial Neural Network
(ANN)-based performance modeling. Then, we use this
framework to evaluate some emerging nonvolatile memory
hierarchies. We demonstrate that a Resistive RAM
(ReRAM)-based cache hierarchy on an 8-core
Chip-Multiprocessor (CMP) system can achieve a 24\%
Energy Delay Product (EDP) improvement and a 36\%
Energy Delay Area Product (EDAP) improvement compared
to a conventional hierarchy with SRAM on-chip caches
and DRAM main memory.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2013:OGE,
author = "Jishen Zhao and Guangyu Sun and Gabriel H. Loh and
Yuan Xie",
title = "Optimizing {GPU} energy efficiency with {$3$D}
die-stacking graphics memory and reconfigurable memory
interface",
journal = j-TACO,
volume = "10",
number = "4",
pages = "24:1--24:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541231",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The performance of graphics processing unit (GPU)
systems is improving rapidly to accommodate the
increasing demands of graphics and high-performance
computing applications. With such a performance
improvement, however, the power consumption of GPU
systems has increased dramatically. Up to 30\% of the
total power of a GPU system is consumed by the graphics memory
itself. Therefore, reducing graphics memory power
consumption is critical to mitigate the power
challenge. In this article, we propose an
energy-efficient reconfigurable 3D die-stacking
graphics memory design that integrates wide-interface
graphics DRAMs side-by-side with a GPU processor on a
silicon interposer. The proposed architecture is a
``3D+2.5D'' system, where the DRAM memory itself is 3D
stacked memory with through-silicon via (TSV), whereas
the integration of DRAM and the GPU processor is
through the interposer solution (2.5D). Since GPU
computing units, memory controllers, and memory are all
integrated in the same package, the number of memory
I/Os is no longer constrained by the package's pin
count. We can reduce the memory power consumption by
scaling down the supply voltage and frequency of memory
interface while maintaining the same or even higher
peak memory bandwidth. In addition, we design a
reconfigurable memory interface that can dynamically
adapt to the requirements of various applications. We
propose two reconfiguration mechanisms to optimize the
GPU system energy efficiency and throughput,
respectively, and thus benefit both memory-intensive
and compute-intensive applications. The experimental
results show that the proposed GPU memory architecture
can effectively improve GPU system energy efficiency by
21\%, without reconfiguration. The reconfigurable
memory interface can further improve the system energy
efficiency by 26\%, and system throughput by 31\% under
a capped system power budget of 240W.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:EMT,
author = "Chien-Chi Chen and Sheng-De Wang",
title = "An efficient multicharacter transition string-matching
engine based on the {Aho--Corasick} algorithm",
journal = j-TACO,
volume = "10",
number = "4",
pages = "25:1--25:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541232",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/string-matching.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "A string-matching engine capable of inspecting
multiple characters in parallel can multiply the
throughput. However, the space required for
implementing a matching engine that can process
multiple characters in parallel generally grows
exponentially with respect to the characters to be
processed in parallel. Based on the Aho--Corasick
algorithm (AC-algorithm), this work presents a novel
multicharacter transition Nondeterministic Finite
Automaton (NFA) approach, called multicharacter AC-NFA,
to allow for the inspection of multiple characters in
parallel. This approach first converts an AC-trie to an
AC-NFA by allowing for the simultaneous activation of
multiple states and then converts the AC-NFA to a
$k$-character AC-NFA by an algorithm with concatenation
operations and assistant transitions. Additionally, the
alignment problem, which occurs while multiple
characters are being inspected in parallel, is solved
using assistant transitions. Moreover, a corresponding
output is provided for each inspected character by
introducing priority multiplexers to determine the
final matching outputs during implementation of the
multicharacter AC-NFA. Consequently, the number of
derived $k$-character transitions grows linearly with
respect to the number $k$. Furthermore, the derived
multicharacter AC-NFA is implemented on FPGAs for
evaluation. The resulting throughput grows
approximately 14 times and the hardware cost grows
about 18 times for 16-character AC-NFA implementation,
as compared with that for 1-character AC-NFA
implementation. The achievable throughput is 21.4Gbps
for the 16-character AC-NFA implementation operating at
a 167.36MHz clock.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Luo:2013:DIH,
author = "Yangchun Luo and Wei-Chung Hsu and Antonia Zhai",
title = "The design and implementation of heterogeneous
multicore systems for energy-efficient speculative
thread execution",
journal = j-TACO,
volume = "10",
number = "4",
pages = "26:1--26:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541233",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the emergence of multicore processors, various
aggressive execution models have been proposed to
exploit fine-grained thread-level parallelism, taking
advantage of the fast on-chip interconnection
communication. However, the aggressive nature of these
execution models often leads to excessive energy
consumption incommensurate to execution time reduction.
In the context of Thread-Level Speculation, we
demonstrated that on a same-ISA heterogeneous multicore
system, by dynamically deciding how on-chip resources
are utilized, speculative threads can achieve
performance gain in an energy-efficient way. Through a
systematic design space exploration, we built a
multicore architecture that integrates heterogeneous
components of processing cores and first-level caches.
To cope with processor reconfiguration overheads, we
introduced runtime mechanisms to mitigate their
impacts. To match program execution with the most
energy-efficient processor configuration, the system
was equipped with a dynamic resource allocation scheme
that characterizes program behaviors using novel
processor counters. We evaluated the proposed
heterogeneous system with a diverse set of benchmark
programs from the SPEC CPU2000 and CPU2006 suites.
Compared to the most efficient homogeneous TLS
implementation, we achieved similar performance but
consumed 18\% less energy. Compared to the most
efficient homogeneous uniprocessor running sequential
programs, we improved performance by 29\% and reduced
energy consumption by 3.6\%, which is a 42\%
improvement in energy-delay-squared product.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rolan:2013:VSC,
author = "Dyer Rol{\'a}n and Basilio B. Fraguela and Ram{\'o}n
Doallo",
title = "Virtually split cache: an efficient mechanism to
distribute instructions and data",
journal = j-TACO,
volume = "10",
number = "4",
pages = "27:1--27:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541234",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "First-level caches are usually split for both
instructions and data instead of unifying them in a
single cache. Although that approach eases the pipeline
design and provides a simple way to independently treat
data and instructions, its global hit rate is usually
smaller than that of a unified cache. Furthermore,
unified lower-level caches usually behave and process
memory requests disregarding whether they are data or
instruction requests. In this article, we propose a new
technique aimed at balancing the amount of space devoted
to instructions and data for optimizing set-associative
caches: the Virtually Split Cache or VSC. Our technique
combines the sharing of resources from unified
approaches with the bandwidth and parallelism that
split configurations provide, thus reducing power
consumption while not degrading performance. Our design
dynamically adjusts cache resources devoted to
instructions and data depending on their particular
demand. Two VSC designs are proposed in order to track
the instructions and data requirements. The Shadow Tag
VSC (ST-VSC) is based on shadow tags that store the
last evicted line related to data and instructions in
order to determine how well the cache would work with
one more way per set devoted to each kind. The Global
Selector VSC (GS-VSC) uses a saturation counter that is
updated every time a cache miss occurs under either an
instruction or a data request, applying a duel-like
mechanism. Experiments with a variable and a fixed
latency VSC show that ST-VSC and GS-VSC reduce on
average the cache hierarchy power consumption by 29\%
and 24\%, respectively, with respect to a standard
baseline. As for performance, while the fixed latency
designs virtually match the split baseline in a
single-core system, a variable latency ST-VSC and
GS-VSC increase the average IPC by 2.5\% and 2\%,
respectively. In multicore systems, even the slower
fixed latency ST-VSC and GS-VSC designs improve the
baseline IPC by 3.1\% and 2.5\%, respectively, in a
four-core system thanks to the reduction in the
bandwidth demanded from the lower cache levels. This is
in contrast with many techniques that trade performance
degradation for power consumption reduction. VSC
particularly benefits embedded processors with a single
level of cache, where up to an average 9.2\% IPC
improvement is achieved. Interestingly, we also find
that partitioning the LLC for instructions and data can
improve performance around 2\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Subramaniam:2013:UFC,
author = "Samantika Subramaniam and Simon C. Steely and Will
Hasenplaugh and Aamer Jaleel and Carl Beckmann and
Tryggve Fossum and Joel Emer",
title = "Using in-flight chains to build a scalable cache
coherence protocol",
journal = j-TACO,
volume = "10",
number = "4",
pages = "28:1--28:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541235",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As microprocessor designs integrate more cores,
scalability of cache coherence protocols becomes a
challenging problem. Most directory-based protocols
avoid races by using blocking tag directories that can
impact the performance of parallel applications. In
this article, we first quantitatively demonstrate that
state-of-the-art blocking protocols significantly
constrain throughput at large core counts for several
parallel applications. Nonblocking protocols address
this throughput concern at the expense of scalability
in the interconnection network or in the required
resource overheads. To address this concern, we enhance
nonblocking directory protocols by migrating the point
of service of responses. Our approach uses in-flight
chains of cores making parallel memory requests to
incorporate scalability while maintaining high
throughput. The proposed cache coherence protocol,
called chained cache coherence, can outperform blocking
protocols by up to 20\% on scientific and 12\% on
commercial applications. It also has low resource
overheads and simple address ordering requirements,
making it both a high-performance and scalable
protocol. Furthermore, in-flight chains provide a
scalable solution to building hierarchical and
nonblocking tag directories as well as optimize
communication latencies.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Sanchez:2013:MIP,
author = "Daniel S{\'a}nchez and Yiannakis Sazeides and Juan M.
Cebri{\'a}n and Jos{\'e} M. Garc{\'\i}a and Juan L.
Arag{\'o}n",
title = "Modeling the impact of permanent faults in caches",
journal = j-TACO,
volume = "10",
number = "4",
pages = "29:1--29:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541236",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The traditional performance and cost benefits we have
enjoyed for decades from technology scaling are
challenged by several critical constraints including
reliability. Increases in static and dynamic variations
are leading to higher probability of parametric and
wear-out failures and are elevating reliability into a
prime design constraint. In particular, SRAM cells used
to build caches that dominate the processor area are
usually minimum sized and more prone to failure. It is
therefore of paramount importance to develop effective
methodologies that facilitate the exploration of
reliability techniques for caches. To this end, we
present an analytical model that can determine for a
given cache configuration, address trace, and random
probability of permanent cell failure the exact
expected miss rate and its standard deviation when
blocks with faulty bits are disabled. What
distinguishes our model is that it is fully analytical,
it avoids the use of fault maps, and yet, it is both
exact and simpler than previous approaches. The
analytical model is used to produce the miss-rate
trends (expected miss-rate) for future technology
nodes for both uncorrelated and clustered faults. Some
of the key findings based on the proposed model are (i)
block disabling has a negligible impact on the expected
miss-rate unless the probability of failure is equal to
or greater than $2.6 \times 10^{-4}$, (ii) the fault map methodology can
accurately calculate the expected miss-rate as long as
1,000 to 10,000 fault maps are used, and (iii) the
expected miss-rate for execution of parallel
applications increases with the number of threads and
is more pronounced for a given probability of failure
as compared to sequential execution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Lee:2013:APF,
author = "Sanghoon Lee and James Tuck",
title = "Automatic parallelization of fine-grained
metafunctions on a chip multiprocessor",
journal = j-TACO,
volume = "10",
number = "4",
pages = "30:1--30:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541237",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Due to the importance of reliability and security,
prior studies have proposed inlining metafunctions into
applications for detecting bugs and security
vulnerabilities. However, because these software
techniques add frequent, fine-grained instrumentation
to programs, they often incur large runtime overheads.
In this work, we consider an automatic thread
extraction technique for removing these fine-grained
checks from a main application and scheduling them on
helper threads. In this way, we can leverage the
resources available on a CMP to reduce the latency and
overhead of fine-grained checking codes. Our
parallelization strategy extracts metafunctions from a
single threaded application and executes them in
customized helper threads: threads constructed to mirror
relevant fragments of the main program's behavior in
order to keep communication and overhead low. To get
good performance, we consider optimizations that reduce
communication and balance work among many threads. We
evaluate our parallelization strategy on Mudflap, a
pointer-use checking tool in GCC. To show the benefits
of our technique, we compare it to a manually
parallelized version of Mudflap. We run our experiments
on an architectural simulator with support for fast
queueing operations. On a subset of SPECint 2000, our
automatically parallelized code using static load
balance is only 19\% slower, on average, than the
manually parallelized version on a simulated eight-core
system. In addition, our automatically parallelized
code using dynamic load balance is competitive, on
average, to the manually parallelized version on a
simulated eight-core system. Furthermore, all the
applications except parser achieve better speedups with
our automatic algorithms than with the manual approach.
Also, our approach introduces very little overhead in
the main program: it is kept under 100\%, which is more
than a 5.3$ \times $ reduction compared to serial
Mudflap.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dubach:2013:DMA,
author = "Christophe Dubach and Timothy M. Jones and Edwin V.
Bonilla",
title = "Dynamic microarchitectural adaptation using machine
learning",
journal = j-TACO,
volume = "10",
number = "4",
pages = "31:1--31:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541238",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Adaptive microarchitectures are a promising solution
for designing high-performance, power-efficient
microprocessors. They offer the ability to tailor
computational resources to the specific requirements of
different programs or program phases. They have the
potential to adapt the hardware cost-effectively at
runtime to any application's needs. However, one of the
key challenges is how to dynamically determine the best
architecture configuration at any given time, for any
new workload. This article proposes a novel control
mechanism based on a predictive model for
microarchitectural adaptivity control. This model is
able to efficiently control adaptivity by monitoring
the behaviour of an application's different phases at
runtime. We show that by using this model on SPEC 2000,
we double the energy\slash performance efficiency of
the processor when compared to the best static
configuration tuned for the whole benchmark suite. This
represents 74\% of the improvement available if we know
the best microarchitecture for each program phase ahead
of time. In addition, we present an extended analysis
of the best configurations found and show that the
overheads associated with the implementation of our
scheme have a negligible impact on performance and
power.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2013:CME,
author = "Long Chen and Yanan Cao and Zhao Zhang",
title = "{E$^3$CC}: a memory error protection scheme with novel
address mapping for subranked and low-power memories",
journal = j-TACO,
volume = "10",
number = "4",
pages = "32:1--32:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2541239",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Dec 23 10:31:41 MST 2013",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This study presents and evaluates E$^3$CC (Enhanced
Embedded ECC), a full design and implementation of a
generic embedded ECC scheme that enables
power-efficient error protection for subranked memory
systems. It incorporates a novel address mapping scheme
called Biased Chinese Remainder Mapping (BCRM) to
resolve the address mapping issue for memories of page
interleaving, plus a simple and effective cache design
to reduce extra ECC traffic. Our evaluation using SPEC
CPU2006 benchmarks confirms the performance and power
efficiency of the E$^3$CC scheme for subranked
memories as well as conventional memories.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tian:2013:TBM,
author = "Yingying Tian and Samira M. Khan and Daniel A.
Jim{\'e}nez",
title = "Temporal-based multilevel correlating inclusive cache
replacement",
journal = j-TACO,
volume = "10",
number = "4",
pages = "33:1--33:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555290",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Inclusive caches have been widely used in Chip
Multiprocessors (CMPs) to simplify cache coherence.
However, they have poor performance compared with
noninclusive caches, not only because of the limited
capacity of the entire cache hierarchy but also because
they ignore the temporal locality of the Last-Level Cache
(LLC). Blocks that are highly referenced (referred to
as hot blocks) are always hit in higher-level caches
(e.g., L1 cache) and are rarely referenced in the LLC.
Therefore, they become replacement victims in the LLC.
Due to the inclusion property, blocks evicted from the
LLC have to also be invalidated from higher-level
caches. Invalidation of hot blocks from the entire
cache hierarchy introduces costly off-chip misses that
make the inclusive cache perform poorly. Neither
blocks that are highly referenced in the LLC nor blocks
that are highly referenced in higher-level caches
should be the LLC replacement victims. We propose
temporal-based multilevel correlating cache replacement
for inclusive caches to evict blocks in the LLC that
are also not hot in higher-level caches using
correlated temporal information acquired from all
levels of a cache hierarchy with minimal overhead.
Invalidation of these blocks does not hurt the
performance. By contrast, replacing them as early as
possible with useful blocks helps improve cache
performance. Based on our experiments, in a dual-core
CMP, an inclusive cache with temporal-based multilevel
correlating cache replacement significantly outperforms
an inclusive cache with traditional LRU replacement by
yielding an average speedup of 12.7\%, which is
comparable to an enhanced noninclusive cache, while
requiring less than 1\% of storage overhead.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Liu:2013:HSA,
author = "Qixiao Liu and Miquel Moreto and Victor Jimenez and
Jaume Abella and Francisco J. Cazorla and Mateo
Valero",
title = "Hardware support for accurate per-task energy metering
in multicore systems",
journal = j-TACO,
volume = "10",
number = "4",
pages = "34:1--34:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555291",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Accurately determining the energy consumed by each
task in a system will become of prominent importance in
future multicore-based systems because it offers
several benefits, including (i) better application
energy/performance optimizations, (ii) improved
energy-aware task scheduling, and (iii) energy-aware
billing in data centers. Unfortunately, existing
methods for energy metering in multicores fail to
provide accurate energy estimates for each task when
several tasks run simultaneously. This article makes a
case for accurate Per-Task Energy Metering (PTEM) based
on tracking the resource utilization and occupancy of
each task. Different hardware implementations with
different trade-offs between energy prediction accuracy
and hardware-implementation complexity are proposed.
Our evaluation shows that the energy consumed in a
multicore by each task can be accurately measured. For
a 32-core, 2-way simultaneous multithreaded core
setup, PTEM reduces the average accuracy error from
more than 12\% when our hardware support is not used to
less than 4\% when it is used. The maximum observed
error for any task in the workload we used reduces from
58\% down to 9\% when our hardware support is used.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Mehta:2013:TSS,
author = "Sanyam Mehta and Gautham Beeraka and Pen-Chung Yew",
title = "Tile size selection revisited",
journal = j-TACO,
volume = "10",
number = "4",
pages = "35:1--35:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555292",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Loop tiling is a widely used loop transformation to
enhance data locality and allow data reuse. In the
tiled code, however, tiles of different sizes can lead
to significant variation in performance. Thus,
selection of an optimal tile size is critical to
performance of tiled codes. In the past, tile size
selection has been attempted using both static
analytical and dynamic empirical (auto-tuning) models.
Past work using static models assumed a direct-mapped
cache for the purpose of analysis and thus proved to be
less robust. On the other hand, the auto-tuning models
involve an exhaustive search in a large space of tiled
codes. In this article, we propose a new analytical
model for tile size selection that leverages the high
set associativity in modern caches to minimize conflict
misses. Our tile size selection model targets data
reuse in multiple levels of cache. In addition, it
considers the interaction of tiling with the SIMD unit
in modern processors in estimating the optimal tile
size. We find that these factors, not considered in
previous models, are critical in developing a robust
model for tile size selection. We implement our tile
size selection model in a polyhedral compiler and test
it on 12 benchmark kernels using two different problem
sizes. Our model outperforms the previous analytical
models that are based on reusing data in a single level
of cache and achieves an average performance
improvement of 9.7\% and 20.4\%, respectively, over the
best square (cubic) tiles for the two problem sizes. In
addition, the tile size chosen by our tile size
selection algorithm is similar to the best performing
size obtained through an extensive search, validating
the analytical model underlying the algorithm.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Prisacari:2013:FPS,
author = "Bogdan Prisacari and German Rodriguez and Cyriel
Minkenberg and Torsten Hoefler",
title = "Fast pattern-specific routing for fat tree networks",
journal = j-TACO,
volume = "10",
number = "4",
pages = "36:1--36:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555293",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In the context of eXtended Generalized Fat Tree (XGFT)
topologies, widely used in HPC and datacenter network
designs, we propose a generic method, based on Integer
Linear Programming (ILP), to efficiently determine
optimal routes for arbitrary workloads. We propose a
novel approach that combines ILP with dynamic
programming, effectively reducing the time to solution.
Specifically, we divide the network into smaller
subdomains optimized using a custom ILP formulation
that ensures global optimality of local solutions.
Local solutions are then combined into an optimal
global solution using dynamic programming. Finally, we
demonstrate through a series of extensive benchmarks
that our approach scales in practice to networks
interconnecting several thousands of nodes, using a
single-threaded, freely available linear programming
solver on commodity hardware, with the potential for
higher scalability by means of commercial, parallel
solvers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Breughe:2013:SRB,
author = "Maximilien B. Breughe and Lieven Eeckhout",
title = "Selecting representative benchmark inputs for
exploring microprocessor design spaces",
journal = j-TACO,
volume = "10",
number = "4",
pages = "37:1--37:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555294",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The design process of a microprocessor requires
representative workloads to steer the search process
toward an optimum design point for the target
application domain. However, considering a broad set of
workloads to cover the large space of potential
workloads is infeasible given how time-consuming design
space exploration typically is. Hence, it is crucial to
select a small yet representative set of workloads,
which leads to a shorter design cycle while yielding a
(near) optimal design. Prior work has mostly looked
into selecting representative benchmarks; however,
limited attention was given to the selection of
benchmark inputs and how this affects workload
representativeness during design space exploration.
Using a set of 1,000 inputs for a number of embedded
benchmarks and a design space with around 1,700 design
points, we find that selecting a single or three random
input(s) per benchmark potentially (in a worst-case
scenario) leads to a suboptimal design that is 56\% and
33\% off, on average, relative to the optimal design in
our design space in terms of Energy-Delay Product
(EDP). We then propose and evaluate a number of methods
for selecting representative inputs and show that we
can find the optimum design point with as few as three
inputs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kerschbaumer:2013:IFT,
author = "Christoph Kerschbaumer and Eric Hennigan and Per
Larsen and Stefan Brunthaler and Michael Franz",
title = "Information flow tracking meets just-in-time
compilation",
journal = j-TACO,
volume = "10",
number = "4",
pages = "38:1--38:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555295",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Web applications are vulnerable to cross-site
scripting attacks that enable data thefts. Information
flow tracking in web browsers can prevent communication
of sensitive data to unintended recipients and thereby
stop such data thefts. Unfortunately, existing
solutions have focused on incorporating information
flow into browsers' JavaScript interpreters, rather
than just-in-time compilers, rendering the resulting
performance noncompetitive. Few users will switch to a
safer browser if it comes at the cost of significantly
degrading web application performance. We present the
first information flow tracking JavaScript engine that
is based on a true just-in-time compiler, and that
thereby outperforms all previous interpreter-based
information flow tracking JavaScript engines by more
than a factor of two. Our JIT-based engine (i) has the
same coverage as previous interpreter-based solutions,
(ii) requires reasonable implementation effort, and
(iii) introduces new optimizations to achieve
acceptable performance. When evaluated against three
industry-standard JavaScript benchmark suites, there is
still an average slowdown of 73\% over engines that do
not support information flow, but this is now well
within the range that many users will find an
acceptable price for obtaining substantially increased
security.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nasre:2013:TSE,
author = "Rupesh Nasre",
title = "Time- and space-efficient flow-sensitive points-to
analysis",
journal = j-TACO,
volume = "10",
number = "4",
pages = "39:1--39:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555296",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compilation of real-world programs often requires
hours. The term ``nightly build'', known to industrial
researchers, is an artifact of long compilation times.
Our goal is to reduce the absolute analysis times for
large C codes (of the order of millions of lines).
Pointer analysis is one of the key analyses performed
during compilation. Its scalability is paramount to
achieve the efficiency of the overall compilation
process and its precision directly affects that of the
client analyses. In this work, we design a time- and
space-efficient flow-sensitive pointer analysis and
parallelize it on graphics processing units. Our
analysis proposes to use an extended Bloom filter,
called multibloom, to store points-to information in an
approximate manner and develops an analysis in terms of
the operations over the multibloom. Since the Bloom
filter is a probabilistic data structure, we develop ways
to regain the analysis precision. We achieve effective
parallelization by achieving memory coalescing,
reducing thread divergence, and improving load balance
across GPU warps. Compared to a state-of-the-art
sequential solution, our parallel version achieves a
7.8$ \times $ speedup with less than 5\% precision
loss on a suite of six large programs. Using two client
transformations, we show that this loss in precision
only minimally affects a client's precision.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ruan:2013:BTB,
author = "Wenjia Ruan and Yujie Liu and Michael Spear",
title = "Boosting timestamp-based transactional memory by
exploiting hardware cycle counters",
journal = j-TACO,
volume = "10",
number = "4",
pages = "40:1--40:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555297",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Time-based transactional memories typically rely on a
shared memory counter to ensure consistency.
Unfortunately, such a counter can become a bottleneck.
In this article, we identify properties of hardware
cycle counters that allow their use in place of a
shared memory counter. We then devise algorithms that
exploit the x86 cycle counter to enable bottleneck-free
transactional memory runtime systems. We also consider
the impact of privatization safety and hardware
ordering constraints on the correctness, performance,
and generality of our algorithms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Dey:2013:RMD,
author = "Tanima Dey and Wei Wang and Jack W. Davidson and Mary
Lou Soffa",
title = "{ReSense}: Mapping dynamic workloads of colocated
multithreaded applications using resource sensitivity",
journal = j-TACO,
volume = "10",
number = "4",
pages = "41:1--41:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555298",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To utilize the full potential of modern chip
multiprocessors and obtain scalable performance
improvements, it is critical to mitigate resource
contention created by multithreaded workloads. In this
article, we describe ReSense, the first runtime system
that uses application characteristics to dynamically
map multithreaded applications from dynamic
workloads-workloads where multithreaded applications
arrive, execute, and terminate continuously in
unpredictable ways. ReSense mitigates contention for
the shared resources in the memory hierarchy by
applying a novel thread-mapping algorithm that
dynamically adjusts the mapping of threads from dynamic
workloads using a precalculated sensitivity score. The
sensitivity score quantifies an application's
sensitivity to sharing a particular memory resource and
is calculated by an efficient characterization process
that involves running the multithreaded application by
itself on the target platform. To measure ReSense's
effectiveness, sensitivity scores were determined for
21 benchmarks from PARSEC-2.1 and NPB-OMP-3.3 for the
shared resources in the memory hierarchy on four
different platforms. Using three different-sized
dynamic workloads composed of randomly selected two,
four, and eight corunning benchmarks with randomly
selected start times, ReSense was able to improve the
average response time of the three workloads by up to
27.03\%, 20.89\%, and 29.34\% and throughput by up to
19.97\%, 46.56\%, and 29.86\%, respectively, over the
native OS on real hardware. By estimating and comparing
ReSense's effectiveness with the optimal thread mapping
for two different workloads, we found that the maximum
average difference with the experimentally determined
optimal performance was 1.49\% for average response
time and 2.08\% for throughput.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Armejach:2013:TIP,
author = "Adri{\`a} Armejach and Ruben Titos-Gil and Anurag Negi
and Osman S. Unsal and Adri{\'a}n Cristal",
title = "Techniques to improve performance in requester-wins
hardware transactional memory",
journal = j-TACO,
volume = "10",
number = "4",
pages = "42:1--42:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555299",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The simplicity of requester-wins Hardware
Transactional Memory (HTM) makes it easy to incorporate
in existing chip multiprocessors. Hence, such systems
are expected to be widely available in the near future.
Unfortunately, these implementations are prone to
suffer severe performance degradation due to transient
and persistent livelock conditions. This article shows
that existing techniques are unable to mitigate this
degradation effectively. It then proposes and evaluates
four novel techniques (two software-based techniques
that employ information provided by the hardware and two
that require simple core-local hardware additions) which have
the potential to boost the performance of
requester-wins HTM designs substantially.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jeon:2013:RDR,
author = "Myeongjae Jeon and Conglong Li and Alan L. Cox and
Scott Rixner",
title = "Reducing {DRAM} row activations with eager read\slash
write clustering",
journal = j-TACO,
volume = "10",
number = "4",
pages = "43:1--43:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555300",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article describes and evaluates a new approach to
optimizing DRAM performance and energy consumption that
is based on eagerly writing dirty cache lines to DRAM.
Under this approach, many dirty cache lines are written
to DRAM before they are evicted. In particular, dirty
cache lines that have not been recently accessed are
eagerly written to DRAM when the corresponding row has
been activated by an ordinary, noneager access, such as
a read. This approach enables clustering of reads and
writes that target the same row, resulting in a
significant reduction in row activations. Specifically,
for a variety of applications, it reduces the number of
DRAM row activations by an average of 42\% and a
maximum of 82\%. Moreover, the results from a
full-system simulator show compelling performance
improvements and energy consumption reductions. Out of
23 applications, 6 have overall performance
improvements between 10\% and 20\%, and 3 have
improvements in excess of 20\%. Furthermore, 12 consume
between 10\% and 20\% less DRAM energy, and 7 have
energy consumption reductions in excess of 20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhao:2013:HPP,
author = "Zhijia Zhao and Michael Bebenita and Dave Herman and
Jianhua Sun and Xipeng Shen",
title = "{HPar}: a practical parallel parser for {HTML} ---
taming {HTML} complexities for parallel parsing",
journal = j-TACO,
volume = "10",
number = "4",
pages = "44:1--44:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555301",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Parallelizing HTML parsing is challenging due to the
complexities of HTML documents and the inherent
dependencies in its parsing algorithm. As a result,
despite numerous studies in parallel parsing, HTML
parsing remains sequential today. It forms one of the
final barriers for fully parallelizing browser
operations to minimize the browser's response time, an
important variable for user experiences, especially on
portable devices. This article provides a comprehensive
analysis on the special complexities of parallel HTML
parsing and presents a systematic exploration in
overcoming those difficulties through specially
designed speculative parallelizations. This work
develops, to the best of our knowledge, the first
pipelining and data-level parallel HTML parsers. The
data-level parallel parser, named HPar, achieves up to
2.4$ \times $ speedup on quadcore devices. This work
demonstrates the feasibility of efficient, parallel
HTML parsing for the first time and offers a set of
novel insights for parallel HTML parsing.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Totoni:2013:EFE,
author = "Ehsan Totoni and Mert Dikmen and Mar{\'\i}a Jes{\'u}s
Garzar{\'a}n",
title = "Easy, fast, and energy-efficient object detection on
heterogeneous on-chip architectures",
journal = j-TACO,
volume = "10",
number = "4",
pages = "45:1--45:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555302",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We optimize a visual object detection application
(that uses Vision Video Library kernels) and show that
OpenCL is a unified programming paradigm that can
provide high performance when running on the Ivy Bridge
heterogeneous on-chip architecture. We evaluate
different mapping techniques and show that running each
kernel where it fits the best and using software
pipelining can provide 1.91 times higher performance
and 42\% better energy efficiency. We also show how to
trade accuracy for energy at runtime. Overall, our
application can perform accurate object detection at 40
frames per second (fps) in an energy-efficient
manner.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fedorov:2013:AAL,
author = "Viacheslav V. Fedorov and Sheng Qiu and A. L.
Narasimha Reddy and Paul V. Gratz",
title = "{ARI}: Adaptive {LLC}-memory traffic management",
journal = j-TACO,
volume = "10",
number = "4",
pages = "46:1--46:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2543697",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Decreasing the traffic from the CPU LLC to main memory
is a very important issue in modern systems. Recent
work focuses on cache misses, overlooking the impact of
writebacks on the total memory traffic, energy
consumption, IPC, and so forth. Policies that foster a
balanced approach, between reducing write traffic to
memory and improving miss rates, can increase overall
performance and improve energy efficiency and memory
system lifetime for NVM memory technology, such as
phase-change memory (PCM). We propose Adaptive
Replacement and Insertion (ARI), an adaptive approach
to last-level CPU cache management, optimizing the two
parameters (miss rate and writeback rate)
simultaneously. Our specific focus is to reduce
writebacks as much as possible while maintaining or
improving the miss rate relative to the conventional LRU
replacement policy. ARI reduces LLC writebacks by 33\%,
on average, while also decreasing misses by 4.7\%, on
average. In a typical system, this boosts IPC by 4.9\%,
on average, while decreasing energy consumption by
8.9\%. These results are achieved with minimal hardware
overheads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gonzalez-Alvarez:2013:AAD,
author = "Cecilia Gonz{\'a}lez-{\'A}lvarez and Jennifer B.
Sartor and Carlos {\'A}lvarez and Daniel
Jim{\'e}nez-Gonz{\'a}lez and Lieven Eeckhout",
title = "Accelerating an application domain with specialized
functional units",
journal = j-TACO,
volume = "10",
number = "4",
pages = "47:1--47:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555303",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Hardware specialization has received renewed interest
recently as chips are hitting power limits. Chip
designers of traditional processor architectures have
primarily focused on general-purpose computing,
partially due to time-to-market pressure and simpler
design processes. But new power limits require some
chip specialization. Although hardware configured for a
specific application yields large speedups for
low-power dissipation, its design is more complex and
less reusable. We instead explore domain-based
specialization, a scalable approach that balances
hardware's reusability and performance efficiency. We
focus on specialization using customized compute units
that accelerate particular operations. In this article,
we develop automatic techniques to identify code
sequences from different applications within a domain
that can be targeted to a new custom instruction that
will be run inside a configurable specialized
functional unit (SFU). We demonstrate that using a
canonical representation of computations finds more
common code sequences among applications that can be
mapped to the same custom instruction, leading to
larger speedups while specializing a smaller core area
than previous pattern-matching techniques. We also
propose new heuristics to narrow the search space of
domain-specific custom instructions, finding those that
achieve the best performance across applications. We
estimate the overall performance achieved with our
automatic techniques using hardware models on a set of
nine media benchmarks, showing that when limiting the
core area devoted to specialization, the SFU
customization with the largest speedups includes both
application- and domain-specific custom instructions.
We demonstrate that exploring domain-specific hardware
acceleration is key to continued computing system
performance improvements.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
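
The canonical-representation step the abstract credits for finding more
shared code sequences can be illustrated with expression trees. A hedged
Python sketch follows; the tuple encoding, the commutativity set, and
the example sequences are assumptions for illustration only.

# Canonical hashing of candidate code sequences: commutative operands
# are sorted so equivalent sequences from different applications map to
# the same key (and hence the same candidate custom instruction).
COMMUTATIVE = {"add", "mul", "and", "or", "xor"}

def canon(node):
    """node = ("op", child, child) or a leaf string like "in0"."""
    if isinstance(node, str):
        return node
    op, *kids = node
    kids = [canon(k) for k in kids]
    if op in COMMUTATIVE:
        kids.sort(key=repr)              # deterministic operand order
    return (op, *kids)

def pattern_key(node):
    return hash(canon(node))

# Two syntactically different sequences from two applications:
a = ("add", ("mul", "in0", "in1"), "in2")
b = ("add", "in2", ("mul", "in1", "in0"))
assert pattern_key(a) == pattern_key(b)  # same custom instruction
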
@Article{Wang:2013:RMM,
author = "Xiaolin Wang and Lingmei Weng and Zhenlin Wang and
Yingwei Luo",
title = "Revisiting memory management on virtualized
environments",
journal = j-TACO,
volume = "10",
number = "4",
pages = "48:1--48:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555304",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the evolution of hardware, 64-bit Central
Processing Units (CPUs) and 64-bit Operating Systems
(OSs) have dominated the market. This article
investigates the performance of virtual memory
management of Virtual Machines (VMs) with a large
virtual address space in 64-bit OSs, which imposes
different pressure on memory virtualization than 32-bit
systems. Each of the two conventional memory
virtualization approaches, Shadowing Paging (SP) and
Hardware-Assisted Paging (HAP), causes different
overhead for different applications. Our experiments
show that 64-bit applications prefer to run in a VM
using SP, while 32-bit applications do not have a
uniform preference between SP and HAP. In this article,
we trace this inconsistency between 32-bit applications
and 64-bit applications to its root cause through a
systematic empirical study in Linux systems and
discover that the major overhead of SP results from
memory management in the 32-bit GNU C library (glibc).
We propose enhancements to the existing memory
management algorithms, which substantially reduce the
overhead of SP. Based on the evaluations using SPEC
CPU2006, Parsec 2.1, and cloud benchmarks, our results
show that SP, with the improved memory allocators, can
compete with HAP in almost all cases, in both 64-bit
and 32-bit systems. We conclude that without a
significant breakthrough in HAP, researchers should pay
more attention to SP, which is more flexible and cost
effective.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Jiang:2013:PAP,
author = "Chuntao Jiang and Zhibin Yu and Hai Jin and Chengzhong
Xu and Lieven Eeckhout and Wim Heirman and Trevor E.
Carlson and Xiaofei Liao",
title = "{PCantorSim}: Accelerating parallel architecture
simulation through fractal-based sampling",
journal = j-TACO,
volume = "10",
number = "4",
pages = "49:1--49:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555305",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Computer architects rely heavily on microarchitecture
simulation to evaluate design alternatives.
Unfortunately, cycle-accurate simulation is extremely
slow, being at least 4 to 6 orders of magnitude slower
than real hardware. This longstanding problem is
further exacerbated in the multi-/many-core era,
because single-threaded simulation performance has not
improved much, while the design space has expanded
substantially. Parallel simulation is a promising
approach, yet does not completely solve the simulation
challenge. Furthermore, existing sampling techniques,
which are widely used for single-threaded applications,
do not readily apply to multithreaded applications as
thread interaction and synchronization must now be
taken into account. This work presents PCantorSim, a
novel Cantor set (a classic fractal)--based sampling
scheme to accelerate parallel simulation of
multithreaded applications. Through the use of the
proposed methodology, less than 5\% of an
application's execution time is simulated in detail. We
have implemented our approach in Sniper (a parallel
multicore simulator) and evaluated it by running the
PARSEC benchmarks on a simulated 8-core system. The
results show that PCantorSim increases simulation speed
over detailed parallel simulation by a factor of 20$
\times $, on average, with an average absolute
execution time prediction error of 5.3\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
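
The Cantor-set construction that drives the sampling is simple to
reproduce: repeatedly drop the open middle third and keep the two ends.
A brief Python sketch, with recursion depth and region size chosen only
for illustration (the paper's interval selection and warm-up handling
are more involved):

def cantor_intervals(lo, hi, depth):
    """Return the 2**depth sub-intervals kept after `depth` rounds of
    removing the open middle third."""
    if depth == 0:
        return [(lo, hi)]
    third = (hi - lo) / 3.0
    return (cantor_intervals(lo, lo + third, depth - 1) +
            cantor_intervals(hi - third, hi, depth - 1))

total = 1_000_000                        # instructions in the region
ivals = cantor_intervals(0, total, 8)    # 256 detailed windows
detailed = sum(b - a for a, b in ivals)
print(f"{100 * detailed / total:.1f}% simulated in detail")

At depth 8 the kept intervals cover (2/3)^8, or about 3.9\%, of the
region, which is in the same range as the under-5\% detailed simulation
the abstract reports.
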
@Article{Stipic:2013:PGT,
author = "Srdan Stipi{\'c} and Vesna Smiljkovi{\'c} and Osman
Unsal and Adri{\'a}n Cristal and Mateo Valero",
title = "Profile-guided transaction coalescing --- lowering
transactional overheads by merging transactions",
journal = j-TACO,
volume = "10",
number = "4",
pages = "50:1--50:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555306",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/hash.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Previous studies in software transactional memory
mostly focused on reducing the overhead of
transactional read and write operations. In this
article, we introduce transaction coalescing, a
profile-guided compiler optimization technique that
attempts to reduce the overheads of starting and
committing a transaction by merging two or more small
transactions into one large transaction. We develop a
profiling tool and a transaction coalescing heuristic
to identify candidate transactions suitable for
coalescing. We implement a compiler extension to
automatically merge the candidate transactions at the
compile time. We evaluate the effectiveness of our
technique using the hash table micro-benchmark and the
STAMP benchmark suite. Transaction coalescing improves
the performance of the hash table significantly and the
performance of Vacation and SSCA2 benchmarks by 19.4\%
and 36.4\%, respectively, when running with 12
threads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
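
The profitability test behind coalescing reduces to comparing the saved
begin/commit overhead against the extra abort exposure of a larger
transaction. A hedged sketch of such a cost model; the constants and
the abort term are illustrative assumptions, not the paper's heuristic.

BEGIN_COMMIT_COST = 120     # cycles, assumed measured by the profiler

def worth_coalescing(tx_a_cycles, tx_b_cycles, conflict_rate_merged,
                     abort_penalty=1.0, threshold=0.0):
    separate = tx_a_cycles + tx_b_cycles + 2 * BEGIN_COMMIT_COST
    merged_work = tx_a_cycles + tx_b_cycles + BEGIN_COMMIT_COST
    # A larger transaction may abort more often; charge the expected
    # re-execution cost against the saved begin/commit pair.
    merged = merged_work * (1 + conflict_rate_merged * abort_penalty)
    return separate - merged > threshold

print(worth_coalescing(200, 150, conflict_rate_merged=0.05))  # True
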
@Article{Wang:2013:WWA,
author = "Zhe Wang and Shuchang Shan and Ting Cao and Junli Gu
and Yi Xu and Shuai Mu and Yuan Xie and Daniel A.
Jim{\'e}nez",
title = "{WADE}: Writeback-aware dynamic cache management for
{NVM}-based main memory system",
journal = j-TACO,
volume = "10",
number = "4",
pages = "51:1--51:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555307",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging Non-Volatile Memory (NVM) technologies are
explored as potential alternatives to traditional
SRAM/DRAM-based memory architecture in future
microprocessor design. One of the major disadvantages
for NVM is the latency and energy overhead associated
with write operations. Mitigation techniques to
minimize the write overhead for NVM-based main memory
architecture have been studied extensively. However,
most prior work focuses on optimization techniques for
NVM-based main memory itself, with little attention
paid to cache management policies for the Last-Level
Cache (LLC). In this article, we propose a
Writeback-Aware Dynamic CachE (WADE) management
technique to help mitigate the write overhead in
NVM-based memory. The proposal is based on the
observation that, when dirty cache blocks are evicted
from the LLC and written into NVM-based memory (with
PCM as an example), the long latency and high energy
associated with write operations to NVM-based memory
can cause system performance/power degradation. Thus,
reducing the number of writeback requests from the LLC
is critical. The proposed WADE cache management
technique tries to keep highly reused dirty cache
blocks in the LLC. The technique predicts blocks that
are frequently written back in the LLC. The LLC sets
are dynamically partitioned into a frequent writeback
list and a nonfrequent writeback list. It maintains the
best size for each list in the LLC. Our evaluation shows that
the technique can reduce the number of writeback
requests by 16.5\% for memory-intensive single-threaded
benchmarks and 10.8\% for multicore workloads. It
yields a geometric mean speedup of 5.1\% for
single-thread applications and 7.6\% for multicore
workloads. Due to the reduced number of writeback
requests to main memory, the technique reduces the
energy consumption by 8.1\% for single-thread
applications and 7.6\% for multicore workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Li:2013:CCC,
author = "Yong Li and Yaojun Zhang and Hai Li and Yiran Chen and
Alex K. Jones",
title = "{C1C}: a configurable, compiler-guided {STT-RAM L1}
cache",
journal = j-TACO,
volume = "10",
number = "4",
pages = "52:1--52:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555308",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Spin-Transfer Torque RAM (STT-RAM), a promising
alternative to SRAM for reducing leakage power
consumption, has been widely studied to mitigate the
impact of its asymmetrically long write latency.
Recently, STT-RAM has been proposed for L1 caches by
relaxing the data retention time to improve write
performance and dynamic energy. However, as the
technology scales down from 65nm to 22nm, the
performance of the read operation scales poorly due to
reduced sense margins and sense amplifier delays. In
this article, we leverage a dual-mode STT memory cell
to design a configurable L1 cache architecture termed
C1C to mitigate read performance barriers with
technology scaling. Guided by application access
characteristics discovered through novel compiler
analyses, the proposed cache adaptively switches
between a high-performance and a low-power access mode.
Our evaluation demonstrates that the proposed cache
with compiler guidance outperforms a state-of-the-art
STT-RAM cache design by 9\% with high dynamic energy
efficiency, leading to significant performance/watt
improvements over several competing approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fauzia:2013:BRD,
author = "Naznin Fauzia and Venmugil Elango and Mahesh
Ravishankar and J. Ramanujam and Fabrice Rastello and
Atanas Rountev and Louis-No{\"e}l Pouchet and P.
Sadayappan",
title = "Beyond reuse distance analysis: Dynamic analysis for
characterization of data locality potential",
journal = j-TACO,
volume = "10",
number = "4",
pages = "53:1--53:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555309",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging computer architectures will feature
drastically decreased flops/byte (ratio of peak
processing rate to memory bandwidth) as highlighted by
recent studies on Exascale architectural trends.
Further, flops are getting cheaper, while the energy
cost of data movement is increasingly dominant. The
understanding and characterization of data locality
properties of computations is critical in order to
guide efforts to enhance data locality. Reuse distance
analysis of memory address traces is a valuable tool to
perform data locality characterization of programs. A
single reuse distance analysis can be used to estimate
the number of cache misses in a fully associative LRU
cache of any size, thereby providing estimates on the
minimum bandwidth requirements at different levels of
the memory hierarchy to avoid being bandwidth bound.
However, such an analysis only holds for the particular
execution order that produced the trace. It cannot
estimate potential improvement in data locality through
dependence-preserving transformations that change the
execution schedule of the operations in the
computation. In this article, we develop a novel
dynamic analysis approach to characterize the inherent
locality properties of a computation and thereby assess
the potential for data locality enhancement via
dependence-preserving transformations. The execution
trace of a code is analyzed to extract a
Computational-Directed Acyclic Graph (CDAG) of the data
dependences. The CDAG is then partitioned into convex
subsets, and the convex partitioning is used to reorder
the operations in the execution trace to enhance data
locality. The approach enables us to go beyond reuse
distance analysis of a single specific order of
execution of the operations of a computation in
characterization of its data locality properties. It
can serve a valuable role in identifying promising code
regions for manual transformation, as well as assessing
the effectiveness of compiler transformations for data
locality enhancement. We demonstrate the effectiveness
of the approach using a number of benchmarks, including
case studies where the potential shown by the analysis
is exploited to achieve lower data movement costs and
better performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
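
The reuse distance baseline the article builds on is compact enough to
state in full: the distance of an access is the number of distinct
addresses touched since the previous access to the same address, and a
fully associative LRU cache of size C misses exactly on distances >= C.
A minimal Python sketch (quadratic for clarity; production tools use a
balanced tree):

def reuse_distances(trace):
    stack, dists = [], []          # stack[0] is most recently used
    for addr in trace:
        if addr in stack:
            d = stack.index(addr)  # distinct addresses since last use
            stack.remove(addr)
        else:
            d = float("inf")       # cold miss
        stack.insert(0, addr)
        dists.append(d)
    return dists

def misses(dists, cache_size):
    """Misses of a fully associative LRU cache with `cache_size` lines."""
    return sum(1 for d in dists if d >= cache_size)

trace = [1, 2, 3, 1, 2, 3, 4, 1]
ds = reuse_distances(trace)
print(ds)                 # [inf, inf, inf, 2, 2, 2, inf, 3]
print(misses(ds, 4))      # 4 -> only the cold misses remain
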
@Article{Bardizbanyan:2013:DPD,
author = "Alen Bardizbanyan and Magnus Sj{\"a}lander and David
Whalley and Per Larsson-Edefors",
title = "Designing a practical data filter cache to improve
both energy efficiency and performance",
journal = j-TACO,
volume = "10",
number = "4",
pages = "54:1--54:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555310",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Conventional Data Filter Cache (DFC) designs improve
processor energy efficiency, but degrade performance.
Furthermore, the single-cycle line transfer suggested
in prior studies adversely affects Level-1 Data Cache
(L1 DC) area and energy efficiency. We propose a
practical DFC that is accessed early in the pipeline
and transfers a line over multiple cycles. Our DFC
design improves performance and eliminates a
substantial fraction of L1 DC accesses for loads, L1 DC
tag checks on stores, and data translation lookaside
buffer accesses for both loads and stores. Our
evaluation shows that the proposed DFC can reduce the
data access energy by 42.5\% and improve execution time
by 4.2\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Hagiescu:2013:GCG,
author = "Andrei Hagiescu and Bing Liu and R. Ramanathan and
Sucheendra K. Palaniappan and Zheng Cui and Bipasa
Chattopadhyay and P. S. Thiagarajan and Weng-Fai Wong",
title = "{GPU} code generation for {ODE}-based applications
with phased shared-data access patterns",
journal = j-TACO,
volume = "10",
number = "4",
pages = "55:1--55:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555311",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present a novel code generation scheme for GPUs.
Its key feature is the platform-aware generation of a
heterogeneous pool of threads. This exposes more
data-sharing opportunities among the concurrent threads
and reduces the memory requirements that would
otherwise exceed the capacity of the on-chip memory.
Instead of the conventional strategy of focusing on
exposing as much parallelism as possible, our scheme
leverages on the phased nature of memory access
patterns found in many applications that exhibit
massive parallelism. We demonstrate the effectiveness
of our code generation strategy on a computational
systems biology application. This application consists
of computing a Dynamic Bayesian Network (DBN)
approximation of the dynamics of signalling pathways
described as a system of Ordinary Differential
Equations (ODEs). The approximation algorithm involves
(i) sampling many (of the order of a few million) times
from the set of initial states, (ii) generating
trajectories through numerical integration, and (iii)
storing the statistical properties of this set of
trajectories in Conditional Probability Tables (CPTs)
of a DBN via a prespecified discretization of the time
and value domains. The trajectories can be computed in
parallel. However, the intermediate data needed for
computing them, as well as the entries for the CPTs,
are too large to be stored locally. Our experiments
show that the proposed code generation scheme scales
well, achieving significant performance improvements on
three realistic signalling pathway models. These
results suggest how our scheme could be extended to
deal with other applications involving systems of
ODEs.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
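
The three-step pipeline in the abstract (sample initial states,
integrate, histogram into CPTs) can be mocked up in a few vectorized
lines. The ODE, step count, and bin edges below are illustrative
assumptions; the sketch shows only the data flow, not the paper's GPU
thread layout.

import numpy as np

rng = np.random.default_rng(0)
n, steps, dt = 100_000, 50, 0.1
x = rng.uniform(0.0, 1.0, n)            # (i) sample initial states

bins = np.linspace(0.0, 1.5, 16)        # value-domain discretization
counts = np.zeros((steps, bins.size - 1), dtype=np.int64)

for t in range(steps):                  # (ii) Euler integration, vectorized
    x = x + dt * (x * (1.0 - x))        # logistic kinetics as a stand-in
    counts[t] += np.histogram(x, bins)[0]   # (iii) CPT-style counts

probs = counts / n                      # empirical P(value bin | time)
print(probs[-1].round(3))
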
@Article{Lee:2013:TLS,
author = "Junghee Lee and Chrysostomos Nicopoulos and Hyung Gyu
Lee and Jongman Kim",
title = "{TornadoNoC}: a lightweight and scalable on-chip
network architecture for the many-core era",
journal = j-TACO,
volume = "10",
number = "4",
pages = "56:1--56:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555312",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The rapid emergence of Chip Multi-Processors (CMP) as
the de facto microprocessor archetype has highlighted
the importance of scalable and efficient on-chip
networks. Packet-based Networks-on-Chip (NoC) are
gradually cementing themselves as the medium of choice
for the multi-/many-core systems of the near future,
due to their innate scalability. However, the
prominence of the debilitating power wall requires the
NoC to also be as energy efficient as possible. To
achieve these two antipodal requirements --- scalability
and energy efficiency --- we propose TornadoNoC, an
interconnect architecture that employs a novel flow
control mechanism. To prevent livelocks and deadlocks,
a sequence numbering scheme and a dynamic ring
inflation technique are proposed, and their correctness
formally proven. The primary objective of TornadoNoC is
to achieve substantial gains in (a) scalability to
many-core systems and (b) the area/power footprint, as
compared to current state-of-the-art router
implementations. The new router is demonstrated to
provide better scalability to hundreds of cores than an
ideal single-cycle wormhole implementation and other
scalability-enhanced low-cost routers. Extensive
simulations using both synthetic traffic patterns and
real applications running in a full-system simulator
corroborate the efficacy of the proposed design.
Finally, hardware synthesis analysis using commercial
65nm standard-cell libraries indicates that the area
and power budgets of the new router are reduced by up
to 53\% and 58\%, respectively, as compared to existing
state-of-the-art low-cost routers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Strydis:2013:SAP,
author = "Christos Strydis and Robert M. Seepers and Pedro
Peris-Lopez and Dimitrios Siskos and Ioannis Sourdis",
title = "A system architecture, processor, and communication
protocol for secure implants",
journal = j-TACO,
volume = "10",
number = "4",
pages = "57:1--57:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555313",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Secure and energy-efficient communication between
Implantable Medical Devices (IMDs) and authorized
external users is attracting increasing attention these
days. However, there currently exists no systematic
approach to the problem, while solutions from
neighboring fields, such as wireless sensor networks,
are not directly transferable due to the peculiarities
of the IMD domain. This work describes an original,
efficient solution for secure IMD communication. A new
implant system architecture is proposed, where security
and main-implant functionality are completely
decoupled by running the tasks on two separate cores.
Wireless communication goes through a custom security
ASIP, called SISC (Smart-Implant Security Core), which
runs an energy-efficient security protocol. The
security core is powered by RF-harvested energy until
it performs external-reader authentication, providing
an elegant defense mechanism against battery
Denial-of-Service (DoS) and other, more common attacks.
The system has been evaluated based on a realistic case
study involving an artificial pancreas implant. When
synthesized for a UMC 90nm CMOS ASIC technology, our
system architecture achieves defense against
unauthorized accesses at zero energy cost, running
entity authentication by harvesting only 7.45 $\mu$J of
RF energy from the requesting entity. In all
other successfully authenticated accesses, our
architecture achieves secure data exchange without
affecting the performance of the main IMD
functionality, adding less than 1 per mille (1.3 mJ) to the
daily energy consumption of a typical implant. Compared
to a single-core, secure reference IMD, which would
still be more vulnerable to some types of attacks, our
secure system on chip (SoC) achieves high security
levels at 56\% energy savings and at an area overhead
of less than 15\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
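
The authenticate-before-spending-battery idea is, at its core, a
challenge-response exchange. A hedged Python sketch using HMAC; key
management, message framing, and the SISC protocol details are
assumptions here, and the harvested-energy gating itself cannot be
expressed in software.

import hmac, hashlib, os

KEY = os.urandom(16)                    # shared implant/reader secret

def implant_challenge():
    return os.urandom(8)                # fresh nonce per attempt

def reader_response(key, challenge):
    return hmac.new(key, challenge, hashlib.sha256).digest()

def implant_verify(key, challenge, response):
    expected = hmac.new(key, challenge, hashlib.sha256).digest()
    return hmac.compare_digest(expected, response)  # constant-time compare

c = implant_challenge()
assert implant_verify(KEY, c, reader_response(KEY, c))
assert not implant_verify(KEY, c, os.urandom(32))   # unauthorized reader
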
@Article{Kim:2013:FMS,
author = "Wonsub Kim and Yoonseo Choi and Haewoo Park",
title = "Fast modulo scheduler utilizing patternized routes for
coarse-grained reconfigurable architectures",
journal = j-TACO,
volume = "10",
number = "4",
pages = "58:1--58:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555314",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Coarse-Grained Reconfigurable Architectures (CGRAs)
present a potential of high compute throughput with
energy efficiency. A CGRA consists of an array of
Functional Units (FUs), which communicate with each
other through an interconnect network containing
transmission nodes and register files. To achieve high
performance from the software solutions mapped onto
CGRAs, modulo scheduling of loops is generally
employed. One of the key challenges in modulo
scheduling for CGRAs is to explicitly handle the routing
of operands from source to destination operations
through various routing resources. Existing modulo
schedulers for CGRAs are slow because finding a valid
routing is generally a searching problem over a large
space, even with the guidance of well-defined cost
metrics. Applications in traditional embedded
multimedia domains are regarded as relatively tolerant
to a slow compile time in exchange for a high-quality
solution. However, many rapidly growing domains of
applications, such as 3D graphics, require a fast
compilation. Entrances of CGRAs to these domains have
been blocked mainly due to their long compile time. We
attack this problem by utilizing patternized routes,
for which the resources and time slots for a successful
routing can be estimated in advance when a source
operation is placed.
By conservatively reserving predefined resources at
predefined time slots, future routings originating from
the source operation are guaranteed. Experiments on a
real-world 3D graphics benchmark suite show that our
scheduler improves the compile time up to 6,000 times
while achieving, on average, 70\% of the throughput of the
state-of-the-art CGRA modulo scheduler, the
Edge-centric Modulo Scheduler (EMS).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
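
Modulo scheduling's kernel is the modulo reservation table: an
operation placed at time t occupies its resource at slot t mod II, and
the scheduler bumps II until everything fits. A toy Python sketch on a
one-resource-class machine; dependence handling and the paper's
patternized route reservation are reduced to the bare minimum.

def modulo_schedule(ops, deps, n_fus, ii):
    """ops: op ids in topological order; deps: {op: [producers]}."""
    mrt = [0] * ii                      # modulo reservation table: FUs used
    time = {}
    for op in ops:
        earliest = max((time[p] + 1 for p in deps.get(op, [])), default=0)
        for t in range(earliest, earliest + ii):
            if mrt[t % ii] < n_fus:     # a free FU at this modulo slot?
                mrt[t % ii] += 1
                time[op] = t
                break
        else:
            return None                 # no slot at this II; caller bumps II
    return time

ops = ["a", "b", "c", "d"]
deps = {"c": ["a", "b"], "d": ["c"]}
ii = 2
while (sched := modulo_schedule(ops, deps, n_fus=1, ii=ii)) is None:
    ii += 1
print(ii, sched)    # II=4: four ops share a single FU
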
@Article{Nuzman:2013:JTC,
author = "Dorit Nuzman and Revital Eres and Sergei Dyshel and
Marcel Zalmanovici and Jose Castanos",
title = "{JIT} technology with {C\slash C++}: Feedback-directed
dynamic recompilation for statically compiled
languages",
journal = j-TACO,
volume = "10",
number = "4",
pages = "59:1--59:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555315",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The growing gap between the advanced capabilities of
static compilers as reflected in benchmarking results
and the actual performance that users experience in
real-life scenarios makes client-side dynamic
optimization technologies imperative to the domain of
static languages. Dynamic optimization of software
distributed in the form of a platform-agnostic
Intermediate-Representation (IR) has been very
successful in the domain of managed languages, greatly
improving upon interpreted code, especially when online
profiling is used. However, can such feedback-directed
IR-based dynamic code generation be viable in the
domain of statically compiled, rather than interpreted,
languages? We show that fat binaries, which combine the
IR together with the statically compiled executable,
can provide a practical solution for software vendors,
allowing their software to be dynamically optimized
without the limitation of binary-level approaches,
which lack the high-level IR of the program, and
without the warm-up costs associated with the IR-only
software distribution approach. We describe and
evaluate the fat-binary-based runtime compilation
approach using SPECint2006, demonstrating that the
overheads it incurs are low enough to be successfully
surmounted by dynamic optimization. Building on Java
JIT technologies, our results already improve upon
common real-world usage scenarios, including very small
workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ramashekar:2013:ADA,
author = "Thejas Ramashekar and Uday Bondhugula",
title = "Automatic data allocation and buffer management for
multi-{GPU} machines",
journal = j-TACO,
volume = "10",
number = "4",
pages = "60:1--60:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2544100",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Multi-GPU machines are being increasingly used in
high-performance computing. Each GPU in such a machine
has its own memory and does not share the address space
either with the host CPU or other GPUs. Hence,
applications utilizing multiple GPUs have to manually
allocate and manage data on each GPU. Existing works
that propose to automate data allocations for GPUs have
limitations and inefficiencies in terms of allocation
sizes, exploiting reuse, transfer costs, and
scalability. We propose a scalable and fully automatic
data allocation and buffer management scheme for affine
loop nests on multi-GPU machines. We call it the
Bounding-Box-based Memory Manager (BBMM). BBMM can
perform, at runtime, standard set operations like
union, intersection, and difference, and find subset and
superset relations on hyperrectangular regions of array
data (bounding boxes). It uses these operations along
with some compiler assistance to identify, allocate,
and manage data required by applications in terms of
disjoint bounding boxes. This allows it to (1) allocate
exactly or nearly as much data as is required by
computations running on each GPU, (2) efficiently track
buffer allocations and hence maximize data reuse across
tiles and minimize data transfer overhead, and (3),
as a result, maximize utilization of the combined
memory on multi-GPU machines. BBMM can work with any
choice of parallelizing transformations, computation
placement, and scheduling schemes, whether static or
dynamic. Experiments run on a four-GPU machine with
various scientific programs showed that BBMM reduces
data allocations on each GPU by up to 75\% compared to
current allocation schemes, yields performance of at
least 88\% of manually written code, and allows
excellent weak scaling.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
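
The bounding-box set operations at the heart of BBMM are plain interval
arithmetic per dimension. A hedged sketch with half-open per-dimension
ranges; the difference operation, which disjoint allocation relies on,
splits the remainder into at most 2d slabs. Function names and the tile
example are illustrative.

def intersect(a, b):
    box = [(max(al, bl), min(ah, bh)) for (al, ah), (bl, bh) in zip(a, b)]
    return box if all(lo < hi for lo, hi in box) else None

def contains(a, b):                      # is a a superset of b?
    return all(al <= bl and bh <= ah for (al, ah), (bl, bh) in zip(a, b))

def difference(a, b):
    """Cover the part of a not covered by b with disjoint boxes."""
    if intersect(a, b) is None:
        return [a]
    out, rest = [], list(a)
    for d, ((al, ah), (bl, bh)) in enumerate(zip(a, b)):
        if al < bl:                      # slab of a below b in dim d
            out.append(rest[:d] + [(al, bl)] + rest[d + 1:])
        if bh < ah:                      # slab of a above b in dim d
            out.append(rest[:d] + [(bh, ah)] + rest[d + 1:])
        rest[d] = (max(al, bl), min(ah, bh))
    return out

tile = [(0, 64), (0, 64)]                # already-resident region
need = [(32, 96), (16, 80)]              # region the next tile touches
print(difference(need, tile))            # only the parts left to transfer
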
@Article{Vandierendonck:2013:ADT,
author = "Hans Vandierendonck and George Tzenakis and Dimitrios
S. Nikolopoulos",
title = "Analysis of dependence tracking algorithms for task
dataflow execution",
journal = j-TACO,
volume = "10",
number = "4",
pages = "61:1--61:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555316",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Processor architecture has taken a turn toward
many-core processors, which integrate multiple
processing cores on a single chip to increase overall
performance, and there are no signs that this trend
will stop in the near future. Many-core processors are
harder to program than multicore and single-core
processors due to the need for writing parallel or
concurrent programs with high degrees of parallelism.
Moreover, many-cores have to operate in a mode of
strong scaling because of memory bandwidth constraints.
In strong scaling, increasingly finer-grain parallelism
must be extracted in order to keep all processing cores
busy. Task dataflow programming models have a high
potential to simplify parallel programming because they
relieve the programmer of precisely identifying all
intertask dependences when writing programs. Instead,
the task dataflow runtime system detects and enforces
intertask dependences during execution based on the
description of memory accessed by each task. The
runtime constructs a task dataflow graph that captures
all tasks and their dependences. Tasks are scheduled to
execute in parallel, taking into account dependences
specified in the task graph. Several papers report
significant overheads for task dataflow systems, which
severely limit the scalability and usability of such
systems. In this article, we study efficient schemes to
manage task graphs and analyze their scalability. We
assume a programming model that supports input, output,
and in/out annotations on task arguments, as well as
commutative in/out and reductions. We analyze the
structure of task graphs and identify versions and
generations as key concepts for efficient management of
task graphs. Then, we present three schemes to manage
task graphs building on graph representations,
hypergraphs, and lists. We also consider a fourth
edgeless scheme that synchronizes tasks using integers.
Analysis using microbenchmarks shows that the graph
representation is not always scalable and that the
edgeless scheme introduces the least overhead in nearly all
situations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
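
The edgeless scheme's integer synchronization can be pictured as
per-object writer tickets: a reader may start once the ticket count it
observed at submission has retired. A hedged Python sketch; WAR/WAW
ordering, reader counting, and the commutative/reduction annotations
are simplified away.

class Obj:
    def __init__(self):
        self.last_writer_done = 0   # ticket of the last retired writer
        self.next_ticket = 0        # writer tickets issued so far

def submit(ins, outs, objs):
    waits = [(o, objs[o].next_ticket) for o in ins]
    for o in outs:                  # this task becomes the next writer
        objs[o].next_ticket += 1
    return {"waits": waits, "outs": outs}

def ready(task, objs):
    """A task may start once every input's writer ticket has retired."""
    return all(objs[o].last_writer_done >= t for o, t in task["waits"])

def retire(task, objs):
    for o in task["outs"]:
        objs[o].last_writer_done += 1

objs = {"x": Obj(), "y": Obj()}
t1 = submit(ins=[], outs=["x"], objs=objs)      # writes x
t2 = submit(ins=["x"], outs=["y"], objs=objs)   # reads x, writes y
print(ready(t2, objs))   # False: t1 has not retired yet
retire(t1, objs)
print(ready(t2, objs))   # True
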
@Article{Jeong:2013:EET,
author = "Yeonghun Jeong and Seongseok Seo and Jongeun Lee",
title = "Evaluator-executor transformation for efficient
pipelining of loops with conditionals",
journal = j-TACO,
volume = "10",
number = "4",
pages = "62:1--62:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555317",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Control divergence poses many problems in
parallelizing loops. While predicated execution is
commonly used to convert control dependence into data
dependence, it often incurs high overhead because it
allocates resources equally for both branches of a
conditional statement regardless of their execution
frequencies. For those loops with unbalanced
conditionals, we propose a software transformation that
divides a loop into two or three smaller loops so that
the condition is evaluated only in the first loop,
while the less frequent branch is executed in the
second loop in a way that is much more efficient than
in the original loop. To reduce the overhead of extra
data transfer caused by the loop fission, we also
present a hardware extension for a class of
Coarse-Grained Reconfigurable Architectures (CGRAs).
Our experiments using MiBench and computer vision
benchmarks on a CGRA demonstrate that our techniques
can improve the performance of loops over predicated
execution by up to 65\% (37.5\%, on average), when the
hardware extension is enabled. Without any hardware
modification, our software-only version can improve
performance by up to 64\% (33\%, on average), while
simultaneously reducing the energy consumption of the
entire CGRA including configuration and data memory by
22\%, on average.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
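
In software alone, the evaluator-executor split looks like the
following Python sketch: the first loop evaluates the condition and
handles the common case, and the second replays only the recorded rare
iterations. The functions and the rarity predicate are illustrative.

def common(x):  return x * 2
def rare(x):    return x ** 3 - x       # infrequent, expensive branch

def fused(data):                        # original loop with divergence
    return [rare(x) if x < 0 else common(x) for x in data]

def fissioned(data):
    out = [0] * len(data)
    rare_idx = []
    for i, x in enumerate(data):        # evaluator: condition + common case
        if x < 0:
            rare_idx.append(i)
        else:
            out[i] = common(x)
    for i in rare_idx:                  # executor: only the rare iterations
        out[i] = rare(data[i])
    return out

data = [3, -1, 4, 1, -5, 9]
assert fused(data) == fissioned(data)
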
@Article{Barik:2013:DNS,
author = "Rajkishore Barik and Jisheng Zhao and Vivek Sarkar",
title = "A decoupled non-{SSA} global register allocation using
bipartite liveness graphs",
journal = j-TACO,
volume = "10",
number = "4",
pages = "63:1--63:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2544101",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Register allocation is an essential optimization for
all compilers. A number of sophisticated register
allocation algorithms have been developed over the
years. The two fundamental classes of register
allocation algorithms used in modern compilers are
based on Graph Coloring (GC) and Linear Scan (LS).
However, these two algorithms have fundamental
limitations in terms of precision. For example, the key
data structure used in GC-based algorithms, the
interference graph, lacks information on the program
points at which two variables may interfere. The
LS-based algorithms make local decisions regarding
spilling, and thereby trade off global optimization for
reduced compile-time and space overheads. Recently,
researchers have proposed Static Single Assignment
(SSA)-based decoupled register allocation algorithms
that exploit the live-range split points of the SSA
representation to optimally solve the spilling problem.
However, SSA-based register allocation often requires
extra complexity in repairing register assignments
during SSA elimination and in addressing architectural
constraints such as aliasing and ABI encoding; this
extra overhead can be prohibitively expensive in
dynamic compilation contexts. This article proposes a
decoupled non-SSA--based global register allocation
algorithm for dynamic compilation. It addresses the
limitations in current algorithms by introducing a
Bipartite Liveness Graph (BLG)-based register
allocation algorithm that models the spilling phase as
an optimization problem on the BLG itself and the
assignment phase as a separate optimization problem.
Advanced register allocation optimizations such as move
coalescing, live-range splitting, and register class
handling are also performed along with the spilling and
assignment phases. In the presence of register classes,
we propose a bucket-based greedy heuristic for
assignment that strikes a balance between spill-cost
and register class constraints. We present experimental
evaluation of our BLG-based register allocation
algorithm and compare it with production-quality
register allocators in Jikes RVM and LLVM.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "63",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gavin:2013:RIF,
author = "Peter Gavin and David Whalley and Magnus
Sj{\"a}lander",
title = "Reducing instruction fetch energy in multi-issue
processors",
journal = j-TACO,
volume = "10",
number = "4",
pages = "64:1--64:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2541228.2555318",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Thu Jan 9 10:42:35 MST 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The need to minimize power while maximizing
performance has led to recent developments of powerful
superscalar designs targeted at embedded and portable
use. Instruction fetch is responsible for a significant
fraction of microprocessor power and energy, and is
therefore an attractive target for architectural power
optimization. We present novel techniques that take
advantage of guarantees so that the instruction
translation lookaside buffer, branch target buffer, and
branch prediction buffer can frequently be disabled,
reducing their energy usage, while simultaneously
reducing branch predictor contention. These techniques
require no changes to the instruction set and can
easily be integrated into most single- and
multiple-issue processors.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "64",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
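
The gating guarantee can be approximated as: if the next fetch provably
stays in the same page (or line), the ITLB (or BTB) lookup can be
skipped. A counting sketch in Python, with page/line sizes and the
sequential-fetch condition as illustrative assumptions; the paper's
guarantees are richer than this.

PAGE, LINE = 4096, 32

def fetch_stream(pcs):
    last_page = last_line = None
    tlb_lookups = btb_lookups = 0
    for pc in pcs:
        if pc // PAGE != last_page:     # page changed: ITLB lookup needed
            tlb_lookups += 1
            last_page = pc // PAGE
        if pc // LINE != last_line:     # new line: BTB probe needed
            btb_lookups += 1
            last_line = pc // LINE
        # else: provably same page and line; both structures stay disabled
    return tlb_lookups, btb_lookups

pcs = list(range(0x1000, 0x1100, 4))    # 64 sequential fetches
print(fetch_stream(pcs))                # (1, 8): most lookups gated off
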
@Article{Anonymous:2013:LDR,
author = "Anonymous",
title = "List of distinguished reviewers {ACM TACO}",
journal = j-TACO,
volume = "10",
number = "4",
pages = "65:1--65:??",
month = dec,
year = "2013",
CODEN = "????",
DOI = "https://doi.org/10.1145/2560216",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:44 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "65",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Goel:2014:SPR,
author = "Neeraj Goel and Anshul Kumar and Preeti Ranjan Panda",
title = "Shared-port register file architecture for low-energy
{VLIW} processors",
journal = j-TACO,
volume = "11",
number = "1",
pages = "1:1--1:32",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2533397",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We propose a reduced-port Register File (RF)
architecture for reducing RF energy in a VLIW
processor. With port reduction, RF ports need to be
shared among Function Units (FUs), which may lead to
access conflicts, and thus, reduced performance. Our
solution includes (i) a carefully designed RF-FU
interconnection network that permits port sharing with
minimum conflicts and without any delay/energy
overheads, and (ii) a novel scheduling and binding
algorithm that reduces the performance penalty. With
our solution, we observed as much as 83\% RF energy
savings with no more than a 10\% loss in performance
for a set of MediaBench and MiBench benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "1",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2014:IPD,
author = "Zheng Wang and Georgios Tournavitis and Bj{\"o}rn
Franke and Michael F. P. O'Boyle",
title = "Integrating profile-driven parallelism detection and
machine-learning-based mapping",
journal = j-TACO,
volume = "11",
number = "1",
pages = "2:1--2:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579561",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compiler-based auto-parallelization is a much-studied
area but has yet to find widespread application. This
is largely due to the poor identification and
exploitation of application parallelism, resulting in
disappointing performance far below that which a
skilled expert programmer could achieve. We have
identified two weaknesses in traditional parallelizing
compilers and propose a novel, integrated approach
resulting in significant performance improvements of
the generated parallel code. Using profile-driven
parallelism detection, we overcome the limitations of
static analysis, enabling the identification of more
application parallelism, and only rely on the user for
final approval. We then replace the traditional
target-specific and inflexible mapping heuristics with
a machine-learning-based prediction mechanism,
resulting in better mapping decisions while automating
adaptation to different target architectures. We have
evaluated our parallelization strategy on the NAS and
SPEC CPU2000 benchmarks and two different multicore
platforms (dual quad-core Intel Xeon SMP and
dual-socket QS20 Cell blade). We demonstrate that our
approach not only yields significant improvements when
compared with state-of-the-art parallelizing compilers
but also comes close to and sometimes exceeds the
performance of manually parallelized codes. On average,
our methodology achieves 96\% of the performance of the
hand-tuned OpenMP NAS and SPEC parallel benchmarks on
the Intel Xeon platform and gains a significant speedup
for the IBM Cell platform, demonstrating the potential
of profile-guided and machine-learning-based
parallelization for complex multicore platforms.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "2",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Samadi:2014:LGU,
author = "Mehrzad Samadi and Amir Hormati and Janghaeng Lee and
Scott Mahlke",
title = "Leveraging {GPUs} using cooperative loop speculation",
journal = j-TACO,
volume = "11",
number = "1",
pages = "3:1--3:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579617",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/pvm.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Graphics processing units, or GPUs, provide TFLOPs of
additional performance potential in commodity computer
systems that frequently go unused by most applications.
Even with the emergence of languages such as CUDA and
OpenCL, programming GPUs remains a difficult challenge
for a variety of reasons, including the inherent
algorithmic characteristics and data structure choices
used by applications as well as the tedious performance
optimization cycle that is necessary to achieve high
performance. The goal of this work is to increase the
applicability of GPUs beyond CUDA/OpenCL to implicitly
data-parallel applications written in C/C++ using
speculative parallelization. To achieve this goal, we
propose Paragon: a static/dynamic compiler platform to
speculatively run possibly data-parallel portions of
sequential applications on the GPU while cooperating
with the system CPU. For such loops, Paragon utilizes
the GPU in an opportunistic way while orchestrating a
cooperative relation between the CPU and GPU to reduce
the overhead of misspeculations. Paragon monitors the
dependencies for the loops running speculatively on the
GPU and nonspeculatively on the CPU using a lightweight
distributed conflict detection designed specifically
for GPUs, and transfers the execution to the CPU in
case a conflict is detected. Paragon resumes the
execution on the GPU after the CPU resolves the
dependency. Our experiments show that Paragon achieves
4x on average and up to 30x speedup compared to unsafe
CPU execution with four threads and 7x on average and
up to 64x speedup versus sequential execution across a
set of sequential but implicitly data-parallel
applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "3",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
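
Speculation-plus-validation of a possibly-parallel loop fits in a few
lines: run all iterations from a snapshot, then check in original order
that no iteration read a location an earlier iteration wrote, falling
back to sequential execution on conflict. In this hedged sketch,
threads stand in for the GPU and the detector is far cruder than the
paper's distributed scheme.

from concurrent.futures import ThreadPoolExecutor

def sequential(a, idx):                 # the safe CPU fallback
    for i in range(len(a)):
        a[i] = a[idx[i]] + 1
    return a

def speculative(a, idx):
    snap = list(a)
    with ThreadPoolExecutor() as pool:  # all iterations from a snapshot
        vals = list(pool.map(lambda i: snap[idx[i]] + 1, range(len(a))))
    # Iteration i read slot idx[i]; the read was stale if an earlier
    # iteration j < i wrote that slot (iteration j writes slot j here).
    if any(idx[i] < i for i in range(len(a))):
        return sequential(a, idx)       # misspeculation: fall back
    a[:] = vals
    return a

print(speculative([0, 0, 0], [2, 2, 2]))   # parallel-safe: [1, 1, 1]
print(speculative([0, 0, 0], [0, 0, 1]))   # dependent: falls back, [1, 2, 3]
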
@Article{Wang:2014:EAC,
author = "Jue Wang and Xiangyu Dong and Yuan Xie and Norman P.
Jouppi",
title = "Endurance-aware cache line management for non-volatile
caches",
journal = j-TACO,
volume = "11",
number = "1",
pages = "4:1--4:24",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579671",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Nonvolatile memories (NVMs) have the potential to
replace low-level SRAM or eDRAM on-chip caches because
NVMs save standby power and provide large cache
capacity. However, limited write endurance is a common
problem for NVM technologies, and today's cache
management might result in unbalanced cache write
traffic, causing heavily written cache blocks to fail
much earlier than others. Although wear-leveling
techniques for NVM-based main memories exist, we cannot
simply apply them to NVM-based caches. This is because
cache writes have intraset variations as well as
interset variations, while writes to main memories only
have interset variations. To solve this problem, we
propose i$^2$WAP, a new cache management policy that
can reduce both inter- and intraset write variations.
i$^2$WAP has two features: Swap-Shift, an enhancement
based on existing main memory wear leveling to reduce
cache interset write variations, and Probabilistic Set
Line Flush, a novel technique to reduce cache intraset
write variations. Implementing i$^2$WAP only needs two
global counters and two global registers. In one of our
studies, i$^2$WAP can improve the NVM cache lifetime
by 75\% on average and up to 224\%. We also validate
that i$^2$WAP is effective in systems with different
cache configurations and workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "4",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
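
Probabilistic Set Line Flush can be illustrated with a one-line wear
model: a hot line absorbs every write to its set unless, with small
probability, it is flushed so the refill lands on another way. The
flush probability and the rotate-on-refill behavior below are
illustrative assumptions, not the paper's exact mechanism.

import random

def simulate(n_writes, ways=8, flush_p=0.0, seed=1):
    rng = random.Random(seed)
    resident = 0                    # way currently holding the hot line
    wear = [0] * ways
    for _ in range(n_writes):
        wear[resident] += 1         # the hot line absorbs every write
        if rng.random() < flush_p:  # probabilistic set line flush:
            resident = (resident + 1) % ways   # refill lands elsewhere
    return wear

print(simulate(100_000))                 # all wear piles on one way
print(simulate(100_000, flush_p=1/64))   # wear spread across the set
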
@Article{Liu:2014:BBS,
author = "Lei Liu and Zehan Cui and Yong Li and Yungang Bao and
Mingyu Chen and Chengyong Wu",
title = "{{BPM\slash BPM+}}: Software-based dynamic memory
partitioning mechanisms for mitigating {DRAM}
bank-\slash channel-level interferences in multicore
systems",
journal = j-TACO,
volume = "11",
number = "1",
pages = "5:1--5:28",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579672",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The main memory system is a shared resource in modern
multicore machines that can result in serious
interference leading to reduced throughput and
unfairness. Many new memory scheduling mechanisms have
been proposed to address the interference problem.
However, these mechanisms usually employ relative
complex scheduling logic and need modifications to
Memory Controllers (MCs), which incur expensive
hardware design and manufacturing overheads. This
article presents a practical software approach to
effectively eliminate the interference without any
hardware modifications. The key idea is to modify the
OS memory management system and adopt a
page-coloring-based Bank-level Partitioning Mechanism
(BPM) that allocates dedicated DRAM banks to each core
(or thread). By using BPM, memory requests from
distinct programs are segregated across multiple memory
banks to promote locality/fairness and reduce
interference. We further extend BPM to BPM+ by
incorporating channel-level partitioning, on which we
demonstrate additional gain over BPM in many cases. To
achieve benefits in the presence of diverse application
memory needs and avoid performance degradation due to
resource underutilization, we propose a dynamic
mechanism upon BPM/BPM+ that assigns appropriate
bank/channel resources based on application
memory/bandwidth demands monitored through PMU
(performance-monitoring unit) and a low-overhead OS
page table scanning process. We implement BPM/BPM+ in
Linux 2.6.32.15 kernel and evaluate the technique on
four-core and eight-core real machines by running a
large amount of randomly generated multiprogrammed and
multithreaded workloads. Experimental results show that
BPM/BPM+ can improve the overall system throughput by
4.7\%/5.9\% on average (up to 8.6\%/9.5\%) and reduce
the unfairness by an average of 4.2\%/6.1\% (up to
15.8\%/13.9\%).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "5",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
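
Page-coloring bank partitioning hinges on one observation: some
bank-index bits fall inside the physical page frame number, so the OS
can hand each core only pages whose colors map to that core's private
banks. A hedged Python sketch; the bit positions, bank count, and
per-core split are illustrative assumptions.

BANK_BITS, BANK_SHIFT = 4, 6     # 16 banks; bank index inside the PFN

def bank_of(pfn):
    return (pfn >> BANK_SHIFT) & ((1 << BANK_BITS) - 1)

def alloc_page(core, free_pages, banks_per_core=4):
    """Hand `core` a page that maps to one of its private banks."""
    allowed = range(core * banks_per_core, (core + 1) * banks_per_core)
    for pfn in free_pages:
        if bank_of(pfn) in allowed:
            free_pages.remove(pfn)
            return pfn
    raise MemoryError("no free page with this core's colors")

free = list(range(4096))
p0 = alloc_page(0, free)    # lands in banks 0-3
p1 = alloc_page(1, free)    # lands in banks 4-7
print(bank_of(p0), bank_of(p1))
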
@Article{Haubl:2014:TTE,
author = "Christian H{\"a}ubl and Christian Wimmer and Hanspeter
M{\"o}ssenb{\"o}ck",
title = "Trace transitioning and exception handling in a
trace-based {JIT} compiler for {Java}",
journal = j-TACO,
volume = "11",
number = "1",
pages = "6:1--6:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579673",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Trace-based Just-In-Time (JIT) compilation generates
machine code for frequently executed paths (so-called
traces) instead of whole methods. While this has
several advantages, it complicates invocation of
compiled traces as well as exception handling, so that
previous trace-based compilers limited the way in which
traces could be invoked. We present a significantly
enhanced trace-based compiler where arbitrary
transitions between interpreted and compiled traces are
possible. For that, we introduce suitable trace calling
conventions and extend exception handling to work both
within traces and across trace boundaries. Furthermore,
we use the recorded trace information for optimizations
and combine the tracing ideas with ideas from
partial-method compilation to avoid code bloat. An
extensive evaluation with the benchmark suites DaCapo
9.12 Bach and SPECjvm2008 shows that our trace-based
compiler achieves up to 59\% higher peak performance
than the method-based Java HotSpot client compiler. On
a few benchmarks, our fairly simple trace-based
compiler shows a higher peak performance than the Java
HotSpot server compiler, which is one of today's best
optimizing JIT compilers for Java.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "6",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Huang:2014:HHH,
author = "Yongbing Huang and Licheng Chen and Zehan Cui and Yuan
Ruan and Yungang Bao and Mingyu Chen and Ninghui Sun",
title = "{HMTT}: a hybrid hardware\slash software tracing
system for bridging the {DRAM} access trace's semantic
gap",
journal = j-TACO,
volume = "11",
number = "1",
pages = "7:1--7:25",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579668",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "DRAM access traces (i.e., off-chip memory references)
can be extremely valuable for the design of memory
subsystems and performance tuning of software. Hardware
snooping on the off-chip memory interface is an
effective and nonintrusive approach to monitoring and
collecting real-life DRAM accesses. However, compared
with software-based approaches, hardware snooping
approaches typically lack semantic information, such as
process/function/object identifiers, virtual addresses,
and lock contexts, that is essential to the complete
understanding of the systems and software under
investigation. In this article, we propose a hybrid
hardware/software mechanism that is able to collect
off-chip memory reference traces with semantic
information. We have designed and implemented a
prototype system called HMTT (Hybrid Memory Trace
Tool), which uses a custom-made DIMM connector to
collect off-chip memory references and a high-level
event-encoding scheme to correlate semantic information
with memory references. In addition to providing
complete, undistorted DRAM access traces, the proposed
system is also able to perform various types of
low-overhead profiling, such as object-relative
accesses and multithread lock accesses.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "7",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Chen:2014:AWA,
author = "Quan Chen and Minyi Guo",
title = "Adaptive workload-aware task scheduling for
single-{ISA} asymmetric multicore architectures",
journal = j-TACO,
volume = "11",
number = "1",
pages = "8:1--8:25",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579674",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Single-ISA Asymmetric Multicore (AMC) architectures
have shown high performance as well as power
efficiency. However, current parallel programming
environments do not perform well on AMC because they
are designed for symmetric multicore architectures in
which all cores provide equal performance. Their random
task scheduling policies can result in unbalanced
workloads in AMC and severely degrade the performance
of parallel applications. To balance the workloads of
parallel applications in AMC, this article proposes an
adaptive Workload-Aware Task Scheduler (WATS) that
consists of a history-based task allocator and a
preference-based task scheduler. The history-based task
allocator is based on a near-optimal, static task
allocation using the historical statistics collected
during the execution of a parallel application. The
preference-based task scheduler, which schedules tasks
based on a preference list, can dynamically adjust the
workloads in AMC if the task allocation is less optimal
due to approximation in the history-based task
allocator. Experimental results show that WATS can
improve both the performance and energy efficiency of
task-based applications, with performance gains of up
to 66.1\% compared with traditional task schedulers.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "8",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Savrun-Yeniceri:2014:EHI,
author = "G{\"u}lfem Savrun-Yeni{\c{c}}eri and Wei Zhang and
Huahan Zhang and Eric Seckler and Chen Li and Stefan
Brunthaler and Per Larsen and Michael Franz",
title = "Efficient hosted interpreters on the {JVM}",
journal = j-TACO,
volume = "11",
number = "1",
pages = "9:1--9:24",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2532642",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:08:33 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/java2010.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/2532642",
abstract = "Many guest languages are implemented using the Java
Virtual Machine (JVM) as a host environment. There are
two major implementation choices: custom compilers and
so-called hosted interpreters. Custom compilers are
complex to build but offer good performance. Hosted
interpreters are comparatively simpler to implement but
until now have suffered from poor performance.\par
We studied the performance of hosted interpreters and
identified common bottlenecks preventing their
efficient execution. First, similar to interpreters
written in C/C++, instruction dispatch is expensive on
the JVM. Second, Java's semantics require expensive
runtime exception checks that negatively affect array
performance essential to interpreters.\par
We present two optimizations targeting these
bottlenecks and show that the performance of optimized
interpreters increases dramatically: we report speedups
by a factor of up to 2.45 over the Jython interpreter,
3.57 over the Rhino interpreter, and 2.52 over the
JRuby interpreter. The resulting
performance is comparable with that of custom
compilers. Our optimizations are enabled by a few
simple annotations that require only modest
implementation effort; in return, performance increases
substantially.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "9",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Nair:2014:RPD,
author = "Prashant J. Nair and Chia-Chen Chou and Moinuddin K.
Qureshi",
title = "Refresh pausing in {DRAM} memory systems",
journal = j-TACO,
volume = "11",
number = "1",
pages = "10:1--10:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579669",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:08:33 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/2579669",
abstract = "Dynamic Random Access Memory (DRAM) cells rely on
periodic refresh operations to maintain data integrity.
As the capacity of DRAM memories has increased, so has
the amount of time consumed in doing refresh. Refresh
operations contend with read \ldots{}",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "10",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
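%%% A worked sketch of the refresh overhead discussed in the abstract
%%% above, under standard JEDEC DDRx timing assumptions (notation
%%% ours, not the authors'): with one refresh command issued every
%%% refresh interval $t_{REFI}$ and each command blocking the rank for
%%% the refresh cycle time $t_{RFC}$, the fraction of rank time lost
%%% to refresh is roughly
%%% $$\mathrm{overhead} \approx t_{RFC}/t_{REFI};$$
%%% for example, $t_{RFC} = 350\,$ns and $t_{REFI} = 7.8\,\mu$s give
%%% about 4.5\%, and the ratio grows with device density because
%%% $t_{RFC}$ rises while $t_{REFI}$ stays fixed.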
@Article{Jothi:2014:TCF,
author = "Komal Jothi and Haitham Akkary",
title = "Tuning the continual flow pipeline architecture with
virtual register renaming",
journal = j-TACO,
volume = "11",
number = "1",
pages = "11:1--11:27",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579675",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Continual Flow Pipelines (CFPs) allow a processor core
to process hundreds of in-flight instructions without
increasing cycle-critical pipeline resources. When a
load misses the data cache, CFP checkpoints the
processor register state and then moves all
miss-dependent instructions into a low-complexity
waiting buffer (WB) to unblock the pipeline. Meanwhile,
miss-independent
instructions execute normally and update the processor
state. When the miss data return, CFP replays the
miss-dependent instructions from the WB and then merges
the miss-dependent and miss-independent execution
results. CFP was initially proposed for cache misses to
DRAM. Later work focused on reducing the execution
overhead of CFP by avoiding the pipeline flush before
replaying miss-dependent instructions and executing
dependent and independent instructions concurrently.
The goal of these improvements was to gain performance
by applying CFP to L1 data cache misses that hit the
last-level on-chip cache. However, many applications or
execution phases incur excessive amounts of replay
and/or rollbacks to the checkpoint, which frequently
cancels the benefits of CFP and reduces performance. In
this article, we improve the CFP
architecture by using a novel virtual register renaming
substrate and by tuning the replay policies to mitigate
excessive replays and rollbacks to the checkpoint. We
describe these new design optimizations and show, using
Spec 2006 benchmarks and microarchitecture performance
and power models of our design, that our Tuned-CFP
architecture improves performance and reduces energy
consumption relative to previous CFP architectures by
approximately 10\% and 8\%, respectively. We also
demonstrate that our
proposed architecture gives better performance return
on energy per instruction compared to a conventional
superscalar as well as previous CFP architectures.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "11",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Carle:2014:PAM,
author = "Thomas Carle and Dumitru Potop-Butucaru",
title = "Predicate-aware, makespan-preserving software
pipelining of scheduling tables",
journal = j-TACO,
volume = "11",
number = "1",
pages = "12:1--12:26",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579676",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:08:33 MDT 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
URL = "https://dl.acm.org/doi/abs/10.1145/2579676",
abstract = "We propose a software pipelining technique adapted to
specific hard real-time scheduling problems. Our
technique optimizes both computation throughput and
execution cycle makespan, with makespan taking
priority. It also takes advantage of the \ldots{}
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "12",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kritikakou:2014:SNO,
author = "Angeliki Kritikakou and Francky Catthoor and Vasilios
Kelefouras and Costas Goutis",
title = "A scalable and near-optimal representation of access
schemes for memory management",
journal = j-TACO,
volume = "11",
number = "1",
pages = "13:1--13:25",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2579677",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Memory management searches for the resources required
to store the concurrently alive elements. The solution
quality is affected by the representation of the
element accesses: a sub-optimal representation leads to
overestimation and a non-scalable representation
increases the exploration time. We propose a
methodology for a near-optimal and scalable
representation of regular and irregular accesses. The
representation
consists of a set of pattern entries to compactly
describe the behavior of the memory accesses and of
pattern operations to consistently combine the pattern
entries. The result is a final sequence of pattern
entries which represents the global access scheme
without unnecessary overestimation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "13",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Leather:2014:AFG,
author = "Hugh Leather and Edwin Bonilla and Michael O'Boyle",
title = "Automatic feature generation for machine
learning--based optimising compilation",
journal = j-TACO,
volume = "11",
number = "1",
pages = "14:1--14:32",
month = feb,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2536688",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Fri Mar 14 17:30:52 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent work has shown that machine learning can
automate and in some cases outperform handcrafted
compiler optimisations. Central to such an approach is
that machine learning techniques typically rely upon
summaries or features of the program. The quality of
these features is critical to the accuracy of the
resulting machine learned algorithm; no machine
learning method will work well with poorly chosen
features. However, due to the size and complexity of
programs, theoretically there are an infinite number of
potential features to choose from. The compiler writer
now has to expend effort in choosing the best features
from this space. This article develops a novel
mechanism to automatically find those features that
most improve the quality of the machine learned
heuristic. The feature space is described by a grammar
and is then searched with genetic programming and
predictive modelling. We apply this technique to loop
unrolling in GCC 4.3.1 and evaluate our approach on a
Pentium 6. On a benchmark suite of 57 programs, GCC's
hard-coded heuristic achieves only 3\% of the maximum
performance available, whereas a state-of-the-art
machine learning approach with hand-coded features
obtains 59\%. Our feature generation technique is able
to achieve 76\% of the maximum available speedup,
outperforming existing approaches.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "14",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kluter:2014:VWL,
author = "Theo Kluter and Samuel Burri and Philip Brisk and
Edoardo Charbon and Paolo Ienne",
title = "Virtual Ways: Low-Cost Coherence for Instruction Set
Extensions with Architecturally Visible Storage",
journal = j-TACO,
volume = "11",
number = "2",
pages = "15:1--15:26",
month = jul,
year = "2014",
DOI = "https://doi.org/10.1145/2576877",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:13:09 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Instruction set extensions (ISEs) improve the
performance and energy consumption of
application-specific processors. ISEs can use
architecturally visible storage (AVS), localized
compiler-controlled memories, to provide higher I/O
bandwidth than reading data from the processor
pipeline. AVS creates coherence and consistency
problems with the data cache. Although a hardware
coherence protocol could solve the problem, this
approach is costly for a single-processor system. As a
low-cost alternative, we introduce Virtual Ways, which
ensures coherence through a reduced form of inclusion
between the data cache and AVS. Virtual Ways achieve
higher performance and lower energy consumption than
using a hardware coherence protocol.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "15",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ren:2014:POE,
author = "Bin Ren and Todd Mytkowicz and Gagan Agrawal",
title = "A Portable Optimization Engine for Accelerating
Irregular Data-Traversal Applications on {SIMD}
Architectures",
journal = j-TACO,
volume = "11",
number = "2",
pages = "16:1--16:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632215",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Fine-grained data parallelism is increasingly common
in the form of longer vectors integrated with
mainstream processors (SSE, AVX) and various GPU
architectures. This article develops support for
exploiting such data parallelism for a class of
nonnumeric, nongraphic applications, which perform
computations while traversing many independent,
irregular data structures. We address this problem by
developing several novel techniques. First, for code
generation, we develop an intermediate language for
specifying such traversals, followed by a runtime
scheduler that maps traversals to various SIMD units.
Second, we observe that good data locality is crucial
to sustained performance from SIMD architectures,
whereas many applications that operate on irregular
data structures (e.g., trees and graphs) have poor data
locality. To address this challenge, we develop a set
of data layout optimizations that improve spatial
locality for applications that traverse many irregular
data structures. Unlike prior data layout
optimizations, our approach incorporates a notion of
both interthread and intrathread spatial reuse into
data layout. Finally, we enable performance portability
(i.e., the ability to automatically optimize
applications for different architectures) by accurately
modeling the impact of inter- and intrathread locality
on program performance. As a consequence, our model can
predict which data layout optimization to use on a wide
variety of SIMD architectures. To demonstrate the
efficacy of our approach and optimizations, we first
show how they enable up to a 12X speedup on one SIMD
architecture for a set of real-world applications. To
demonstrate that our approach enables performance
portability, we show how our model predicts the optimal
layout for applications across a diverse set of three
real-world SIMD architectures, which offers as much as
a 45\% speedup over a suboptimal solution.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "16",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Qi:2014:VVG,
author = "Zhengwei Qi and Jianguo Yao and Chao Zhang and Miao Yu
and Zhizhou Yang and Haibing Guan",
title = "{VGRIS}: Virtualized {GPU} Resource Isolation and
Scheduling in Cloud Gaming",
journal = j-TACO,
volume = "11",
number = "2",
pages = "17:1--17:25",
month = jul,
year = "2014",
DOI = "https://doi.org/10.1145/2632216",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:16:31 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "To achieve efficient resource management on a graphics
processing unit (GPU), a framework for scheduling
virtualized resources in cloud gaming is needed. In
this article, we propose VGRIS, a resource
management framework for virtualized GPU resource
isolation and scheduling in cloud gaming. A set of
application programming interfaces (APIs) is provided
so that a variety of scheduling algorithms can be
implemented within the framework without modifying the
framework itself. Three scheduling algorithms are
implemented by the APIs within VGRIS. Experimental
results show that VGRIS can effectively schedule GPU
resources among various workloads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "17",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Shen:2014:RSB,
author = "Bor-Yeh Shen and Wei-Chung Hsu and Wuu Yang",
title = "A Retargetable Static Binary Translator for the {ARM}
Architecture",
journal = j-TACO,
volume = "11",
number = "2",
pages = "18:1--18:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629335",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Machines designed with a new but incompatible
Instruction Set Architecture (ISA) may lack proper
applications. Binary translation can address this
incompatibility by migrating applications from one
legacy ISA to a new one, although binary translation
has problems such as code discovery for variable-length
ISAs and code location issues for handling indirect
branches. Dynamic Binary Translation (DBT) has been
widely adopted for migrating applications since it
avoids those problems. Static Binary Translation (SBT)
is a less general solution and has not been actively
researched. However, SBT performs more aggressive
optimizations, which could yield more compact code and
better code quality. Applications translated by SBT can
consume less memory, fewer processor cycles, and less
power than with DBT, and can start more quickly. These
advantages
are even more critical for embedded systems than for
general systems. In this article, we designed and
implemented a new SBT tool, called LLBT, which
translates ARM instructions into LLVM IRs and then
retargets the LLVM IRs to various ISAs, including x86,
x86-64, ARM, and MIPS. LLBT leverages two important
functionalities from LLVM: comprehensive optimizations
and retargetability. More importantly, LLBT solves the
code discovery problem for ARM/Thumb binaries without
resorting to interpretation. LLBT also effectively
reduced the size of the address mapping table, making
SBT a viable solution for embedded systems. Our
experiments based on the EEMBC benchmark suite show
that the LLBT-generated code can run more than $ 6
\times $ and $ 2.3 \times $ faster on average than
emulation with QEMU and HQEMU, respectively.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "18",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gracia:2014:RLN,
author = "Dar{\'\i}o Su{\'a}rez Gracia and Alexandra
Ferrer{\'o}n and Luis Montesano {Del Campo} and Teresa
Monreal Arnal and V{\'\i}ctor Vi{\~n}als Y{\'u}fera",
title = "Revisiting {LP--NUCA} Energy Consumption: Cache Access
Policies and Adaptive Block Dropping",
journal = j-TACO,
volume = "11",
number = "2",
pages = "19:1--19:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2632217",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Cache working-set adaptation is key as embedded
systems move to multiprocessor and Simultaneous
Multithreaded Architectures (SMT) because interthread
pollution harms system performance and battery life.
Light-Power NUCA (LP-NUCA) is a working-set adaptive
cache that relies on temporal locality to save energy.
This work identifies the sources of energy waste in
LP-NUCAs: parallel access to the tag and data arrays of
the tiles and low locality phases with useless block
migration. To counteract both issues, we prove that
switching to serial access reduces energy without
harming performance and propose a machine learning
Adaptive Drop Rate (ADR) controller that minimizes the
amount of replacement and migration when locality is
low. This work demonstrates that these techniques
efficiently adapt the cache drop and access policies to
save energy. They reduce LP-NUCA energy consumption by
22.7\% for
1SMT. With interthread cache contention in 2SMT, the
savings rise to 29\%. Versus a conventional
organization, energy--delay improves 20.8\% and 25\%
for 1- and 2SMT benchmarks, and, in 65\% of the 2SMT
mixes, gains are larger than 20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "19",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
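%%% For the energy--delay figures quoted in the abstract above, recall
%%% the conventional definition (our gloss, not the authors'): the
%%% energy--delay product of a run is
%%% $$ED = E \times t,$$
%%% the consumed energy times the execution time, so a 20.8\%
%%% energy--delay improvement means this product fell by that fraction
%%% relative to the conventional cache organization.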
@Article{Liang:2014:DCC,
author = "Zhibin Liang and Wei Zhang and Yung-Cheng Ma",
title = "Deadline-Constrained Clustered Scheduling for {VLIW}
Architectures using Power-Gated Register Files",
journal = j-TACO,
volume = "11",
number = "2",
pages = "20:1--20:26",
month = jul,
year = "2014",
DOI = "https://doi.org/10.1145/2632218",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Tue Mar 10 08:18:32 2020",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Designing energy-efficient Digital Signal Processor
(DSP) cores has become a key concern in embedded
systems development. This paper proposes an
energy-proportional computing scheme for Very Long
Instruction Word (VLIW) architectures. To make
processor power scale with the adapted parallelism, we
propose incorporating distributed Power-Gated Register
Files (PGRF) into VLIW to achieve a PGRF-VLIW
architecture. For energy efficiency, we also propose an
instruction scheduling algorithm called the
Deadline-Constrained Clustered Scheduling (DCCS)
algorithm. The algorithm clusters the data dependence
graph to reduce data transfer energy and makes optimal
use of low-powered local registers for tree-structured
data dependence graphs. The results of evaluations
conducted using the MiBench and DSPstone benchmark
suites substantiate the expected power saving and
scaling effects.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "20",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fang:2014:PPA,
author = "Shuangde Fang and Zidong Du and Yuntan Fang and
Yuanjie Huang and Yang Chen and Lieven Eeckhout and
Olivier Temam and Huawei Li and Yunji Chen and
Chengyong Wu",
title = "Performance Portability Across Heterogeneous {SoCs}
Using a Generalized Library-Based Approach",
journal = j-TACO,
volume = "11",
number = "2",
pages = "21:1--21:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2608253",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jun 30 19:02:49 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Because of tight power and energy constraints,
industry is progressively shifting toward heterogeneous
system-on-chip (SoC) architectures composed of a mix of
general-purpose cores along with a number of
accelerators. However, such SoC architectures can be
very challenging to efficiently program for the vast
majority of programmers, due to numerous programming
approaches and languages. Libraries, on the other hand,
provide a simple way to let programmers take advantage
of complex architectures, which does not require
programmers to acquire new accelerator-specific or
domain-specific languages. Increasingly, library-based,
also called algorithm-centric, programming approaches
propose to generalize the usage of libraries and to
compose programs around these libraries, instead of
using libraries as mere complements. In this article,
we present a software framework for achieving
performance portability by leveraging a generalized
library-based approach. Inspired by the notion of a
component, as employed in software engineering and
HW/SW codesign, we advocate that nonexpert programmers
write simple wrapper code around existing libraries to
provide simple but necessary semantic information to
the runtime. To achieve performance portability, the
runtime employs machine learning (simulated annealing)
to select the most appropriate accelerator and its
parameters for a given algorithm. This selection
factors in the possibly complex composition of
algorithms used in the application, the communication
among the various accelerators, and the tradeoff
between different objectives (i.e., accuracy,
performance, and energy). Using a set of benchmarks run
on a real heterogeneous SoC composed of a multicore
processor and a GPU, we show that the runtime overhead
is fairly small at 5.1\% for the GPU and 6.4\% for the
multicore. We then apply our accelerator selection
approach to a simulated SoC platform containing
multiple inexact accelerators. We show that accelerator
selection together with hardware parameter tuning
achieves an average 46.2\% energy reduction and a
speedup of 2.1$ \times $ while meeting the desired
application error target.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "21",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kaitoua:2014:HED,
author = "Abdulrahman Kaitoua and Hazem Hajj and Mazen A. R.
Saghir and Hassan Artail and Haitham Akkary and
Mariette Awad and Mageda Sharafeddine and Khaleel
Mershad",
title = "{Hadoop} Extensions for Distributed Computing on
Reconfigurable Active {SSD} Clusters",
journal = j-TACO,
volume = "11",
number = "2",
pages = "22:1--22:??",
month = jun,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2608199",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:18 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we propose new extensions to Hadoop
to enable clusters of reconfigurable active solid-state
drives (RASSDs) to process streaming data from SSDs
using FPGAs. We also develop an analytical model to
estimate the performance of RASSD clusters running
under Hadoop. Using the Hadoop RASSD platform and
network simulators, we validate our design and
demonstrate its impact on performance for different
workloads taken from Stanford's Phoenix MapReduce
project. Our results show that for a hardware
acceleration factor of 20$ \times $, compute-intensive
workloads processing 153 MB of data can run up to 11$
\times $ faster than a standard Hadoop cluster.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "22",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2014:PSR,
author = "Jue Wang and Xiangyu Dong and Yuan Xie",
title = "Preventing {STT-RAM} Last-Level Caches from Port
Obstruction",
journal = j-TACO,
volume = "11",
number = "3",
pages = "23:1--23:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2633046",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many new nonvolatile memory (NVM) technologies have
been heavily studied to replace the power-hungry
SRAM/DRAM-based memory hierarchy in today's computers.
Among various emerging NVM technologies, Spin-Transfer
Torque RAM (STT-RAM) has many benefits, such as fast
read latency, low leakage power, and high density,
making it a promising candidate for last-level caches
(LLCs). However, the STT-RAM write operation is
expensive. In particular, a long STT-RAM cache write
operation might obstruct other cache accesses and
result in severe performance degradation. Consequently,
how to mitigate STT-RAM write overhead is critical to
the success of STT-RAM adoption. In this article, we
propose an obstruction-aware cache management policy
called OAP. OAP monitors cache traffic, detects
LLC-obstructive processes, and differentiates the cache
accesses from different processes. Our experiment on a
four-core architecture with an 8MB STT-RAM L3 cache
shows a 14\% performance improvement and 64\% energy
reduction.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "23",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Gonzalez-Mesa:2014:ETM,
author = "M. A. Gonzalez-Mesa and Eladio Gutierrez and Emilio L.
Zapata and Oscar Plata",
title = "Effective Transactional Memory Execution Management
for Improved Concurrency",
journal = j-TACO,
volume = "11",
number = "3",
pages = "24:1--24:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2633048",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article describes a transactional memory
execution model intended to exploit maximum parallelism
from sequential and multithreaded programs. A program
code section is partitioned into chunks that will be
mapped onto threads and executed transactionally. These
transactions run concurrently and out of order, trying
to exploit maximum parallelism but managed by a
specific fully distributed commit control to meet data
dependencies. To accomplish correct parallel execution,
a partial precedence order relation is derived from the
program code section and/or defined by the programmer.
When a conflict between chunks is eagerly detected, the
precedence order relation is used to determine the best
policy to solve the conflict that preserves the
precedence order while maximizing concurrency. The
model defines a new transactional state called executed
but not committed. This state allows exploiting
concurrency on two levels: intrathread and interthread.
Intrathread concurrency is improved by having pending
uncommitted transactions while executing a new one in
the same thread. The new state improves interthread
concurrency because it permits out-of-order transaction
commits regarding the precedence order. Our model has
been implemented in a lightweight software
transactional memory system, TinySTM, and has been
evaluated on a set of benchmarks, obtaining a
significant performance improvement over the baseline
TM system.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "24",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Kumar:2014:EPG,
author = "Rakesh Kumar and Alejandro Mart{\'\i}nez and Antonio
Gonz{\'a}lez",
title = "Efficient Power Gating of {SIMD} Accelerators Through
Dynamic Selective Devectorization in an {HW\slash SW}
Codesigned Environment",
journal = j-TACO,
volume = "11",
number = "3",
pages = "25:1--25:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629681",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Leakage energy is a growing concern in current and
future microprocessors. Functional units of
microprocessors are responsible for a major fraction of
this energy. Therefore, reducing functional unit
leakage has received much attention in recent years.
Power gating is one of the most widely used techniques
to minimize leakage energy. Power gating turns off the
functional units during the idle periods to reduce the
leakage. Therefore, the amount of leakage energy
savings is directly proportional to the idle time
duration. This article focuses on increasing the idle
interval for the higher SIMD lanes. The applications
are profiled dynamically, in a hardware/software
codesigned environment, to find the higher SIMD lanes'
usage pattern. If the higher lanes need to be turned on
only for short periods, the corresponding portion of
the code is devectorized to keep the higher lanes off.
The devectorized code is executed on the lowest SIMD
lane. Our experimental results show that the average
energy savings of the proposed mechanism are 15\%,
12\%, and 71\% greater than with power gating alone for
SPECFP2006, Physicsbench, and Eigen benchmark suites,
respectively. Moreover, the slowdown caused by
devectorization is negligible.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "25",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
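%%% The proportionality noted in the abstract above follows the
%%% standard power-gating trade-off; a sketch in our notation: gating
%%% an idle unit for a period $t_{idle}$ saves about
%%% $$E_{saved} \approx P_{leak}\,t_{idle} - E_{switch},$$
%%% where $E_{switch}$ is the overhead of toggling the sleep
%%% transistors, so gating pays off only when $t_{idle}$ exceeds the
%%% break-even time $t_{BE} = E_{switch}/P_{leak}$. Devectorizing
%%% briefly used upper SIMD lanes lengthens $t_{idle}$ and thus the
%%% savings margin.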
@Article{Carlo:2014:FAA,
author = "Stefano {Di Carlo} and Salvatore Galfano and Marco
Indaco and Paolo Prinetto and Davide Bertozzi and Piero
Olivo and Cristian Zambelli",
title = "{FLARES}: an Aging Aware Algorithm to Autonomously
Adapt the Error Correction Capability in {NAND} Flash
Memories",
journal = j-TACO,
volume = "11",
number = "3",
pages = "26:1--26:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2631919",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With the advent of solid-state storage systems, NAND
flash memories are becoming a key storage technology.
However, they suffer from serious reliability and
endurance issues during the operating lifetime that can
be handled by the use of appropriate error correction
codes (ECCs) in order to reconstruct the information
when needed. Adaptable ECCs may provide the flexibility
to avoid worst-case reliability design, thus leading to
improved performance. However, a way to control such
adaptable ECCs' strength is required. This article
proposes FLARES, an algorithm able to adapt the ECC
correction capability of each page of a flash based on
a flash Raw Bit Error Rate (RBER) prediction model and
on a measurement of
the number of errors detected in a given time window.
FLARES has been fully implemented within the YAFFS 2
filesystem under the Linux operating system. This
allowed us to perform an extensive set of simulations
on a set of standard benchmarks that highlighted the
benefit of FLARES on the overall storage subsystem
performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "26",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bartolini:2014:AFG,
author = "Davide B. Bartolini and Filippo Sironi and Donatella
Sciuto and Marco D. Santambrogio",
title = "Automated Fine-Grained {CPU} Provisioning for Virtual
Machines",
journal = j-TACO,
volume = "11",
number = "3",
pages = "27:1--27:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2637480",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/multithreading.bib;
https://www.math.utah.edu/pub/tex/bib/taco.bib;
https://www.math.utah.edu/pub/tex/bib/virtual-machines.bib",
abstract = "Ideally, the pay-as-you-go model of Infrastructure as
a Service (IaaS) clouds should enable users to rent
just enough resources (e.g., CPU or memory bandwidth)
to fulfill their service level objectives (SLOs).
Achieving this goal is hard on current IaaS offerings,
which require users to explicitly specify the amount of
resources to reserve; this requirement is nontrivial
for users, because estimating the amount of resources
needed to attain application-level SLOs is often
complex, especially when resources are virtualized and
the service provider colocates virtual machines (VMs)
on host nodes. For this reason, users who deploy VMs
subject to SLOs are usually prone to overprovisioning
resources, thus resulting in inflated business costs.
This article tackles this issue with AutoPro: a runtime
system that enhances IaaS clouds with automated and
fine-grained resource provisioning based on performance
SLOs. Our main contribution with AutoPro is filling the
gap between application-level performance SLOs and
allocation of a contended resource, without requiring
explicit reservations from users. In this article, we
focus on CPU bandwidth allocation to throughput-driven,
compute-intensive multithreaded applications colocated
on a multicore processor; we show that a theoretically
sound, yet simple, control strategy can enable
automated fine-grained allocation of this contended
resource, without the need for offline profiling.
Additionally, AutoPro helps service providers optimize
infrastructure utilization by provisioning idle
resources to best-effort workloads, so as to maximize
node-level utilization. Our extensive experimental
evaluation confirms that AutoPro is able to
automatically determine and enforce allocations to meet
performance SLOs while maximizing node-level
utilization by supporting batch workloads on a
best-effort basis.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "27",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Carlson:2014:EHL,
author = "Trevor E. Carlson and Wim Heirman and Stijn Eyerman
and Ibrahim Hur and Lieven Eeckhout",
title = "An Evaluation of High-Level Mechanistic Core Models",
journal = j-TACO,
volume = "11",
number = "3",
pages = "28:1--28:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2629677",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Large core counts and complex cache hierarchies are
increasing the burden placed on commonly used
simulation and modeling techniques. Although analytical
models provide fast results, they do not apply to
complex, many-core shared-memory systems. In contrast,
detailed cycle-level simulation can be accurate but
also tends to be slow, which limits the number of
configurations that can be evaluated. A middle ground
is needed that provides for fast simulation of complex
many-core processors while still providing accurate
results. In this article, we explore, analyze, and
compare the accuracy and simulation speed of
high-abstraction core models as a potential solution to
slow cycle-level simulation. We describe a number of
enhancements to interval simulation to improve its
accuracy while maintaining simulation speed. In
addition, we introduce the instruction-window centric
(IW-centric) core model, a new mechanistic core model
that bridges the gap between interval simulation and
cycle-accurate simulation by enabling high-speed
simulations with higher levels of detail. We also show
that using accurate core models like these is important
for memory subsystem studies, and that
simple, naive models, like a one-IPC core model, can
lead to misleading and incorrect results and
conclusions in practical design studies. Validation
against real hardware shows good accuracy, with an
average single-core error of 11.1\% and a maximum of
18.8\% for the IW-centric model with a 1.5$ \times $
slowdown compared to interval simulation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "28",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
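%%% A minimal sketch of the modeling gap discussed in the abstract
%%% above, in the standard interval-analysis form (notation ours): a
%%% naive one-IPC model charges $C \approx N$ cycles for $N$
%%% instructions, whereas a mechanistic interval-style model charges
%%% $$C \approx N/D + \textstyle\sum_i p_i,$$
%%% where $D$ is the dispatch width and $p_i$ is the penalty of each
%%% miss event (branch mispredictions, instruction-cache and
%%% long-latency data-cache misses) that interrupts otherwise smooth
%%% dispatch.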
@Article{Hijaz:2014:NLN,
author = "Farrukh Hijaz and Omer Khan",
title = "{NUCA-L1}: a Non-Uniform Access Latency Level-1 Cache
Architecture for Multicores Operating at Near-Threshold
Voltages",
journal = j-TACO,
volume = "11",
number = "3",
pages = "29:1--29:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2631918",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Research has shown that operating in the
near-threshold region is expected to provide up to a 10$
\times $ improvement in energy efficiency for future
processors.
However, reliable operation below a minimum voltage
(Vccmin) cannot be guaranteed due to process
variations. Because SRAM margins can easily be violated
at near-threshold voltages, their bit-cell failure
rates are expected to rise steeply. Multicore
processors rely on fast private L1 caches to exploit
data locality and achieve high performance. In the
presence of high bit-cell fault rates, traditionally an
L1 cache either sacrifices capacity or incurs
additional latency to correct the faults. We observe
that L1 cache sensitivity to hit latency offers a
design trade-off between capacity and latency. When
fault rate is high at extreme Vccmin, it is beneficial
to recover L1 cache capacity, even if it comes at the
cost of additional latency. However, at low fault
rates, the additional constant latency to recover cache
capacity degrades performance. With this trade-off in
mind, we propose a Non-Uniform Cache Access L1
architecture (NUCA-L1) that avoids additional latency
on accesses to fault-free cache lines. To mitigate the
capacity bottleneck, it deploys a correction mechanism
to recover capacity at the cost of additional latency.
Using extensive simulations of a 64-core multicore, we
demonstrate that at various bit-cell fault rates, our
proposed private NUCA-L1 cache architecture performs
better than state-of-the-art schemes, along with a
significant reduction in energy consumption.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "29",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
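%%% Background for the near-threshold claim in the abstract above
%%% (standard CMOS scaling, our gloss): dynamic switching energy per
%%% operation scales as
%%% $$E_{dyn} \propto C\,V_{dd}^{2},$$
%%% so lowering the supply voltage $V_{dd}$ from its nominal value
%%% toward the threshold voltage cuts energy roughly quadratically, at
%%% the cost of lower frequency and the steeply rising SRAM bit-cell
%%% failure rates that the article addresses.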
@Article{Drebes:2014:TAD,
author = "Andi Drebes and Karine Heydemann and Nathalie Drach
and Antoniu Pop and Albert Cohen",
title = "Topology-Aware and Dependence-Aware Scheduling and
Memory Allocation for Task-Parallel Languages",
journal = j-TACO,
volume = "11",
number = "3",
pages = "30:1--30:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2641764",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We present a joint scheduling and memory allocation
algorithm for efficient execution of task-parallel
programs on non-uniform memory architecture (NUMA)
systems. Task and data placement decisions are based on
a static description of the memory hierarchy and on
runtime information about intertask communication.
Existing locality-aware scheduling strategies for
fine-grained tasks have strong limitations: they are
specific to some class of machines or applications,
they do not handle task dependences, they require
manual program annotations, or they rely on fragile
profiling schemes. By contrast, our solution makes no
assumption on the structure of programs or on the
layout of data in memory. Experimental results, based
on the OpenStream language, show that locality of
accesses to main memory of scientific applications can
be increased significantly on a 64-core machine,
resulting in a speedup of up to 1.63$ \times $ compared
to a state-of-the-art work-stealing scheduler.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "30",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tawa:2014:EEF,
author = "Venkata Kalyan Tawa and Ravi Kasha and Madhu Mutyam",
title = "{EFGR}: an Enhanced Fine Granularity Refresh Feature
for High-Performance {DDR4 DRAM} Devices",
journal = j-TACO,
volume = "11",
number = "3",
pages = "31:1--31:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656340",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "High-density DRAM devices spend significant time
refreshing the DRAM cells, leading to a performance drop.
The JEDEC DDR4 standard provides a Fine Granularity
Refresh (FGR) feature to tackle refresh. Motivated by
the observation that in FGR mode, only a few banks are
involved, we propose an Enhanced FGR (EFGR) feature
that introduces three optimizations to the basic FGR
feature and exposes the bank-level parallelism within
the rank even during refresh. The first
optimization decouples the nonrefreshing banks. The
second and third optimizations determine the maximum
number of nonrefreshing banks that can be active during
refresh and selectively precharge the banks before
refresh, respectively. Our simulation results show that
the EFGR feature is able to recover almost 56.6\% of
the performance loss incurred due to refresh
operations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "31",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yalcin:2014:EEC,
author = "Gulay Yalcin and Oguz Ergin and Emrah Islek and Osman
Sabri Unsal and Adrian Cristal",
title = "Exploiting Existing Comparators for Fine-Grained
Low-Cost Error Detection",
journal = j-TACO,
volume = "11",
number = "3",
pages = "32:1--32:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656341",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Fault tolerance has become a fundamental concern in
computer design, in addition to performance and power.
Although several error detection schemes have been
proposed to discover a faulty core in the system, these
proposals could waste the whole core, including many
error-free structures in it after error detection.
Moreover, many fault-tolerant designs require
additional hardware for data replication or for
comparing the replicated data. In this study, we
provide a low-cost, fine-grained error detection scheme
by exploiting already existing comparators and data
replications in several pipeline structures, such as
the issue queue (IQ), rename logic, and the translation
lookaside buffer (TLB). We reduce the vulnerability of
the source
register tags in IQ by 60\%, the vulnerability of
instruction TLB by 64\%, the vulnerability of data TLB
by 45\%, and the vulnerability of the register tags of
rename logic by 20\%.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "32",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Ramachandran:2014:HFR,
author = "Pradeep Ramachandran and Siva Kumar Sastry Hari and
Manlap Li and Sarita V. Adve",
title = "Hardware Fault Recovery for {I/O} Intensive
Applications",
journal = j-TACO,
volume = "11",
number = "3",
pages = "33:1--33:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2656342",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "With continued process scaling, the rate of hardware
failures in commodity systems is increasing. Because
these commodity systems are highly sensitive to cost,
traditional solutions that employ heavy redundancy to
handle such failures are no longer acceptable owing to
their high associated costs. Detecting such faults by
identifying anomalous software execution and recovering
through checkpoint-and-replay is emerging as a viable
low-cost alternative for future commodity systems. An
important but commonly ignored aspect of such solutions
is ensuring that external outputs to the system are
fault-free. The outputs must be delayed until the
detectors guarantee this, influencing fault-free
performance. The overheads for resiliency must thus be
evaluated while taking these delays into consideration;
prior work has largely ignored this relationship. This
article concerns recovery for I/O intensive
applications from in-core faults. We present a strategy
to buffer external outputs using dedicated hardware and
show that checkpoint intervals previously considered as
acceptable incur exorbitant overheads when hardware
buffering is considered. We then present two techniques
to reduce the checkpoint interval and demonstrate a
practical solution that provides high resiliency while
incurring low overheads.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "33",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Eyerman:2014:MTM,
author = "Stijn Eyerman and Pierre Michaud and Wouter Rogiest",
title = "Multiprogram Throughput Metrics: a Systematic
Approach",
journal = j-TACO,
volume = "11",
number = "3",
pages = "34:1--34:??",
month = oct,
year = "2014",
CODEN = "????",
DOI = "https://doi.org/10.1145/2663346",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Oct 27 17:02:20 MDT 2014",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Running multiple programs on a processor aims at
increasing the throughput of that processor. However,
defining meaningful throughput metrics in a simulation
environment is not as straightforward as reporting
execution time. This has led to an ongoing debate on
what forms a meaningful throughput metric for
multiprogram workloads. We present a method to
construct throughput metrics in a systematic way: we
start by expressing assumptions on job size, job
distribution, scheduling, and so forth that together
define a theoretical throughput experiment. The
throughput metric is then the average throughput of
this experiment. Different assumptions lead to
different metrics, so one should be aware of these
assumptions when making conclusions based on results
using a specific metric. Throughput metrics should
always be defined from explicit assumptions, because
this leads to a better understanding of the
implications and limits of the results obtained with
that metric. We elaborate multiple metrics based on
different assumptions. In particular, we identify the
assumptions that lead to the commonly used weighted
speedup and harmonic mean of speedups. Our study
clarifies that they are actual throughput metrics,
which was recently questioned. We also propose some new
throughput metrics, which cannot always be expressed as
a closed formula. We use real experimental data to
characterize metrics and show how they relate to each
other.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "34",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
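%%% The two commonly used metrics identified in the abstract above
%%% have standard closed forms; a sketch, with $IPC_i^{SP}$ and
%%% $IPC_i^{MP}$ denoting program $i$'s single-program and
%%% multiprogram IPC over $n$ co-scheduled programs (notation ours):
%%% $$\mathrm{WeightedSpeedup} =
%%%     \sum_{i=1}^{n} \frac{IPC_i^{MP}}{IPC_i^{SP}},
%%%   \qquad
%%%   \mathrm{HarmonicSpeedup} =
%%%     \frac{n}{\sum_{i=1}^{n} IPC_i^{SP}/IPC_i^{MP}}.$$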
@Article{Nugteren:2015:BAS,
author = "Cedric Nugteren and Henk Corporaal",
title = "{Bones}: an Automatic Skeleton-Based {C-to-CUDA}
Compiler for {GPUs}",
journal = j-TACO,
volume = "11",
number = "4",
pages = "35:1--35:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2665079",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The shift toward parallel processor architectures has
made programming and code generation increasingly
challenging. To address this programmability challenge,
this article presents a technique to fully
automatically generate efficient and readable code for
parallel processors (with a focus on GPUs). This is
made possible by combining algorithmic skeletons,
traditional compilation, and ``algorithmic species,'' a
classification of program code. Compilation starts by
automatically annotating C code with class information
(the algorithmic species). This code is then fed into
the skeleton-based source-to-source compiler Bones to
generate CUDA code. To generate efficient code, Bones
also performs optimizations including host-accelerator
transfer optimization and kernel fusion. This results
in a unique approach, integrating a skeleton-based
compiler for the first time into an automated flow. The
benefits are demonstrated experimentally for PolyBench
GPU kernels, showing geometric mean speed-ups of 1.4$
\times $ and 2.4$ \times $ compared to ppcg and
Par4All, and for five Rodinia GPU benchmarks, showing a
gap of only 1.2$ \times $ compared to hand-optimized
code.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "35",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2015:BOM,
author = "Jue Wang and Xiangyu Dong and Yuan Xie",
title = "Building and Optimizing {MRAM}-Based Commodity
Memories",
journal = j-TACO,
volume = "11",
number = "4",
pages = "36:1--36:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2667105",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging non-volatile memory technologies such as MRAM
are promising design solutions for energy-efficient
memory architecture, especially for mobile systems.
However, building commodity MRAM by reusing DRAM
designs is not straightforward. The existing memory
interfaces are incompatible with MRAM's small page size,
and they fail to leverage MRAM's unique properties,
causing unnecessary performance and energy overhead. In
this article, we propose four techniques to enable and
optimize an LPDDRx-compatible MRAM solution: ComboAS to
solve the pin incompatibility; DynLat to avoid
unnecessary access latencies; and EarlyPA and BufW to
further improve performance by exploiting the MRAM
unique features of non-destructive read and independent
write path. Combining all these techniques together, we
boost the MRAM performance by 17\% and provide a
DRAM-compatible MRAM solution consuming 21\% less
energy.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "36",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Komuravelli:2015:RCH,
author = "Rakesh Komuravelli and Sarita V. Adve and Ching-Tsun
Chou",
title = "Revisiting the Complexity of Hardware Cache Coherence
and Some Implications",
journal = j-TACO,
volume = "11",
number = "4",
pages = "37:1--37:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2663345",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Cache coherence is an integral part of shared-memory
systems but is also widely considered to be one of the
most complex parts of such systems. Much prior work has
addressed this complexity and the verification
techniques to prove the correctness of hardware
coherence. Given the new multicore era with increasing
number of cores, there is a renewed debate about
whether the complexity of hardware coherence has been
tamed or whether it should be abandoned in favor of
software coherence. This article revisits the
complexity of hardware cache coherence by verifying a
publicly available, state-of-the-art implementation of
the widely used MESI protocol, using the Mur$ \varphi $
model checking tool. To our surprise, we found six bugs
in this protocol, most of which were hard to analyze
and took several days to fix. To compare the
complexity, we also verified the recently proposed
DeNovo protocol, which exploits disciplined software
programming models. We found three relatively easy to
fix bugs in this less mature protocol. After fixing
these bugs, our verification experiments showed that,
compared to DeNovo, MESI had 15X more reachable states
leading to a 20X increase in verification (model
checking) time. Although we were eventually successful
in verifying the protocols, the tool required making
several simplifying assumptions (e.g., two cores, one
address). Our results have several implications: (1)
they indicate that hardware coherence protocols remain
complex; (2) they reinforce the need for protocol
designers to embrace formal verification tools to
demonstrate correctness of new protocols and
extensions; (3) they reinforce the need for formal
verification tools that are both scalable and usable by
non-experts; and (4) they show that a system based on
hardware-software co-design can offer a simpler
approach for cache coherence, thus reducing the overall
verification effort and allowing verification of more
detailed models and protocol extensions that are
otherwise limited by computing resources.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "37",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rodriguez:2015:VSR,
author = "Gabriel Rodr{\'\i}guez and Juan Touri{\~n}o and Mahmut
T. Kandemir",
title = "Volatile {STT--RAM} Scratchpad Design and Data
Allocation for Low Energy",
journal = j-TACO,
volume = "11",
number = "4",
pages = "38:1--38:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2669556",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "On-chip power consumption is one of the fundamental
challenges of current technology scaling. Cache
memories consume a sizable part of this power,
particularly due to leakage energy. STT-RAM is one of
several new memory technologies that have been proposed
in order to improve power while preserving performance.
It features high density and low leakage, but at the
expense of write energy and performance. This article
explores the use of STT-RAM--based scratchpad memories
that trade nonvolatility for faster and less
energy-expensive accesses, making them
feasible for on-chip implementation in embedded
systems. A novel multiretention scratchpad partitioning
is proposed, featuring multiple storage spaces with
different retention, energy, and performance
characteristics. A customized compiler-based allocation
algorithm suitable for use with such a scratchpad
organization is described. Our experiments indicate
that a multiretention STT-RAM scratchpad can provide
energy savings of 53\% with respect to an iso-area,
hardware-managed SRAM cache.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "38",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Camarero:2015:TCH,
author = "Crist{\'o}bal Camarero and Enrique Vallejo and
Ram{\'o}n Beivide",
title = "Topological Characterization of {Hamming} and
Dragonfly Networks and Its Implications on Routing",
journal = j-TACO,
volume = "11",
number = "4",
pages = "39:1--39:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677038",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Current High-Performance Computing (HPC) and data
center networks rely on large-radix routers. Hamming
graphs (Cartesian products of complete graphs) and
dragonflies (two-level direct networks with nodes
organized in groups) are some direct topologies
proposed for such networks. The original definition of
the dragonfly topology is very loose, with several
degrees of freedom, such as the inter- and intragroup
topology, the specific global connectivity, and the
number of parallel links between groups (or trunking
level). This work provides a comprehensive analysis of
the topological properties of the dragonfly network,
providing balancing conditions for network
dimensioning, as well as introducing and classifying
several alternatives for the global connectivity and
trunking level. From a topological study of the
network, it is noted that a Hamming graph can be seen
as a canonical dragonfly topology with a high level of
trunking. Based on this observation and by carefully
selecting the global connectivity, the Dimension Order
Routing (DOR) mechanism safely used in Hamming graphs
is adapted to dragonfly networks with trunking. The
resulting routing algorithms approximate the
performance of minimal, nonminimal, and adaptive
routings typically used in dragonflies but without
requiring virtual channels to avoid packet deadlock,
thus allowing for lower cost router implementations.
This is obtained by properly selecting the link to
route between groups based on a graph coloring of
network routers. Evaluations show that the proposed
mechanisms are competitive with traditional solutions
when using the same number of virtual channels and
enable simpler implementations with lower cost.
Finally, multilevel dragonflies are discussed,
considering how the proposed mechanisms could be
adapted to them.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "39",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Yoon:2015:EDM,
author = "Hanbin Yoon and Justin Meza and Naveen Muralimanohar
and Norman P. Jouppi and Onur Mutlu",
title = "Efficient Data Mapping and Buffering Techniques for
Multilevel Cell Phase-Change Memories",
journal = j-TACO,
volume = "11",
number = "4",
pages = "40:1--40:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2669365",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "New phase-change memory (PCM) devices have low-access
latencies (like DRAM) and high capacities (i.e., low
cost per bit, like Flash). In addition to being able to
scale to smaller cell sizes than DRAM, a PCM cell can
also store multiple bits per cell (referred to as
multilevel cell, or MLC), enabling even greater
capacity per bit. However, reading and writing the
different bits of data from and to an MLC PCM cell
requires different amounts of time: one bit is read or
written first, followed by another. Due to this
asymmetric access process, the bits in an MLC PCM cell
have different access latency and energy depending on
which bit in the cell is being read or written. We
leverage this observation to design a new way to store
and buffer data in MLC PCM devices. While traditional
devices couple the bits in each cell next to one
another in the address space, our key idea is to
logically decouple the bits in each cell into two
separate regions depending on their read/write
characteristics: fast-read/slow-write bits and
slow-read/fast-write bits. We propose a low-overhead
hardware/software technique to predict and map data
that would benefit from being in each region at
runtime. In addition, we show how MLC bit decoupling
provides more flexibility in the way data is buffered
in the device, enabling more efficient use of existing
device buffer space. Our evaluations for a multicore
system show that MLC bit decoupling improves system
performance by 19.2\%, memory energy efficiency by
14.4\%, and thread fairness by 19.3\% over a
state-of-the-art MLC PCM system that couples the bits
in its cells. We show that our results are consistent
across a variety of workloads and system
configurations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "40",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
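
The bit-decoupling idea in the Yoon et al. abstract above lends
itself to a tiny address-mapping illustration. The C sketch below
(toy sizes and names, not the paper's hardware design) contrasts the
traditional coupled layout, where the two bits of an MLC cell sit
next to each other in the address space, with a decoupled layout
that gathers the fast-read bits and the fast-write bits into two
separate regions:

#include <stdio.h>

#define NUM_CELLS 8  /* toy device: 8 MLC cells, 2 bits each */

/* Traditional coupled layout: both bits of cell c are adjacent. */
static unsigned coupled_addr(unsigned cell, unsigned bit) {
    return 2 * cell + bit;
}

/* Decoupled layout: bit 0 (fast-read/slow-write) of every cell forms
   one region; bit 1 (slow-read/fast-write) forms another. */
static unsigned decoupled_addr(unsigned cell, unsigned bit) {
    return bit * NUM_CELLS + cell;
}

int main(void) {
    for (unsigned c = 0; c < NUM_CELLS; c++)
        printf("cell %u: coupled (%u,%u) decoupled (%u,%u)\n",
               c, coupled_addr(c, 0), coupled_addr(c, 1),
               decoupled_addr(c, 0), decoupled_addr(c, 1));
    return 0;
}
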
@Article{Premillieu:2015:EOE,
author = "Nathanael Pr{\'e}millieu and Andr{\'e} Seznec",
title = "Efficient Out-of-Order Execution of Guarded {ISAs}",
journal = j-TACO,
volume = "11",
number = "4",
pages = "41:1--41:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677037",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "ARM ISA-based processors are no longer low-cost,
low-power processors. Nowadays, ARM ISA-based processor
manufacturers are striving to implement medium-end to
high-end processor cores, which implies implementing a
state-of-the-art out-of-order execution engine.
Unfortunately, providing efficient out-of-order
execution on legacy ARM codes may be quite challenging
due to guarded instructions. Predicting the guarded
instructions addresses the main serialization impact
associated with guarded instruction execution and the
multiple-definition problem. Moreover, guard prediction
allows one to use a global branch-and-guard history
predictor to predict both branches and guards, often
improving branch prediction accuracy. Unfortunately,
such a global branch-and-guard history predictor
requires the systematic use of guard predictions. In
that case, poor guard prediction accuracy would lead to
poor overall performance on some applications. Building
on top of recent advances in branch prediction and
confidence estimation, we propose a hybrid
branch-and-guard predictor, combining a global branch
history component and global branch-and-guard history
component. The potential gain or loss due to the
systematic use of guard prediction is dynamically
evaluated at runtime. Two computing modes are enabled:
systematic guard prediction use and
high-confidence-only guard prediction use. Our
experiments show that on most applications, an
overwhelming majority of guarded instructions are
predicted. Therefore, a simple but relatively
inefficient hardware solution can be used to execute
the few unpredicted guarded instructions. Significant
performance benefits are observed on most applications,
while applications with poorly predictable guards do
not suffer from performance loss.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "41",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Wang:2015:APM,
author = "Zheng Wang and Dominik Grewe and Michael F. P.
O'Boyle",
title = "Automatic and Portable Mapping of Data Parallel
Programs to {OpenCL} for {GPU}-Based Heterogeneous
Systems",
journal = j-TACO,
volume = "11",
number = "4",
pages = "42:1--42:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677036",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "General-purpose GPU-based systems are highly
attractive, as they give potentially massive
performance at little cost. Realizing such potential is
challenging due to the complexity of programming. This
article presents a compiler-based approach to
automatically generate optimized OpenCL code from data
parallel OpenMP programs for GPUs. A key feature of our
scheme is that it leverages existing transformations,
especially data transformations, to improve performance
on GPU architectures and uses machine learning to
automatically build a predictive model to determine if it
is worthwhile running the OpenCL code on the GPU or
OpenMP code on the multicore host. We applied our
approach to the entire NAS parallel benchmark suite and
evaluated it on distinct GPU-based systems. We achieved
average (up to) speedups of $ 4.51 \times $ and $ 4.20
\times $ ($ 143 \times $ and $ 67 \times $) on Core
i7/NVIDIA GeForce GTX580 and Core i7/AMD Radeon 7970
platforms, respectively, over a sequential baseline.
Our approach achieves, on average, greater than $ 10
\times $ speedups over two state-of-the-art automatic
GPU code generators.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "42",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{He:2015:IHF,
author = "Dan He and Fang Wang and Hong Jiang and Dan Feng and
Jing Ning Liu and Wei Tong and Zheng Zhang",
title = "Improving Hybrid {FTL} by Fully Exploiting Internal
{SSD} Parallelism with Virtual Blocks",
journal = j-TACO,
volume = "11",
number = "4",
pages = "43:1--43:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677160",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Compared with either block or page-mapping Flash
Translation Layer (FTL), hybrid-mapping FTL for flash
Solid State Disks (SSDs), such as Fully Associative
Section Translation (FAST), has relatively high space
efficiency because of its smaller mapping table than
the latter and higher flexibility than the former. As a
result, hybrid-mapping FTL has become the most commonly
used scheme in SSDs. But the hybrid-mapping FTL incurs
a large number of costly full-merge operations. Thus, a
critical challenge to hybrid-mapping FTL is how to
reduce the cost of full-merge operations and improve
partial merge operations and switch operations. In this
article, we propose a novel FTL scheme, called Virtual
Block-based Parallel FAST (VBP-FAST), that divides
flash area into Virtual Blocks (VBlocks) and Physical
Blocks (PBlocks) where VBlocks are used to fully
exploit channel-level, die-level, and plane-level
parallelism of flash. Leveraging these three levels of
parallelism, the cost of full merge in VBP-FAST is
significantly reduced from that of FAST. In the
meantime, VBP-FAST uses PBlocks to retain the
advantages of partial merge and switch operations. Our
extensive trace-driven simulation results show that
VBP-FAST speeds up FAST by a factor of 5.3--8.4 for
random workloads and of 1.7 for sequential workloads
with channel-level, die-level, and plane-level
parallelism of 8, 2, and 2 (i.e., eight channels, two
dies, and two planes).",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "43",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Rubin:2015:MOM,
author = "Eri Rubin and Ely Levy and Amnon Barak and Tal
Ben-Nun",
title = "{MAPS}: Optimizing Massively Parallel Applications
Using Device-Level Memory Abstraction",
journal = j-TACO,
volume = "11",
number = "4",
pages = "44:1--44:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2680544",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GPUs play an increasingly important role in
high-performance computing. While developing naive code
is straightforward, optimizing massively parallel
applications requires deep understanding of the
underlying architecture. The developer must struggle
with complex index calculations and manual memory
transfers. This article classifies memory access
patterns used in most parallel algorithms, based on
Berkeley's Parallel ``Dwarfs.'' It then proposes the
MAPS framework, a device-level memory abstraction that
facilitates memory access on GPUs, alleviating complex
indexing using on-device containers and iterators. This
article presents an implementation of MAPS and shows
that its performance is comparable to carefully
optimized implementations of real-world applications.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "44",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Cilardo:2015:IMM,
author = "Alessandro Cilardo and Luca Gallo",
title = "Improving Multibank Memory Access Parallelism with
Lattice-Based Partitioning",
journal = j-TACO,
volume = "11",
number = "4",
pages = "45:1--45:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2675359",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Emerging architectures, such as reconfigurable
hardware platforms, provide the unprecedented
opportunity of customizing the memory infrastructure
based on application access patterns. This work
addresses the problem of automated memory partitioning
for such architectures, taking into account potentially
parallel data accesses to physically independent banks.
Targeted at affine static control parts (SCoPs), the
technique relies on the Z-polyhedral model for program
analysis and adopts a partitioning scheme based on
integer lattices. The approach enables the definition
of a solution space including previous works as
particular cases. The problem of minimizing the total
amount of memory required across the partitioned banks,
referred to as storage minimization throughout the
article, is tackled by an optimal approach yielding
asymptotically zero memory waste or, as an alternative,
an efficient approach ensuring arbitrarily small waste.
The article also presents a prototype toolchain and a
detailed step-by-step case study demonstrating the
impact of the proposed technique along with extensive
comparisons with alternative approaches in the
literature.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "45",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Martinsen:2015:EPT,
author = "Jan Kasper Martinsen and H{\aa}kan Grahn and Anders
Isberg",
title = "The Effects of Parameter Tuning in Software
Thread-Level Speculation in {JavaScript} Engines",
journal = j-TACO,
volume = "11",
number = "4",
pages = "46:1--46:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686036",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "JavaScript is a sequential programming language that
has a large potential for parallel execution in Web
applications. Thread-level speculation can take
advantage of this, but it has a large memory overhead.
In this article, we evaluate the effects of adjusting
various parameters for thread-level speculation. Our
results clearly show that thread-level speculation is a
useful technique for taking advantage of multicore
architectures for JavaScript in Web applications, that
nested speculation is required in thread-level
speculation, and that the execution characteristics of
Web applications significantly reduce the needed
memory, the number of threads, and the depth of our
speculation.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "46",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Colombet:2015:SOS,
author = "Quentin Colombet and Florian Brandner and Alain
Darte",
title = "Studying Optimal Spilling in the Light of {SSA}",
journal = j-TACO,
volume = "11",
number = "4",
pages = "47:1--47:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685392",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Recent developments in register allocation, mostly
linked to static single assignment (SSA) form, have
shown the benefits of decoupling the problem in two
phases: a first spilling phase places load and store
instructions so that the register pressure at all
program points is small enough, and a second assignment
and coalescing phase maps the variables to physical
registers and reduces the number of move instructions
among registers. This article focuses on the first
phase, for which many open questions remain: in
particular, we study the notion of optimal spilling
(what can be expressed?) and the impact of SSA form
(does it help?). To identify the important features for
optimal spilling on load-store architectures, we
develop a new integer linear programming formulation,
more accurate and expressive than previous approaches.
Among other features, we can express SSA $ \phi
$-functions, memory-to-memory copies, and the fact that
a value can be stored simultaneously in a register and
in memory. Based on this formulation, we present a
thorough analysis of the results obtained for the
SPECINT 2000 and EEMBC 1.1 benchmarks, from which we
draw, among others, the following conclusions: (1)
rematerialization is extremely important; (2) SSA
complicates the formulation of optimal spilling,
especially because of memory coalescing when the code
is not in conventional SSA (CSSA); (3)
microarchitectural features are significant and thus
have to be accounted for; and (4) significant savings
can be obtained in terms of static spill costs, cache
miss rates, and dynamic instruction counts.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "47",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Haj-Yihia:2015:CDP,
author = "Jawad Haj-Yihia and Yosi {Ben Asher} and Efraim Rotem
and Ahmad Yasin and Ran Ginosar",
title = "Compiler-Directed Power Management for Superscalars",
journal = j-TACO,
volume = "11",
number = "4",
pages = "48:1--48:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685393",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Modern superscalar CPUs contain large complex
structures and diverse execution units, consuming a wide
dynamic power range. Building a power delivery network
for the worst-case power consumption is not energy
efficient and is often impossible to fit into small
systems. Instantaneous power excursions can cause
voltage droops. Power management algorithms are too
slow to respond to instantaneous events. In this
article, we propose a novel compiler-directed framework
to address this problem. The framework is validated on
a 4th Generation Intel\reg{} Core\TM{} processor and with
a simulator on output traces. Up to 16\% performance
speedup is measured over baseline for the SPEC CPU2006
benchmarks.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "48",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Trinh:2015:EDE,
author = "Hong-Phuc Trinh and Marc Duranton and Michel
Paindavoine",
title = "Efficient Data Encoding for Convolutional Neural
Network Application",
journal = j-TACO,
volume = "11",
number = "4",
pages = "49:1--49:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685394",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This article presents an approximate data encoding
scheme called Significant Position Encoding (SPE). The
encoding allows efficient implementation of the recall
phase (forward propagation pass) of Convolutional
Neural Networks (CNN), a typical Feed-Forward Neural
Network. This implementation uses only a 7-bit data
representation and achieves almost the same
classification performance as the initial network: on
the MNIST handwriting recognition task, this data
encoding scheme loses only 0.03\% in recognition rate
(99.27\% vs. 99.3\%). In terms of storage, we achieve a
12.5\% gain compared with an 8-bit fixed-point
implementation of the same CNN. Moreover, this data
encoding allows an efficient implementation of the
processing unit thanks to the simplicity of the scalar
product operation, the principal operation in a
Feed-Forward Neural Network.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "49",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Breugh:2015:MAM,
author = "Maximilien B. Breugh and Stijn Eyerman and Lieven
Eeckhout",
title = "Mechanistic Analytical Modeling of Superscalar
In-Order Processor Performance",
journal = j-TACO,
volume = "11",
number = "4",
pages = "50:1--50:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2678277",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Superscalar in-order processors form an interesting
alternative to out-of-order processors because of their
energy efficiency and lower design complexity. However,
despite the reduced design complexity, it is nontrivial
to get performance estimates or insight in the
application--microarchitecture interaction without
running slow, detailed cycle-level simulations, because
performance highly depends on the order of instructions
within the application's dynamic instruction stream, as
in-order processors stall on interinstruction
dependences and functional unit contention. To limit
the number of detailed cycle-level simulations needed
during design space exploration, we propose a
mechanistic analytical performance model that is built
from understanding the internal mechanisms of the
processor. The mechanistic performance model for
superscalar in-order processors is shown to be accurate
with an average performance prediction error of 3.2\%
compared to detailed cycle-accurate simulation using
gem5. We also validate the model against hardware,
using the ARM Cortex-A8 processor and show that it is
accurate within 10\% on average. We further demonstrate
the usefulness of the model through three case studies:
(1) design space exploration, identifying the optimum
number of functional units for achieving a given
performance target; (2) program--machine interactions,
providing insight into microarchitecture bottlenecks;
and (3) compiler--architecture interactions,
visualizing the impact of compiler optimizations on
performance.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "50",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
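
For readers who want the flavor of the mechanistic model in the
Breugh et al. entry above: interval-style mechanistic models
generically take the schematic form

\[
  C \;\approx\; \frac{N}{W_{\mathrm{eff}}} + \sum_{e} m_e \, p_e ,
\]

where $C$ is the cycle count, $N$ the dynamic instruction count,
$W_{\mathrm{eff}}$ the effective issue width after interinstruction
dependences and functional-unit contention, and $m_e$, $p_e$ the
occurrence count and average penalty of each miss event $e$ (cache
misses, branch mispredictions, and so on). This is only the generic
shape of such models, not the paper's exact formulation.
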
@Article{Seshadri:2015:MPC,
author = "Vivek Seshadri and Samihan Yedkar and Hongyi Xin and
Onur Mutlu and Phillip B. Gibbons and Michael A. Kozuch
and Todd C. Mowry",
title = "Mitigating Prefetcher-Caused Pollution Using Informed
Caching Policies for Prefetched Blocks",
journal = j-TACO,
volume = "11",
number = "4",
pages = "51:1--51:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2677956",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Many modern high-performance processors prefetch
blocks into the on-chip cache. Prefetched blocks can
potentially pollute the cache by evicting more useful
blocks. In this work, we observe that both accurate and
inaccurate prefetches lead to cache pollution, and
propose a comprehensive mechanism to mitigate
prefetcher-caused cache pollution. First, we observe
that over 95\% of useful prefetches in a wide variety
of applications are not reused after the first demand
hit (in secondary caches). Based on this observation,
our first mechanism simply demotes a prefetched block
to the lowest priority on a demand hit. Second, to
address pollution caused by inaccurate prefetches, we
propose a self-tuning prefetch accuracy predictor to
predict if a prefetch is accurate or inaccurate. Only
predicted-accurate prefetches are inserted into the
cache with a high priority. Evaluations show that our
final mechanism, which combines these two ideas,
significantly improves performance compared to both the
baseline LRU policy and two state-of-the-art approaches
to mitigating prefetcher-caused cache pollution (up to
49\%, and 6\% on average for 157 two-core
multiprogrammed workloads). The performance improvement
is consistent across a wide variety of system
configurations.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "51",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
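
The two caching rules described in the Seshadri et al. abstract
above can be captured in a few lines. The C sketch below
(illustrative names, not the authors' implementation) demotes a
prefetched block on its first demand hit and inserts
predicted-inaccurate prefetches at low priority:

#include <stdbool.h>

enum { PRIO_LOWEST = 0, PRIO_HIGHEST = 7 };

typedef struct {
    bool prefetched;   /* brought in by the prefetcher, not yet demanded */
    int  prio;         /* replacement priority; victim = lowest */
} Block;

/* On inserting a prefetched block: trust the accuracy predictor. */
void insert_prefetch(Block *b, bool predicted_accurate) {
    b->prefetched = true;
    b->prio = predicted_accurate ? PRIO_HIGHEST : PRIO_LOWEST;
}

/* On a demand hit: most useful prefetches are dead after this first
   hit, so demote instead of promoting. */
void on_demand_hit(Block *b) {
    if (b->prefetched) {
        b->prio = PRIO_LOWEST;   /* rule (1): demote on first hit */
        b->prefetched = false;
    } else {
        b->prio = PRIO_HIGHEST;  /* ordinary promotion for demand data */
    }
}
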
@Article{Matheou:2015:ASD,
author = "George Matheou and Paraskevas Evripidou",
title = "Architectural Support for Data-Driven Execution",
journal = j-TACO,
volume = "11",
number = "4",
pages = "52:1--52:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686874",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The exponential growth of sequential processors has
come to an end, and thus, parallel processing is
probably the only way to achieve performance growth. We
propose the development of parallel architectures based
on data-driven scheduling. Data-driven scheduling
enforces only a partial ordering as dictated by the
true data dependencies, which is the minimum
synchronization possible. This is very beneficial for
parallel processing because it exposes the maximum
possible parallelism. We provide
architectural support for data-driven execution for the
Data-Driven Multithreading (DDM) model. In the past,
DDM has been evaluated mostly in the form of virtual
machines. The main contribution of this work is the
development of a highly efficient hardware support for
data-driven execution and its integration into a
multicore system with eight cores on a Virtex-6 FPGA.
The DDM semantics make barriers and cache coherence
unnecessary, which reduces the synchronization
latencies significantly and makes the cache simpler.
The performance evaluation has shown that the support
for data-driven execution is very efficient with
negligible overheads. Our prototype can support very
small problem sizes (matrix $ 16 \times 16$) and
ultra-lightweight threads (block of $ 4 \times 4$) that
achieve speedups close to linear. Such results cannot
be achieved by software-based systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "52",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Morad:2015:GSP,
author = "Amir Morad and Leonid Yavits and Ran Ginosar",
title = "{GP--SIMD} Processing-in-Memory",
journal = j-TACO,
volume = "11",
number = "4",
pages = "53:1--53:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686875",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "GP-SIMD, a novel hybrid general-purpose SIMD computer
architecture, resolves the issue of data
synchronization by in-memory computing through
combining data storage and massively parallel
processing. GP-SIMD employs a two-dimensional access
memory with modified SRAM storage cells and a
bit-serial processing unit per memory row. An
analytic performance model of the GP-SIMD architecture
is presented, comparing it to associative processor and
to conventional SIMD architectures. Cycle-accurate
simulation of four workloads supports the analytical
comparison. Assuming a moderate die area, GP-SIMD
architecture outperforms both the associative processor
and conventional SIMD coprocessor architectures by
almost an order of magnitude while consuming less
power.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "53",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Schaub:2015:ISW,
author = "Thomas Schaub and Simon Moll and Ralf Karrenberg and
Sebastian Hack",
title = "The Impact of the {SIMD} Width on Control-Flow and
Memory Divergence",
journal = j-TACO,
volume = "11",
number = "4",
pages = "54:1--54:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687355",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Power consumption is a prevalent issue in current and
future computing systems. SIMD processors amortize the
power consumption of managing the instruction stream by
executing the same instruction in parallel on multiple
data. Therefore, in the past years, the SIMD width has
steadily increased, and it is not unlikely that it will
continue to do so. In this article, we experimentally
study the influence of the SIMD width on the execution
of data-parallel programs. We investigate how an
increasing SIMD width (up to 1024) influences
control-flow divergence and memory-access divergence,
and how well techniques to mitigate them will work on
larger SIMD widths. We perform our study on 76 OpenCL
applications and show that a group of programs scales
well up to SIMD width 1024, whereas another group of
programs increasingly suffers from control-flow
divergence. For those programs, thread regrouping
techniques may become increasingly important for larger
SIMD widths. We show what average speedups can be
expected when increasing the SIMD width. For example,
when switching from scalar execution to SIMD width 64,
one can expect a speedup of 60.11, which increases to
62.46 when using thread regrouping. We also analyze the
frequency of regular (uniform, consecutive) memory
access patterns and observe a monotonic decrease of
regular memory accesses from 82.6\% at SIMD width 4 to
43.1\% at SIMD width 1024.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "54",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Fang:2015:MMD,
author = "Zhenman Fang and Sanyam Mehta and Pen-Chung Yew and
Antonia Zhai and James Greensky and Gautham Beeraka and
Binyu Zang",
title = "Measuring Microarchitectural Details of Multi- and
Many-Core Memory Systems through Microbenchmarking",
journal = j-TACO,
volume = "11",
number = "4",
pages = "55:1--55:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687356",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "As multicore and many-core architectures evolve, their
memory systems are becoming increasingly complex.
To bridge the latency and bandwidth gap between the
processor and memory, they often use a mix of
multilevel private/shared caches that are either
blocking or nonblocking and are connected by a
high-speed network-on-chip. Moreover, they also incorporate
hardware and software prefetching and simultaneous
multithreading (SMT) to hide memory latency. On such
multi- and many-core systems, to incorporate various
memory optimization schemes using compiler
optimizations and performance tuning techniques, it is
crucial to have microarchitectural details of the
target memory system. Unfortunately, such details are
often unavailable from vendors, especially for newly
released processors. In this article, we propose a
novel microbenchmarking methodology based on short
elapsed-time events (SETEs) to obtain comprehensive
memory microarchitectural details in multi- and
many-core processors. This approach requires detailed
analysis of potential interfering factors that could
affect the intended behavior of such memory systems. We
lay out effective guidelines to control and mitigate
those interfering factors. Taking the impact of SMT
into consideration, our proposed methodology not only
can measure traditional cache/memory latency and
off-chip bandwidth but also can uncover the details of
software and hardware prefetching units not attempted
in previous studies. Using the newly released Intel
Xeon Phi many-core processor (with in-order cores) as
an example, we show how we can use a set of
microbenchmarks to determine various microarchitectural
features of its memory system (many are undocumented
from vendors). To demonstrate the portability and
validate the correctness of such a methodology, we use
the well-documented Intel Sandy Bridge multicore
processor (with out-of-order cores) as another example,
where most data are available and can be validated.
Moreover, to illustrate the usefulness of the measured
data, we do a multistage coordinated data prefetching
case study on both Xeon Phi and Sandy Bridge and show
that by using the measured data, we can achieve 1.3X
and 1.08X performance speedup, respectively, compared
to the state-of-the-art Intel ICC compiler. We believe
that these measurements also provide useful insights
into memory optimization, analysis, and modeling of
such multicore and many-core architectures.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "55",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
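
Latency microbenchmarks of the kind used by Fang et al. above are
typically built on a dependent-load (pointer-chasing) kernel. A
generic C sketch follows; it illustrates the measurement principle
only, not the paper's SETE methodology:

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define N (1 << 20)          /* working-set size in pointers */

int main(void) {
    void **ring = malloc(N * sizeof *ring);
    /* Build one big cycle; a real benchmark would shuffle the order
       to defeat hardware prefetchers. */
    for (size_t i = 0; i < N; i++)
        ring[i] = &ring[(i + 1) % N];

    struct timespec t0, t1;
    void **p = ring;
    clock_gettime(CLOCK_MONOTONIC, &t0);
    for (size_t i = 0; i < N; i++)
        p = *p;              /* each load depends on the previous one */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
    printf("%.1f ns per dependent load (%p)\n", ns / N, (void *)p);
    free(ring);
    return 0;
}
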
@Article{Chi:2015:LPH,
author = "Chi Ching Chi and Mauricio Alvarez-Mesa and Ben
Juurlink",
title = "Low-Power High-Efficiency Video Decoding using
General-Purpose Processors",
journal = j-TACO,
volume = "11",
number = "4",
pages = "56:1--56:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2685551",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "In this article, we investigate how code optimization
techniques and low-power states of general-purpose
processors improve the power efficiency of HEVC
decoding. The power and performance efficiency of the
use of SIMD instructions, multicore architectures, and
low-power active and idle states are analyzed in detail
for offline video decoding. In addition, the power
efficiency of techniques such as ``race to idle'' and
``exploiting slack'' with DVFS are evaluated for
real-time video decoding. Results show that
``exploiting slack'' is more power efficient than
``race to idle'' for all evaluated platforms
representing smartphone, tablet, laptop, and desktop
computing systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "56",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
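
The ``race to idle'' versus ``exploiting slack'' comparison in the
Chi et al. entry above reduces to simple energy arithmetic per
frame. The C sketch below uses made-up power numbers purely for
illustration:

#include <stdio.h>

int main(void) {
    double D = 33.3e-3;        /* frame deadline: 30 fps */
    double work = 20e-3;       /* decode time at the highest frequency */
    double p_fast = 4.0;       /* active power at f_max (W), assumed */
    double p_slow = 1.8;       /* active power at the slack-fitting
                                  frequency (W), assumed: power drops
                                  superlinearly with voltage/frequency */
    double p_idle = 0.2;       /* idle-state power (W), assumed */

    double race  = p_fast * work + p_idle * (D - work);
    double slack = p_slow * D; /* stretched to exactly meet the deadline */
    printf("race-to-idle: %.1f mJ, exploit-slack: %.1f mJ per frame\n",
           race * 1e3, slack * 1e3);
    return 0;
}
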
@Article{Luporini:2015:CLO,
author = "Fabio Luporini and Ana Lucia Varbanescu and Florian
Rathgeber and Gheorghe-Teodor Bercea and J. Ramanujam
and David A. Ham and Paul H. J. Kelly",
title = "Cross-Loop Optimization of Arithmetic Intensity for
Finite Element Local Assembly",
journal = j-TACO,
volume = "11",
number = "4",
pages = "57:1--57:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687415",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "We study and systematically evaluate a class of
composable code transformations that improve arithmetic
intensity in local assembly operations, which represent
a significant fraction of the execution time in finite
element methods. Their performance optimization is
indeed a challenging issue. Even though affine loop
nests are generally present, the short trip counts and
the complexity of mathematical expressions, which vary
among different problems, make it hard to determine an
optimal sequence of successful transformations. Our
investigation has resulted in the implementation of a
compiler (called COFFEE) for local assembly kernels,
fully integrated with a framework for developing finite
element methods. The compiler manipulates abstract
syntax trees generated from a domain-specific language
by introducing domain-aware optimizations for
instruction-level parallelism and register locality.
Eventually, it produces C code including vector SIMD
intrinsics. Experiments using a range of real-world
finite element problems of increasing complexity show
that significant performance improvement is achieved.
The generality of the approach and the applicability of
the proposed code transformations to other domains are
also discussed.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "57",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Zhou:2015:OPS,
author = "Xing Zhou and Mar{\'\i}a J. Garzar{\'a}n and David A.
Padua",
title = "Optimal Parallelogram Selection for Hierarchical
Tiling",
journal = j-TACO,
volume = "11",
number = "4",
pages = "58:1--58:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687414",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Loop tiling is an effective optimization to improve
performance of multiply nested loops, which are the
most time-consuming parts in many programs. Most
massively parallel systems today are organized
hierarchically, and different levels of the hierarchy
differ in the organization of parallelism and the
memory models they adopt. To make better use of these
machines, it is clear that loop nests should be tiled
hierarchically to fit the hierarchical organization of
the machine; however, it is not so clear what should be
the exact form of these hierarchical tiles. In
particular, tile shape selection is of critical
importance to expose parallelism of the tiled loop
nests. Although loop tiling is a well-known
optimization, not much is known about tile shape
selection. In this article, we study tile shape
selection when the shapes are any type of
parallelograms and introduce a model to relate the tile
shape of the hierarchy to the execution time. Using
this model, we implement a system that automatically
finds the tile shapes that minimize the execution time
in a hierarchical system. Our experimental results show
that in several cases, the tiles automatically selected
by our system outperform the most intuitive tiling
schemes usually adopted by programmers because of their
simplicity.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "58",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Porter:2015:MMS,
author = "Leo Porter and Michael A. Laurenzano and Ananta Tiwari
and Adam Jundt and William A. {Ward, Jr.} and Roy
Campbell and Laura Carrington",
title = "Making the Most of {SMT} in {HPC}: System- and
Application-Level Perspectives",
journal = j-TACO,
volume = "11",
number = "4",
pages = "59:1--59:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687651",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "This work presents an end-to-end methodology for
quantifying the performance and power benefits of
simultaneous multithreading (SMT) for HPC centers and
applies this methodology to a production system and
workload. Ultimately, SMT's value system-wide depends
on whether users effectively employ SMT at the
application level. However, predicting SMT's benefit
for HPC applications is challenging; by doubling the
number of threads, the application's characteristics
may change. This work proposes statistical modeling
techniques to predict the speedup SMT confers to HPC
applications. This approach, accurate to within 8\%,
uses only lightweight, transparent performance monitors
collected during a single run of the application.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "59",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Tong:2015:OMT,
author = "Xin Tong and Toshihiko Koju and Motohiro Kawahito and
Andreas Moshovos",
title = "Optimizing Memory Translation Emulation in Full System
Emulators",
journal = j-TACO,
volume = "11",
number = "4",
pages = "60:1--60:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2686034",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The emulation speed of a full system emulator (FSE)
determines its usefulness. This work quantitatively
measures where time is spent in QEMU [Bellard 2005], an
industrial-strength FSE. The analysis finds that memory
emulation is one of the most heavily exercised emulator
components. For workloads studied, 38.1\% of the
emulation time is spent in memory emulation on average,
even though QEMU implements a software translation
lookaside buffer (STLB) to accelerate dynamic address
translation. Despite the amount of time spent in memory
emulation, there has been no study on how to further
improve its speed. This work analyzes where time is
spent in memory emulation and studies the performance
impact of a number of STLB optimizations. Although
there are several performance optimization techniques
for hardware TLBs, this work finds that the trade-offs
with an STLB are quite different compared to those with
hardware TLBs. As a result, not all hardware TLB
performance optimization techniques are applicable to
STLBs and vice versa. The evaluated STLB optimizations
target STLB lookups, as well as refills, and result in
an average emulator performance improvement of 24.4\%
over the baseline.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "60",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
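
A software TLB of the kind discussed by Tong et al. above is
essentially a small direct-mapped table consulted before the full
page-table walk. The following C sketch shows the generic
lookup/refill structure (assumed names and sizes; not QEMU's actual
code):

#include <stdint.h>
#include <stdio.h>

#define STLB_BITS 8
#define STLB_SIZE (1u << STLB_BITS)
#define PAGE_BITS 12
#define PAGE_MASK ((uint64_t)((1u << PAGE_BITS) - 1))

typedef struct {
    uint64_t vpage;   /* guest virtual page tag */
    uint64_t hoffset; /* host_addr = guest_addr + hoffset for this page */
} STLBEntry;

static STLBEntry stlb[STLB_SIZE]; /* real designs track validity too */

/* Stand-in for a guest page-table walk; identity map for the sketch. */
static uint64_t slow_translate(uint64_t vaddr) { return vaddr; }

static uint64_t translate(uint64_t vaddr) {
    uint64_t vpage = vaddr & ~PAGE_MASK;
    STLBEntry *e = &stlb[(vaddr >> PAGE_BITS) & (STLB_SIZE - 1)];
    if (e->vpage == vpage)            /* fast path: STLB hit */
        return vaddr + e->hoffset;
    /* slow path: walk page tables, then refill this entry */
    uint64_t haddr = slow_translate(vaddr);
    e->vpage = vpage;
    e->hoffset = haddr - vaddr;
    return haddr;
}

int main(void) {
    printf("0x1234 -> 0x%llx\n", (unsigned long long)translate(0x1234));
    return 0;
}
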
@Article{Kong:2015:CRF,
author = "Martin Kong and Antoniu Pop and Louis-No{\"e}l Pouchet
and R. Govindarajan and Albert Cohen and P.
Sadayappan",
title = "Compiler\slash Runtime Framework for Dynamic Dataflow
Parallelization of Tiled Programs",
journal = j-TACO,
volume = "11",
number = "4",
pages = "61:1--61:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687652",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Task-parallel languages are increasingly popular. Many
of them provide expressive mechanisms for intertask
synchronization. For example, OpenMP 4.0 will integrate
data-driven execution semantics derived from the StarSs
research language. Compared to the more restrictive
data-parallel and fork-join concurrency models, the
advanced features being introduced into task-parallel
models in turn enable improved scalability through load
balancing, memory latency hiding, mitigation of the
pressure on memory bandwidth, and, as a side effect,
reduced power consumption. In this article, we develop
a systematic approach to compile loop nests into
concurrent, dynamically constructed graphs of dependent
tasks. We propose a simple and effective heuristic that
selects the most profitable parallelization idiom for
every dependence type and communication pattern. This
heuristic enables the extraction of interband
parallelism (cross-barrier parallelism) in a number of
numerical computations that range from linear algebra
to structured grids and image processing. The proposed
static analysis and code generation alleviates the
burden of a full-blown dependence resolver to track the
readiness of tasks at runtime. We evaluate our approach
and algorithms in the PPCG compiler, targeting
OpenStream, a representative dataflow task-parallel
language with explicit intertask dependences and a
lightweight runtime. Experimental results demonstrate
the effectiveness of the approach.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "61",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Melot:2015:FCS,
author = "Nicolas Melot and Christoph Kessler and J{\"o}rg
Keller and Patrick Eitschberger",
title = "Fast Crown Scheduling Heuristics for Energy-Efficient
Mapping and Scaling of Moldable Streaming Tasks on
Manycore Systems",
journal = j-TACO,
volume = "11",
number = "4",
pages = "62:1--62:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2687653",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Exploiting effectively massively parallel
architectures is a major challenge that stream
programming can help facilitate. We investigate the
problem of generating energy-optimal code for a
collection of streaming tasks that include
parallelizable or moldable tasks on a generic manycore
processor with dynamic discrete frequency scaling.
Streaming task collections differ from classical task
sets in that all tasks are running concurrently, so
that cores typically run several tasks that are
scheduled round-robin at user level in a data-driven
way. A stream of data flows through the tasks and
intermediate results may be forwarded to other tasks,
as in a pipelined task graph. In this article, we
consider crown scheduling, a novel technique for the
combined optimization of resource allocation, mapping,
and discrete voltage/frequency scaling for moldable
streaming task collections in order to optimize energy
efficiency given a throughput constraint. We first
present optimal offline algorithms for separate and
integrated crown scheduling based on integer linear
programming (ILP). We make no restricting assumption
about speedup behavior. We introduce the fast heuristic
Longest Task, Lowest Group (LTLG) as a generalization
of the Longest Processing Time (LPT) algorithm to
achieve a load-balanced mapping of parallel tasks, and
the Height heuristic for crown frequency scaling. We
use them in feedback loop heuristics based on binary
search and simulated annealing to optimize crown
allocation. Our experimental evaluation of the ILP
models for a generic manycore architecture shows that
at least for small and medium-sized streaming task
collections even the integrated variant of crown
scheduling can be solved to optimality by a
state-of-the-art ILP solver within a few seconds. Our
heuristics produce makespan and energy consumption
close to optimality within the limits of the
phase-separated crown scheduling technique and the
crown structure. Their optimization time is longer than
that of the other algorithms we test, but our heuristics
consistently produce better solutions.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "62",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
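
The LTLG heuristic in the Melot et al. entry above generalizes the
classic Longest Processing Time (LPT) rule. As background, here is a
minimal C sketch of plain LPT list scheduling (toy task weights; not
the crown-scheduling algorithm itself):

#include <stdio.h>
#include <stdlib.h>

static int desc(const void *a, const void *b) {
    double x = *(const double *)a, y = *(const double *)b;
    return (x < y) - (x > y);   /* sort by decreasing weight */
}

int main(void) {
    double work[] = {7, 5, 4, 3, 3, 2};   /* task weights */
    const int ntasks = 6, ncores = 3;
    double load[3] = {0};

    qsort(work, ntasks, sizeof work[0], desc);
    for (int t = 0; t < ntasks; t++) {
        int best = 0;                      /* least-loaded core so far */
        for (int c = 1; c < ncores; c++)
            if (load[c] < load[best]) best = c;
        load[best] += work[t];
        printf("task %d (w=%.0f) -> core %d\n", t, work[t], best);
    }
    printf("makespan = %.0f\n",
           load[0] > load[1] ? (load[0] > load[2] ? load[0] : load[2])
                             : (load[1] > load[2] ? load[1] : load[2]));
    return 0;
}
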
@Article{Ruan:2015:TRM,
author = "Wenjia Ruan and Yujie Liu and Michael Spear",
title = "Transactional Read-Modify-Write Without Aborts",
journal = j-TACO,
volume = "11",
number = "4",
pages = "63:1--63:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2688904",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Language-level transactions are said to provide
``atomicity,'' implying that the order of operations
within a transaction should be invisible to concurrent
transactions and thus that independent operations
within a transaction should be safe to execute in any
order. In this article, we present a mechanism for
dynamically reordering memory operations within a
transaction so that read-modify-write operations on
highly contended locations can be delayed until the
very end of the transaction. When integrated with
traditional transactional conflict detection
mechanisms, our approach reduces aborts on hot memory
locations, such as statistics counters, thereby
improving throughput and reducing wasted work. We
present three algorithms for delaying highly contended
read-modify-write operations within transactions, and
we evaluate their impact on throughput for eager and
lazy transactional systems across multiple workloads.
We also discuss complications that arise from the
interaction between our mechanism and the need for
strong language-level semantics, and we propose
algorithmic extensions that prevent errors from
occurring when accesses are aggressively reordered in a
transactional memory implementation with weak
semantics.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "63",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
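
The core mechanism in the Ruan et al. abstract above, deferring
read-modify-write operations on hot locations to commit time, can be
sketched compactly. In the C fragment below (illustrative, with
assumed names), the transaction logs a delta instead of reading the
counter, so the counter never enters its read set:

#include <stdatomic.h>

typedef struct {
    atomic_long *hot;   /* contended shared counter */
    long pending;       /* delta accumulated inside the transaction */
} Txn;

void txn_begin(Txn *t, atomic_long *hot) { t->hot = hot; t->pending = 0; }

/* Inside the transaction: do not read the counter, just log the RMW. */
void txn_counter_add(Txn *t, long delta) { t->pending += delta; }

/* At commit, after validating the ordinary read/write sets, apply the
   deferred RMW once; counter conflicts no longer cause aborts. */
void txn_commit(Txn *t) { atomic_fetch_add(t->hot, t->pending); }
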
@Article{UlHuda:2015:UTM,
author = "Zia {Ul Huda} and Ali Jannesari and Felix Wolf",
title = "Using Template Matching to Infer Parallel Design
Patterns",
journal = j-TACO,
volume = "11",
number = "4",
pages = "64:1--64:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2688905",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "The triumphant spread of multicore processors over the
past decade has increased the pressure on software
developers to exploit the growing amount of parallelism
available in the hardware. However, writing parallel
programs is generally challenging. For sequential
programs, the formulation of design patterns marked a
turning point in software development, boosting
programmer productivity and leading to more reusable
and maintainable code. While the literature is now also
reporting a rising number of parallel design patterns,
programmers confronted with the task of parallelizing
an existing sequential program still struggle with the
question of which parallel pattern to apply where in
their code. In this article, we show how template
matching, a technique traditionally used in the
discovery of sequential design patterns, can also be
used to support parallelization decisions. After
looking for matches in a previously extracted dynamic
dependence graph, we classify code blocks of the input
program according to the structure of the parallel
patterns we find. Based on this information, the
programmer can easily implement the detected pattern
and create a parallel version of his or her program. We
tested our approach with six programs, in which we
successfully detected pipeline and do-all patterns.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "64",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Litz:2015:ECA,
author = "Heiner Litz and Ricardo J. Dias and David R.
Cheriton",
title = "Efficient Correction of Anomalies in Snapshot
Isolation Transactions",
journal = j-TACO,
volume = "11",
number = "4",
pages = "65:1--65:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2693260",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Transactional memory systems providing snapshot
isolation enable concurrent access to shared data
without incurring aborts on read-write conflicts.
Reducing aborts is extremely relevant as it leads to
higher concurrency, greater performance, and better
predictability. Unfortunately, snapshot isolation does
not provide serializability as it allows certain
anomalies that can lead to subtle consistency
violations. While some mechanisms have been proposed to
verify the correctness of a program utilizing snapshot
isolation transactions, it remains difficult to repair
incorrect applications. To reduce the programmer's
burden in this case, we present a technique based on
dynamic code and graph dependency analysis that
automatically corrects existing snapshot isolation
anomalies in transactional memory programs. Our
evaluation shows that corrected applications retain the
performance benefits characteristic of snapshot
isolation over conventional transactional memory
systems.",
acknowledgement = ack-nhfb,
ajournal = "ACM Trans. Archit. Code Optim.",
articleno = "65",
fjournal = "ACM Transactions on Architecture and Code Optimization
(TACO)",
journal-URL = "https://dl.acm.org/loi/taco",
}
@Article{Bahmann:2015:PRC,
author = "Helge Bahmann and Nico Reissmann and Magnus Jahre and
Jan Christian Meyer",
title = "Perfect Reconstructability of Control Flow from Demand
Dependence Graphs",
journal = j-TACO,
volume = "11",
number = "4",
pages = "66:1--66:??",
month = jan,
year = "2015",
CODEN = "????",
DOI = "https://doi.org/10.1145/2693261",
ISSN = "1544-3566 (print), 1544-3973 (electronic)",
ISSN-L = "1544-3566",
bibdate = "Mon Jan 12 11:38:56 MST 2015",
bibsource = "https://www.math.utah.edu/pub/tex/bib/taco.bib",
abstract = "Demand-based dependence graphs (DDGs), such as the
(Regionalized) Value State Dependence Graph ((R)VSDG),
are intermediate representations (IRs) well suited for
a wide range of program transformations. They
explicitly model the flow of data and state, and only
implicitly represent a restricted form of control flow.
These features make DDGs especially suitable for
automatic parallelization and vectorization, but cannot
be leveraged by practical compilers without efficient
construction and destruction algorithms. Construction
algorithms remodel the arbitrarily complex control flow
of a procedure to make it amenable to DDG
representation, whereas destruction algorithms
reestablish control flow for generating efficient
object code. Existing literature presents solutions to
both problems, but these impose structural constraints
on the generatable control flow, and omit qualitative
evaluation. The key contribution of this article is to
show that there is no intrinsic structural limitation
in the cont