%%% -*-BibTeX-*- %%% ==================================================================== %%% BibTeX-file{ %%% author = "Nelson H. F. Beebe", %%% version = "1.19", %%% date = "01 October 2009", %%% time = "09:23:47 MDT", %%% filename = "taco.bib", %%% address = "University of Utah %%% Department of Mathematics, 110 LCB %%% 155 S 1400 E RM 233 %%% Salt Lake City, UT 84112-0090 %%% USA", %%% telephone = "+1 801 581 5254", %%% FAX = "+1 801 581 4148", %%% URL = "http://www.math.utah.edu/~beebe", %%% checksum = "35797 3646 18697 171428", %%% email = "beebe at math.utah.edu, beebe at acm.org, %%% beebe at computer.org (Internet)", %%% codetable = "ISO/ASCII", %%% keywords = "ACM Transactions on Architecture and Code %%% Optimization; bibliography; TACO", %%% license = "public domain", %%% supported = "yes", %%% docstring = "This is a COMPLETE BibTeX bibliography for %%% ACM Transactions on Architecture and Code %%% Optimization (CODEN ????, ISSN 1544-3566), %%% covering all journal issues from 2004 -- %%% date. %%% %%% At version 1.19, the COMPLETE journal %%% coverage looked like this: %%% %%% 2004 ( 17) 2006 ( 19) 2008 ( 21) %%% 2005 ( 17) 2007 ( 19) 2009 ( 16) %%% %%% Article: 109 %%% %%% Total entries: 109 %%% %%% The journal Web page can be found at: %%% %%% http://www.acm.org/pubs/taco.html %%% %%% The journal table of contents page is at: %%% %%% http://www.acm.org/taco/ %%% http://portal.acm.org/browse_dl.cfm?idx=J924 %%% %%% Qualified subscribers can retrieve the full %%% text of recent articles in PDF form. %%% %%% The initial draft was extracted from the ACM %%% Web pages. %%% %%% ACM copyrights explicitly permit abstracting %%% with credit, so article abstracts, keywords, %%% and subject classifications have been %%% included in this bibliography wherever %%% available. Article reviews have been %%% omitted, until their copyright status has %%% been clarified. %%% %%% bibsource keys in the bibliography entries %%% below indicate the entry originally came %%% from the computer science bibliography %%% archive, even though it has likely since %%% been corrected and updated. %%% %%% URL keys in the bibliography point to %%% World Wide Web locations of additional %%% information about the entry. %%% %%% BibTeX citation tags are uniformly chosen %%% as name:year:abbrev, where name is the %%% family name of the first author or editor, %%% year is a 4-digit number, and abbrev is a %%% 3-letter condensation of important title %%% words. Citation tags were automatically %%% generated by software developed for the %%% BibNet Project. %%% %%% In this bibliography, entries are sorted in %%% publication order, using ``bibsort -byvolume.'' %%% %%% The checksum field above contains a CRC-16 %%% checksum as the first value, followed by the %%% equivalent of the standard UNIX wc (word %%% count) utility output of lines, words, and %%% characters. This is produced by Robert %%% Solovay's checksum utility." %%% } %%% ==================================================================== @Preamble{"\input bibnames.sty" # "\def \TM {${}^{\sc TM}$}" } %%% ==================================================================== %%% Acknowledgement abbreviations: @String{ack-nhfb = "Nelson H. F. Beebe, University of Utah, Department of Mathematics, 110 LCB, 155 S 1400 E RM 233, Salt Lake City, UT 84112-0090, USA, Tel: +1 801 581 5254, FAX: +1 801 581 4148, e-mail: \path|beebe@math.utah.edu|, \path|beebe@acm.org|, \path|beebe@computer.org| (Internet), URL: \path|http://www.math.utah.edu/~beebe/|"} %%% ==================================================================== %%% Journal abbreviations: @String{j-TACO = "ACM Transactions on Architecture and Code Optimization"} %%% ==================================================================== %%% Bibliography entries: @Article{Calder:2004:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "1", number = "1", pages = "1--2", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2004:RIC, author = "W. Zhang and J. S. Hu and V. Degalahal and M. Kandemir and N. Vijaykrishnan and M. J. Irwin", title = "Reducing instruction cache energy consumption using a compiler-based strategy", journal = j-TACO, volume = "1", number = "1", pages = "3--33", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Isailovic:2004:DCQ, author = "Nemanja Isailovic and Mark Whitney and Yatish Patel and John Kubiatowicz and Dean Copsey and Frederic T. Chong and Isaac L. Chuang and Mark Oskin", title = "Datapath and control for quantum wires", journal = j-TACO, volume = "1", number = "1", pages = "34--61", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sankaralingam:2004:TPA, author = "Karthikeyan Sankaralingam and Ramadass Nagarajan and Haiming Liu and Changkyu Kim and Jaehyuk Huh and Nitya Ranganathan and Doug Burger and Stephen W. Keckler and Robert G. McDonald and Charles R. Moore", title = "{TRIPS}: {A} polymorphous architecture for exploiting {ILP}, {TLP}, and {DLP}", journal = j-TACO, volume = "1", number = "1", pages = "62--93", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Skadron:2004:TAM, author = "Kevin Skadron and Mircea R. Stan and Karthik Sankaranarayanan and Wei Huang and Sivakumar Velusamy and David Tarjan", title = "Temperature-aware microarchitecture: {Modeling} and implementation", journal = j-TACO, volume = "1", number = "1", pages = "94--125", month = mar, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:09 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Aleta:2004:RCC, author = "Alex Alet{\`a} and Josep M. Codina and Antonio Gonz{\'a}lez and David Kaeli", title = "Removing communications in clustered microarchitectures through instruction replication", journal = j-TACO, volume = "1", number = "2", pages = "127--151", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bai:2004:LPO, author = "Yu Bai and R. Iris Bahar", title = "A low-power in-order\slash out-of-order issue queue", journal = j-TACO, volume = "1", number = "2", pages = "152--179", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Juang:2004:IBP, author = "Philo Juang and Kevin Skadron and Margaret Martonosi and Zhigang Hu and Douglas W. Clark and Philip W. Diodato and Stefanos Kaxiras", title = "Implementing branch-predictor decay using quasi-static memory cells", journal = j-TACO, volume = "1", number = "2", pages = "180--219", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Santana:2004:LCF, author = "Oliverio J. Santana and Alex Ramirez and Josep L. Larriba-Pey and Mateo Valero", title = "A low-complexity fetch architecture for high-performance superscalar processors", journal = j-TACO, volume = "1", number = "2", pages = "220--245", month = jun, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Aug 5 07:08:10 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Lin:2004:CFS, author = "Jin Lin and Tong Chen and Wei-Chung Hsu and Pen-Chung Yew and Roy Dz-Ching Ju and Tin-Fook Ngai and Sun Chan", title = "A compiler framework for speculative optimizations", journal = j-TACO, volume = "1", number = "3", pages = "247--271", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Fields:2004:ICS, author = "Brian A. Fields and Rastislav Bodik and Mark D. Hill and Chris J. Newburn", title = "Interaction cost and shotgun profiling", journal = j-TACO, volume = "1", number = "3", pages = "272--304", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sankaranarayanan:2004:PBA, author = "Karthik Sankaranarayanan and Kevin Skadron", title = "Profile-based adaptation for cache decay", journal = j-TACO, volume = "1", number = "3", pages = "305--322", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Xie:2004:IDV, author = "Fen Xie and Margaret Martonosi and Sharad Malik", title = "Intraprogram dynamic voltage scaling: {Bounding} opportunities with analytic modeling", journal = j-TACO, volume = "1", number = "3", pages = "323--367", month = sep, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Oct 29 06:39:45 MDT 2004", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Hartstein:2004:OPD, author = "A. Hartstein and Thomas R. Puzak", title = "The optimum pipeline depth considering both power and performance", journal = j-TACO, volume = "1", number = "4", pages = "369--388", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Cristal:2004:TKI, author = "Adri{\'a}n Cristal and Oliverio J. Santana and Mateo Valero and Jos{\'e} F. Mart{\'\i}nez", title = "Toward kilo-instruction processors", journal = j-TACO, volume = "1", number = "4", pages = "389--417", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Akkary:2004:ARE, author = "Haitham Akkary and Ravi Rajwar and Srikanth T. Srinivasan", title = "An analysis of a resource efficient checkpoint architecture", journal = j-TACO, volume = "1", number = "4", pages = "418--444", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Yang:2004:TML, author = "Chia-Lin Yang and Alvin R. Lebeck and Hung-Wei Tseng and Chien-Hao Lee", title = "Tolerating memory latency through push prefetching for pointer-intensive applications", journal = j-TACO, volume = "1", number = "4", pages = "445--475", month = dec, year = "2004", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Apr 14 12:17:47 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Calder:2005:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "2", number = "1", pages = "1--2", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhou:2005:EFA, author = "Yuanyuan Zhou and Pin Zhou and Feng Qin and Wei Liu and Josep Torrellas", title = "Efficient and flexible architectural support for dynamic monitoring", journal = j-TACO, volume = "2", number = "1", pages = "3--33", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2005:WHC, author = "Chuanjun Zhang and Frank Vahid and Jun Yang and Walid Najjar", title = "A way-halting cache for low-energy high-performance systems", journal = j-TACO, volume = "2", number = "1", pages = "34--54", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Abella:2005:ISP, author = "Jaume Abella and Antonio Gonz{\'a}lez and Xavier Vera and Michael F. P. O'Boyle", title = "{IATAC}: a smart predictor to turn-off {L2} cache lines", journal = j-TACO, volume = "2", number = "1", pages = "55--77", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Haskins:2005:AWS, author = "John W. {Haskins, Jr.} and Kevin Skadron", title = "Accelerated warmup for sampled microarchitecture simulation", journal = j-TACO, volume = "2", number = "1", pages = "78--108", month = mar, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Mon May 2 11:13:58 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Li:2005:ABT, author = "Tao Li and Ravi Bhargava and Lizy Kurian John", title = "Adapting branch-target buffer to improve the target predictability of {Java} code", journal = j-TACO, volume = "2", number = "2", pages = "109--130", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2005:DIE, author = "Lingli Zhang and Chandra Krintz", title = "The design, implementation, and evaluation of adaptive code unloading for resource-constrained devices", journal = j-TACO, volume = "2", number = "2", pages = "131--164", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Kulkarni:2005:FES, author = "Prasad A. Kulkarni and Stephen R. Hines and David B. Whalley and Jason D. Hiser and Jack W. Davidson and Douglas L. Jones", title = "Fast and efficient searches for effective optimization-phase sequences", journal = j-TACO, volume = "2", number = "2", pages = "165--198", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Salami:2005:DMI, author = "Esther Salam{\'\i} and Mateo Valero", title = "Dynamic memory interval test vs. interprocedural pointer analysis in multimedia applications", journal = j-TACO, volume = "2", number = "2", pages = "199--219", month = jun, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Jul 7 14:09:53 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Meng:2005:ELL, author = "Yan Meng and Timothy Sherwood and Ryan Kastner", title = "Exploring the limits of leakage power reduction in caches", journal = j-TACO, volume = "2", number = "3", pages = "221--246", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Garzaran:2005:TBS, author = "Mar{\'\i}a Jes{\'u}s Garzar{\'a}n and Milos Prvulovic and Jos{\'e} Mar{\'\i}a Llaber{\'\i}a and V{\'\i}ctor Vi{\~n}als and Lawrence Rauchwerger and Josep Torrellas", title = "Tradeoffs in buffering speculative memory state for thread-level speculation in multiprocessors", journal = j-TACO, volume = "2", number = "3", pages = "247--279", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Tarjan:2005:MPG, author = "David Tarjan and Kevin Skadron", title = "Merging path and gshare indexing in perceptron branch prediction", journal = j-TACO, volume = "2", number = "3", pages = "280--300", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2005:WET, author = "Xiangyu Zhang and Rajiv Gupta", title = "Whole execution traces and their applications", journal = j-TACO, volume = "2", number = "3", pages = "301--334", month = sep, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Wed Oct 5 07:42:22 MDT 2005", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhao:2005:IWA, author = "Wankang Zhao and David Whalley and Christopher Healy and Frank Mueller", title = "Improving {WCET} by applying a {WC} code-positioning optimization", journal = j-TACO, volume = "2", number = "4", pages = "335--365", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, keywords = "WCET (worst case execution time); WC (worst case)", } @Article{Reis:2005:SCF, author = "George A. Reis and Jonathan Chang and Neil Vachharajani and Ram Rangan and David I. August and Shubhendu S. Mukherjee", title = "Software-controlled fault tolerance", journal = j-TACO, volume = "2", number = "4", pages = "366--396", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Li:2005:PPC, author = "Jian Li and Jos{\'e} F. Mart{\'\i}nez", title = "Power-performance considerations of parallel computing on chip multiprocessors", journal = j-TACO, volume = "2", number = "4", pages = "397--422", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sharma:2005:SPE, author = "Saurabh Sharma and Jesse G. Beu and Thomas M. Conte", title = "Spectral prefetcher: {An} effective mechanism for {L2} cache prefetching", journal = j-TACO, volume = "2", number = "4", pages = "423--450", month = dec, year = "2005", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu Feb 16 11:03:13 MST 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Calder:2006:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "3", number = "1", pages = "1--2", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Tan:2006:BSS, author = "Lin Tan and Brett Brotherton and Timothy Sherwood", title = "Bit-split string-matching engines for intrusion detection and prevention", journal = j-TACO, volume = "3", number = "1", pages = "3--34", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Nagpurkar:2006:ERP, author = "Priya Nagpurkar and Hussam Mousa and Chandra Krintz and Timothy Sherwood", title = "Efficient remote profiling for resource-constrained devices", journal = j-TACO, volume = "3", number = "1", pages = "35--66", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Lin:2006:RCG, author = "Jin Lin and Wei-Chung Hsu and Pen-Chung Yew and Roy Dz-Ching Ju and Tin-Fook Ngai", title = "Recovery code generation for general speculative optimizations", journal = j-TACO, volume = "3", number = "1", pages = "67--89", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Choi:2006:ORR, author = "Yoonseo Choi and Hwansoo Han", title = "Optimal register reassignment for register stack overflow minimization", journal = j-TACO, volume = "3", number = "1", pages = "90--114", month = mar, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Thu May 18 08:38:26 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Xue:2006:LOA, author = "Jingling Xue and Qiong Cai", title = "A lifetime optimal algorithm for speculative {PRE}", journal = j-TACO, volume = "3", number = "2", pages = "115--155", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Sharkey:2006:IPT, author = "Joseph J. Sharkey and Dmitry V. Ponomarev and Kanad Ghose and Oguz Ergin", title = "Instruction packing: {Toward} fast and energy-efficient instruction scheduling", journal = j-TACO, volume = "3", number = "2", pages = "156--181", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ceze:2006:CUC, author = "Luis Ceze and Karin Strauss and James Tuck and Josep Torrellas and Jose Renau", title = "{CAVA}: {Using} checkpoint-assisted value prediction to hide {L2} misses", journal = j-TACO, volume = "3", number = "2", pages = "182--208", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhang:2006:EAR, author = "Lixin Zhang and Mike Parker and John Carter", title = "Efficient address remapping in distributed shared-memory systems", journal = j-TACO, volume = "3", number = "2", pages = "209--229", month = jun, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Fri Jun 9 06:47:22 MDT 2006", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Zhao:2006:ATP, author = "Min Zhao and Bruce R. Childers and Mary Lou Soffa", title = "An approach toward profit-driven optimization", journal = j-TACO, volume = "3", number = "3", pages = "231--262", month = sep, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1162690.1162691", ISSN = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "Although optimizations have been applied for a number of years to improve the performance of software, problems with respect to the application of optimizations have not been adequately addressed. For example, in certain circumstances, optimizations may degrade performance. However, there is no efficient way to know when a degradation will occur. In this research, we investigate the profitability of optimizations, which is useful for determining the benefit of applying optimizations. We develop a framework that enables us to predict profitability using analytic models. The profitability of an optimization depends on code context, the particular optimization, and machine resources. Thus, our framework has analytic models for each of these components. As part of the framework, there is also a profitability engine that uses models to predict the profit. In this paper, we target scalar optimizations and, in particular, describe the models for partial redundancy elimination (PRE), loop invariant code motion (LICM), and value numbering (VN). We implemented the framework for predicting the profitability of these optimizations. Based on the predictions, we can selectively apply profitable optimizations. We compared the profit-driven approach with an approach that uses a heuristic in deciding when optimizations should be applied. Our experiments demonstrate that the profitability of scalar optimizations can be accurately predicted by using models. That is, without actually applying a scalar optimization, we can determine if an optimization is beneficial and should be applied.", acknowledgement = ack-nhfb, } @Article{Hazelwood:2006:MBC, author = "Kim Hazelwood and Michael D. Smith", title = "Managing bounded code caches in dynamic binary optimization systems", journal = j-TACO, volume = "3", number = "3", pages = "263--294", month = sep, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1162690.1162692", ISSN = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "Dynamic binary optimizers store altered copies of original program instructions in software-managed code caches in order to maximize reuse of transformed code. Code caches store code blocks that may vary in size, reference other code blocks, and carry a high replacement overhead. These unique constraints reduce the effectiveness of conventional cache management policies. Our work directly addresses these unique constraints and presents several contributions to the code-cache management problem. First, we show that evicting more than the minimum number of code blocks from the code cache results in less run-time overhead than the existing alternatives. Such granular evictions reduce overall execution time, as the fixed costs of invoking the eviction mechanism are amortized across multiple cache insertions. Second, a study of the ideal lifetimes of dynamically generated code blocks illustrates the benefit of a replacement algorithm based on a generational heuristic. We describe and evaluate a generational approach to code cache management that makes it easy to identify long-lived code blocks and simultaneously avoid any fragmentation because of the eviction of short-lived blocks. Finally, we present results from an implementation of our generational approach in the DynamoRIO framework and illustrate that, as dynamic optimization systems become more prevalent, effective code cache-management policies will be essential for reliable, scalable performance of modern applications.", acknowledgement = ack-nhfb, } @Article{Rochecouste:2006:CCE, author = "Olivier Rochecouste and Gilles Pokam and Andr{\'e} Seznec", title = "A case for a complexity-effective, width-partitioned microarchitecture", journal = j-TACO, volume = "3", number = "3", pages = "295--326", month = sep, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1162690.1162693", ISSN = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "The analysis of program executions reveals that most integer and multimedia applications make heavy use of narrow-width operations, i.e., instructions exclusively using narrow-width operands and producing a narrow-width result. Moreover, this usage is relatively well distributed over the application. We observed this program property on the MediaBench and SPEC2000 benchmarks with about 40\% of the instructions being narrow-width operations. Current superscalar processors use 64-bit datapaths to execute all the instructions of the applications. In this paper, we suggest the use of a width-partitioned microarchitecture (WPM) to master the hardware complexity of a superscalar processor. For a four-way issue machine, we split the processor in two two-way clusters: the main cluster executing 64-bit operations, load/store, and complex operations and a narrow cluster executing the 16-bit operations. We resort to partitioning to decouple the treatment of the narrow-width operations from that of the other program instructions. This provides the benefit of greatly simplifying the design of the critical processor components in each cluster (e.g., the register file and the bypass network). The dynamic interleaving of the two instruction types allows maintaining the workload balanced among clusters. WPM also helps to reduce the complexity of the interconnection fabric and of the issue logic. In fact, since the 16-bit cluster can only communicate narrow-width data, the datapath-width of the interconnect fabric can be significantly reduced, yielding a corresponding saving of the interconnect power and area. We explore different possible configurations of WPM, discussing the various implementation tradeoffs. We also examine a speculative steering heuristic to distribute the narrow-width operations among clusters. A detailed analysis of the complexity factors shows using WPM instead of a classical 64-bit two-cluster microarchitecture can save power and silicon area with a minimal impact on the overall performance.", acknowledgement = ack-nhfb, } @Article{Zmily:2006:BAI, author = "Ahmad Zmily and Christos Kozyrakis", title = "Block-aware instruction set architecture", journal = j-TACO, volume = "3", number = "3", pages = "327--357", month = sep, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1162690.1162694", ISSN = "1544-3566", bibdate = "Sat Sep 23 07:54:36 MDT 2006", bibsource = "http://portal.acm.org/", abstract = "Instruction delivery is a critical component for wide-issue, high-frequency processors since its bandwidth and accuracy place an upper limit on performance. The processor front-end accuracy and bandwidth are limited by instruction-cache misses, multicycle instruction-cache accesses, and target or direction mispredictions for control-flow operations. This paper presents a block-aware instruction set (BLISS) that allows software to assist with front-end challenges. BLISS defines basic block descriptors that are stored separately from the actual instructions in a program. We show that BLISS allows for a decoupled front-end that tolerates instruction-cache latency, facilitates instruction prefetching, and leads to higher prediction accuracy.", acknowledgement = ack-nhfb, } @Article{Crandall:2006:MAS, author = "Jedidiah R. Crandall and S. Felix Wu and Frederic T. Chong", title = "{Minos}: {Architectural} support for protecting control data", journal = j-TACO, volume = "3", number = "4", pages = "359--389", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Marathe:2006:ACC, author = "Jaydeep Marathe and Frank Mueller and Bronis R. de Supinski", title = "Analysis of cache-coherence bottlenecks with hybrid hardware\slash software techniques", journal = j-TACO, volume = "3", number = "4", pages = "390--423", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Ganusov:2006:FEP, author = "Ilya Ganusov and Martin Burtscher", title = "Future execution: {A} prefetching mechanism that uses multiple cores to speed up single threads", journal = j-TACO, volume = "3", number = "4", pages = "424--449", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Co:2006:ETC, author = "Michele Co and Dee A. B. Weikle and Kevin Skadron", title = "Evaluating trace cache energy efficiency", journal = j-TACO, volume = "3", number = "4", pages = "450--476", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Hu:2006:EMM, author = "Shiwen Hu and Madhavi Valluri and Lizy Kurian John", title = "Effective management of multiple configurable units using dynamic optimization", journal = j-TACO, volume = "3", number = "4", pages = "477--501", month = dec, year = "2006", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, } @Article{Bentley:2006:IAB, author = "Chris Bentley and Scott A. Watterson and David K. Lowenthal and Barry Rountree", title = "Implicit array bounds checking on 64-bit architectures", journal = j-TACO, volume = "3", number = "4", pages = "502--527", month = dec, year = "2006", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1187976.1187982", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", abstract = "Several programming languages guarantee that array subscripts are checked to ensure they are within the bounds of the array. While this guarantee improves the correctness and security of array-based code, it adds overhead to array references. This has been an obstacle to using higher-level languages, such as Java, for high-performance parallel computing, where the language specification requires that all array accesses must be checked to ensure they are within bounds. This is because, in practice, array-bounds checking in scientific applications may increase execution time by more than a factor of 2. Previous research has explored optimizations to statically eliminate bounds checks, but the dynamic nature of many scientific codes makes this difficult or impossible. Our approach is, instead, to create a compiler and operating system infrastructure that does not generate explicit bounds checks. It instead places arrays inside of Index Confinement Regions (ICRs), which are large, isolated, mostly unmapped virtual memory regions. Any array reference outside of its bounds will cause a protection violation; this provides implicit bounds checking. Our results show that when applying this infrastructure to high-performance computing programs written in Java, the overhead of bounds checking relative to a program with no bounds checks is reduced from an average of 63\% to an average of 9\%.", acknowledgement = ack-nhfb, } @Article{Calder:2007:I, author = "Brad Calder and Dean Tullsen", title = "Introduction", journal = j-TACO, volume = "4", number = "1", pages = "??--??", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "1", } @Article{Constantinides:2007:ARC, author = "Kypros Constantinides and Stephen Plaza and Jason Blome and Valeria Bertacco and Scott Mahlke and Todd Austin and Bin Zhang and Michael Orshansky", title = "Architecting a reliable {CMP} switch architecture", journal = j-TACO, volume = "4", number = "1", pages = "??--??", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "2", } @Article{Hwang:2007:SSA, author = "Yuan-Shin Hwang and Jia-Jhe Li", title = "Snug set-associative caches: {Reducing} leakage power of instruction and data caches with no performance penalties", journal = j-TACO, volume = "4", number = "1", pages = "??--??", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "6", } @Article{Luo:2007:CNP, author = "Yan Luo and Jia Yu and Jun Yang and Laxmi N. Bhuyan", title = "Conserving network processor power consumption by exploiting traffic variability", journal = j-TACO, volume = "4", number = "1", pages = "??--??", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "4", } @Article{Rong:2007:SDS, author = "Hongbo Rong and Zhizhong Tang and R. Govindarajan and Alban Douillet and Guang R. Gao", title = "Single-dimension software pipelining for multidimensional loops", journal = j-TACO, volume = "4", number = "1", pages = "??--??", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "7", } @Article{Sasanka:2007:AES, author = "Ruchira Sasanka and Man-Lap Li and Sarita V. Adve and Yen-Kuang Chen and Eric Debes", title = "{ALP}: {Efficient} support for all levels of parallelism for complex media applications", journal = j-TACO, volume = "4", number = "1", pages = "??--??", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "3", } @Article{Soteriou:2007:SDP, author = "Vassos Soteriou and Noel Eisley and Li-Shiuan Peh", title = "Software-directed power-aware interconnection networks", journal = j-TACO, volume = "4", number = "1", pages = "??--??", month = mar, year = "2007", CODEN = "????", ISSN = "1544-3566", bibdate = "Sat Apr 14 10:44:57 MDT 2007", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "5", } @Article{Bower:2007:ODH, author = "Fred A. Bower and Daniel J. Sorin and Sule Ozev", title = "Online diagnosis of hard faults in microprocessors", journal = j-TACO, volume = "4", number = "2", pages = "8:1--8:??", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1250727.1250728", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We develop a microprocessor design that tolerates hard faults, including fabrication defects and in-field faults, by leveraging existing microprocessor redundancy. To do this, we must: detect and correct errors, diagnose hard faults at the field deconfigurable unit (FDU) granularity, and deconfigure FDUs with hard faults. In our reliable microprocessor design, we use DIVA dynamic verification to detect and correct errors. Our new scheme for diagnosing hard faults tracks instructions' core structure occupancy from decode until commit. If a DIVA checker detects an error in an instruction, it increments a small saturating error counter for every FDU used by that instruction, including that DIVA checker. A hard fault in an FDU quickly leads to an above-threshold error counter for that FDU and thus diagnoses the fault. For deconfiguration, we use previously developed schemes for functional units and buffers and present a scheme for deconfiguring DIVA checkers. Experimental results show that our reliable microprocessor quickly and accurately diagnoses each hard fault that is injected and continues to function, albeit with somewhat degraded performance.", acknowledgement = ack-nhfb, articleno = "8", keywords = "fine-grained diagnosis; hard fault tolerance; processor microarchitecture", } @Article{Michaud:2007:STM, author = "Pierre Michaud and Andr{\'e} Seznec and Damien Fetis and Yiannakis Sazeides and Theofanis Constantinou", title = "A study of thread migration in temperature-constrained multicores", journal = j-TACO, volume = "4", number = "2", pages = "9:1--9:??", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1250727.1250729", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Temperature has become an important constraint in high-performance processors, especially multicores. Thread migration will be essential to exploit the full potential of future thermally constrained multicores. We propose and study a thread migration method that maximizes performance under a temperature constraint, while minimizing the number of migrations and ensuring fairness between threads. We show that thread migration brings important performance gains and that it is most effective during the first tens of seconds following a decrease of the number of running threads.", acknowledgement = ack-nhfb, articleno = "9", keywords = "multicore processor; power density; temperature; thermal management; thread migration", } @Article{Chen:2007:CRL, author = "Yu Chen and Fuxin Zhang", title = "Code reordering on limited branch offset", journal = j-TACO, volume = "4", number = "2", pages = "10:1--10:??", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1250727.1250730", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Since the 1980's code reordering has gained popularity as an important way to improve the spatial locality of programs. While the effect of the processor's microarchitecture and memory hierarchy on this optimization technique has been investigated, little research has focused on the impact of the instruction set. In this paper, we analyze the effect of limited branch offset of the MIPS-like instruction set [Hwu et al. 2004, 2005] on code reordering, explore two simple methods to handle the exceeded branches, and propose the bidirectional code layout (BCL) algorithm to reduce the number of branches exceeding the offset limit. The BCL algorithm sorts the chains according to the position of related chains, avoids cache conflict misses deliberately and lays out the code bidirectionally. It strikes a balance among the distance of related blocks, the instruction cache miss rate, the memory size required, and the control flow transfer. Experimental results show that BCL can effectively reduce exceeded branches by 50.1\%, on average, with up to 100\% for some programs. Except for some programs with little spatial locality, the BCL algorithm can achieve the performance, as the case with no branch offset limitation.", acknowledgement = ack-nhfb, articleno = "10", keywords = "code reordering; Godson Processor; link-time optimization", } @Article{Terechko:2007:ICC, author = "A. S. Terechko and H. Corporaal", title = "Inter-cluster communication in {VLIW} architectures", journal = j-TACO, volume = "4", number = "2", pages = "11:1--11:??", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1250727.1250731", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The traditional VLIW (very long instruction word) architecture with a single register file does not scale up well to address growing performance demands on embedded media processors. However, splitting a VLIW processor in smaller clusters, which are comprised of function units fully connected to local register files, can significantly improve VLSI implementation characteristics of the processor, such as speed, energy consumption, and area. In our paper we reveal that achieving the best characteristics of a clustered VLIW requires a thorough selection of an Inter-cluster Communication (ICC) model, which is the way clustering is exposed in the Instruction Set Architecture. For our study we, first, define a taxonomy of ICC models including copy operations, dedicated issue slots, extended operands, extended results, and multicast. Evaluation of the execution time of the models requires both the dynamic cycle count and clock period. We developed an advanced instruction scheduler for all the five ICC models in order to quantify the dynamic cycle counts of our multimedia C benchmarks. To assess the clock period of the ICC models we designed and laid out VLIW datapaths using the RTL hardware descriptions derived from a deeply pipelined commercial TriMedia processor. In contrast to prior art, our research shows that fully distributed register file architectures (with eight clusters in our study) often underperform compared to moderately clustered machines with two or four clusters because of explosion of the cycle count overhead in the former. Among the evaluated ICC models, performance of the copy operation model, popular both in academia and industry, is severely limited by the copy operations hampering scheduling of regular operations in high ILP (instruction-level parallelism) code. The dedicated issue slots model combats this limitation by dedicating extra VLIW issue slots purely for ICC, reaching the highest 1.74 execution time speedup relative to the unicluster. Furthermore, our VLSI experiments show that the lowest area and energy consumption of 42 and 57\% relative to the unicluster, respectively, are achieved by the extended operands model, which, nevertheless, provides higher performance than the copy operation model.", acknowledgement = ack-nhfb, articleno = "11", keywords = "clock frequency; cluster assignment; instruction-level parallelism; instruction scheduler; intercluster communication; optimizing compiler; pipelining; register allocation; VLIW", } @Article{Dou:2007:CCM, author = "Jialin Dou and Marcelo Cintra", title = "A compiler cost model for speculative parallelization", journal = j-TACO, volume = "4", number = "2", pages = "12:1--12:??", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1250727.1250732", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Speculative parallelization is a technique that allows code sections that cannot be fully analyzed by the compiler to be aggressively executed in parallel. However, while speculative parallelization can potentially deliver significant speedups, several overheads associated with this technique can limit these speedups in practice. This paper proposes a novel compiler static cost model of speculative multithreaded execution that can be used to predict the resulting performance. This model attempts to predict the expected speedups, or slowdowns, of the candidate speculative sections based on the estimation of the combined runtime effects of various overheads, and taking into account the scheduling restrictions of most speculative execution environments. The model is based on estimating the likely execution duration of threads and considers all the possible permutations of these threads. This model also produces a quantitative estimate of the speedup, which is different from prior heuristics that only qualitatively estimate the benefits of speculative multithreaded execution. In previous work, a limited version of the framework was evaluated on a number of loops from a collection of SPEC benchmarks that suffer mainly from load imbalance and thread dispatch and commit overheads. In this work, an extended framework is also evaluated on loops that may suffer from data-dependence violations. Experimental results show that prediction accuracy is lower when loops with violations are included. Nevertheless, accuracy is still very high for a static model: the framework can identify, on average, 45\% of the loops that cause slowdowns and, on average, 96\% of the loops that lead to speedups; it predicts the speedups or slowdowns with an error of less than 20\% for an average of 28\% of the loops across the benchmarks and with an error of less than 50\% for an average of 80\% of the loops. Overall, the framework often outperforms, by as much as 25\%, a naive approach that attempts to speculatively parallelize all the loops considered, and is able to curb the large slowdowns caused in many cases by this naive approach.", acknowledgement = ack-nhfb, articleno = "12", keywords = "speculative multithreading; speculative parallelization; thread-level speculation", } @Article{Amme:2007:SBM, author = "Wolfram Amme and Jeffery von Ronne and Michael Franz", title = "{SSA}-based mobile code: {Implementation} and empirical evaluation", journal = j-TACO, volume = "4", number = "2", pages = "13:1--13:??", month = jun, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1250727.1250733", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:40:54 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Although one might expect transportation formats based on static single-assignment form (SSA) to yield faster just-in-time compilation times than those based on stack-based virtual machines, this claim has not previously been validated, in practice. We attempt to quantify the effect of using an SSA-based mobile code representation by integrating support for a verifiable SSA-based IR into Jikes RVM. Performance results, measured with various optimizations and on both the IA32 and PowerPC, show improvements in both compilation time and code quality.", acknowledgement = ack-nhfb, articleno = "13", keywords = "SafeTSA; static single-assignment form; virtual machines", } @Article{Li:2007:CCE, author = "Xiaodong Li and Ritu Gupta and Sarita V. Adve and Yuanyuan Zhou", title = "Cross-component energy management: {Joint} adaptation of processor and memory", journal = j-TACO, volume = "4", number = "3", pages = "14:1--14:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1275937.1275938", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Researchers have proposed the use of adaptation to reduce the energy consumption of different hardware components, such as the processor, memory, disk, and display for general-purpose applications. Previous algorithms to control these adaptations, however, have focused on a single component. This work takes the first step toward developing algorithms that can jointly control adaptations in multiple interacting components for general-purpose applications, with the goal of minimizing the total energy consumed within a specified performance loss. Specifically, we develop a joint-adaptation algorithm for processor and memory adaptations. We identify two properties that enable per-component algorithms to be easily used in a cross-component context---the algorithms' performance impact must be guaranteed and composable. We then modify a current processor and a memory algorithm to obey these properties. This allows the cross-component problem to be reduced to determine an appropriate (energy-optimal) allocation of the target performance loss (slack) between the two components. We develop such an optimal slack allocation algorithm that exploits the above properties. The result is an efficient cross-component adaptation framework that minimizes the total energy of the processor and memory without exceeding the target performance loss, while substantially leveraging current per-component algorithms. Our experiments show that joint processor and memory adaptation provides significantly more energy savings than adapting either component alone; intelligent slack distribution is specifically effective for highly compute- or memory-intensive applications; and the performance slowdown never exceeds the specification.", acknowledgement = ack-nhfb, articleno = "14", keywords = "adaptive systems; control algorithms; energy management; low-power design; memory; performance guarantee; processor", } @Article{Gabor:2007:FES, author = "Ron Gabor and Shlomo Weiss and Avi Mendelson", title = "Fairness enforcement in switch on event multithreading", journal = j-TACO, volume = "4", number = "3", pages = "15:1--15:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1275937.1275939", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The need to reduce power and complexity will increase the interest in Switch On Event multithreading (coarse-grained multithreading). Switch On Event multithreading is a low-power and low-complexity mechanism to improve processor throughput by switching threads on execution stalls. Fairness may, however, become a problem in a multithreaded processor. Unless fairness is properly handled, some threads may starve while others consume all of the processor cycles. Heuristics that were devised in order to improve fairness in simultaneous multithreading are not applicable to Switch On Event multithreading. This paper defines the fairness metric using the ratio of the individual threads' speedups and shows how it can be enforced in Switch On Event multithreading. Fairness is controlled by forcing additional thread switch points. These switch points are determined dynamically by runtime estimation of the single threaded performance of each of the individual threads. We analyze the impact of the fairness enforcement mechanism on aggregate IPC and weighted speedup. We present simulation results of the performance of Switch On Event multithreading. Switch On Event multithreading achieves an average aggregate IPC increase of 26\% over single thread and 12\% weighted speedup when no fairness is enforced. In this case, a sixth of our runs resulted in poor fairness in which one thread ran extremely slowly (10 to 100 times slower than its single-thread performance), while the other thread's performance was hardly affected. By using the proposed mechanism, we can guarantee fairness at different levels of strictness and, in most cases, even improve the weighted speedup.", acknowledgement = ack-nhfb, articleno = "15", keywords = "coarse-grained multithreading; fairness; multithreading; performance; SOE; Switch on Event multithreading; throughput; weighted speedup", } @Article{Andrade:2007:PAA, author = "Diego Andrade and Basilio B. Fraguela and Ram{\'o}n Doallo", title = "Precise automatable analytical modeling of the cache behavior of codes with indirections", journal = j-TACO, volume = "4", number = "3", pages = "16:1--16:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1275937.1275940", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The performance of memory hierarchies, in which caches play an essential role, is critical in nowadays general-purpose and embedded computing systems because of the growing memory bottleneck problem. Unfortunately, cache behavior is very unstable and difficult to predict. This is particularly true in the presence of irregular access patterns, which exhibit little locality. Such patterns are very common, for example, in applications in which pointers or compressed sparse matrices give place to indirections. Nevertheless, cache behavior in the presence of irregular access patterns has not been widely studied. In this paper we present an extension of a systematic analytical modeling technique based on PMEs (probabilistic miss equations), previously developed by the authors, that allows the automated analysis of the cache behavior for codes with irregular access patterns resulting from indirections. The model generates very accurate predictions despite the irregularities and has very low computing requirements, being the first model that gathers these desirable characteristics that can automatically analyze this kind of codes. These properties enable this model to help drive compiler optimizations, as we show with an example.", acknowledgement = ack-nhfb, articleno = "16", keywords = "analytical modeling; irregular access patterns; memory hierarchy; performance prediction", } @Article{Venstermans:2007:JOH, author = "Kris Venstermans and Lieven Eeckhout and Koen De Bosschere", title = "{Java} object header elimination for reduced memory consumption in 64-bit virtual machines", journal = j-TACO, volume = "4", number = "3", pages = "17:1--17:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1275937.1275941", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Memory performance is an important design issue for contemporary computer systems given the huge processor/memory speed gap. This paper proposes a space-efficient Java object model for reducing the memory consumption of 64-bit Java virtual machines. We completely eliminate the object header through typed virtual addressing (TVA) or implicit typing. TVA encodes the object type in the object's virtual address by allocating all objects of a given type in a contiguous memory segment. This allows for removing the type information as well as the status field from the object header. Whenever type and status information is needed, masking is applied to the object's virtual address for obtaining an offset into type and status information structures. Unlike previous work on implicit typing, we apply TVA to a selected number of frequently allocated object types, hence, the name selective TVA (STVA); this limits the amount of memory fragmentation. In addition to applying STVA, we also compress the type information block (TIB) pointers for all objects that do not fall under TVA. We implement the space-efficient Java object model in the 64-bit version of the Jikes RVM on an AIX IBM platform and compare its performance against the traditionally used Java object model using a multitude of Java benchmarks. We conclude that the space-efficient Java object model reduces memory consumption by on average 15\% (and up to 45\% for some benchmarks). About one-half the reduction comes from TIB pointer compression; the other one-half comes from STVA. In terms of performance, the space-efficient object model generally does not affect performance; however, for some benchmarks we observe statistically significant performance speedups, up to 20\%.", acknowledgement = ack-nhfb, articleno = "17", keywords = "64-bit implementation; implicit typing; Java object model; typed virtual addressing; Virtual machine", } @Article{Xiao:2007:VIS, author = "Shu Xiao and Edmund M.-K. Lai", title = "{VLIW} instruction scheduling for minimal power variation", journal = j-TACO, volume = "4", number = "3", pages = "18:1--18:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1275937.1275942", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The focus of this paper is on the minimization of the variation in power consumed by a VLIW processor during the execution of a target program through instruction scheduling. The problem is formulated as a mixed-integer program (MIP) and a problem-specific branch-and-bound algorithm has been developed to solve it more efficiently than generic MIP solvers. Simulation results based on the TMS320C6711 VLIW digital signal processor using benchmarks from Mediabench and Trimaran showed that over 40\% average reduction in power variation can be achieved without sacrificing execution speed of these benchmarks. Computational requirements and convergence rates of our algorithm are also analyzed.", acknowledgement = ack-nhfb, articleno = "18", keywords = "instruction scheduling; power variation reduction; VLIW processors", } @Article{Tallam:2007:UCF, author = "Sriraman Tallam and Rajiv Gupta", title = "Unified control flow and data dependence traces", journal = j-TACO, volume = "4", number = "3", pages = "19:1--19:??", month = sep, year = "2007", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1275937.1275943", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:20 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We describe the design, generation, and compression of the extended whole program path (eWPP), representation that not only captures the control flow history of a program execution but also its data dependence history. This representation is motivated by the observation that, typically, a significant fraction of data dependence history can be recovered from the control flow trace. To capture the remainder of the data dependence history, we introduce disambiguation checks in the program whose control flow signatures capture the results of the checks. The resulting extended control flow trace enables the recovery of otherwise irrecoverable data dependences. The code for the checks is designed to minimize the increase in program execution time and the extended control flow trace size when compared to directly collecting control flow and address traces. Our experiments show that compressed eWPPs are only one-quarter of the size of combined compressed control flow and address traces. However, their collection incurs a 5{\times} increase in runtime overhead relative to the overhead required for directly collecting the control flow and address traces, respectively.", acknowledgement = ack-nhfb, articleno = "19", keywords = "address trace; control flow trace; dynamic data dependence trace; profiling", } @Article{Ipek:2008:EAD, author = "Engin Ipek and Sally A. McKee and Karan Singh and Rich Caruana and Bronis R. de Supinski and Martin Schulz", title = "Efficient architectural design space exploration via predictive modeling", journal = j-TACO, volume = "4", number = "4", pages = "1:1--1:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328196", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Efficiently exploring exponential-size architectural design spaces with many interacting parameters remains an open problem: the sheer number of experiments required renders detailed simulation intractable. We attack this via an automated approach that builds accurate predictive models. We simulate sampled points, using results to teach our models the function describing relationships among design parameters. The models can be queried and are very fast, enabling efficient design tradeoff discovery. We validate our approach via two uniprocessor sensitivity studies, predicting IPC with only 1--2\% error. In an experimental study using the approach, training on 1\% of a 250-K-point CMP design space allows our models to predict performance with only 4--5\% error. Our predictive modeling combines well with techniques that reduce the time taken by each simulation experiment, achieving net time savings of three-four orders of magnitude.", acknowledgement = ack-nhfb, articleno = "1", keywords = "artificial neural networks; design space exploration; performance prediction; sensitivity studies", } @Article{Shi:2008:VMS, author = "Yunhe Shi and Kevin Casey and M. Anton Ertl and David Gregg", title = "Virtual machine showdown: {Stack} versus registers", journal = j-TACO, volume = "4", number = "4", pages = "2:1--2:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328197", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Virtual machines (VMs) enable the distribution of programs in an architecture-neutral format, which can easily be interpreted or compiled. A long-running question in the design of VMs is whether a stack architecture or register architecture can be implemented more efficiently with an interpreter. We extend existing work on comparing virtual stack and virtual register architectures in three ways. First, our translation from stack to register code and optimization are much more sophisticated. The result is that we eliminate an average of more than 46\% of executed VM instructions, with the bytecode size of the register machine being only 26\% larger than that of the corresponding stack one. Second, we present a fully functional virtual-register implementation of the Java virtual machine (JVM), which supports Intel, AMD64, PowerPC and Alpha processors. This register VM supports inline-threaded, direct-threaded, token-threaded, and switch dispatch. Third, we present experimental results on a range of additional optimizations such as register allocation and elimination of redundant heap loads. On the AMD64 architecture the register machine using switch dispatch achieves an average speedup of 1.48 over the corresponding stack machine. Even using the more efficient inline-threaded dispatch, the register VM achieves a speedup of 1.15 over the equivalent stack-based VM.", acknowledgement = ack-nhfb, articleno = "2", keywords = "interpreter; register architecture; stack architecture; virtual machine", } @Article{Yan:2008:EVR, author = "Jun Yan and Wei Zhang", title = "Exploiting virtual registers to reduce pressure on real registers", journal = j-TACO, volume = "4", number = "4", pages = "3:1--3:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328198", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "It is well known that a large fraction of variables are short-lived. This paper proposes a novel approach to exploiting this fact to reduce the register pressure for pipelined processors with data-forwarding network. The idea is that the compiler can allocate virtual registers (i.e., place holders to identify dependences among instructions) to short-lived variables, which do not need to be stored to physical storage locations. As a result, real registers (i.e., physically existed registers) can be reserved for long-lived variables for mitigating the register pressure and decreasing the register spills, leading to performance improvement. In this paper, we develop the architectural and compiler support for exploiting virtual registers for statically scheduled processors. Our experimental results show that virtual registers are very effective at reducing the register spills, which, in many cases, can achieve the performance close to the processor with twice number of real registers. Our results also indicate that, for some applications, using 24 virtual, in addition to 8 real registers, can attain even higher performance than that of 16 real without any virtual registers.", acknowledgement = ack-nhfb, articleno = "3", keywords = "data forwarding; register allocation; register file; short-lived variables; virtual register", } @Article{Yu:2008:OCL, author = "Zoe C. H. Yu and Francis C. M. Lau and Cho-Li Wang", title = "Object co-location and memory reuse for {Java} programs", journal = j-TACO, volume = "4", number = "4", pages = "4:1--4:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328199", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We introduce a new memory management system, STEMA, which can improve the execution time of Java programs. STEMA detects prolific types on-the-fly and co-locates their objects in a special memory space which supports reuse of memory. We argue and show that memory reuse and co-location of prolific objects can result in improved cache locality, reduced memory fragmentation, reduced GC time, and faster object allocation. We evaluate STEMA using 16 benchmarks. Experimental results show that STEMA performs 2.7\%, 4.0\%, and 8.2\% on average better than MarkSweep, CopyMS, and SemiSpace.", acknowledgement = ack-nhfb, articleno = "4", keywords = "garbage collector; Java; memory allocator; memory reuse; mutator; object co-location", } @Article{Zhang:2008:RCM, author = "Chuanjun Zhang", title = "Reducing cache misses through programmable decoders", journal = j-TACO, volume = "4", number = "4", pages = "5:1--5:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328200", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Level-one caches normally reside on a processor's critical path, which determines clock frequency. Therefore, fast access to level-one cache is important. Direct-mapped caches exhibit faster access time, but poor hit rates, compared with same sized set-associative caches because of nonuniform accesses to the cache sets. The nonuniform accesses generate more cache misses in some sets, while other sets are underutilized. We propose to increase the decoder length and, hence, reduce the accesses to heavily used sets without dynamically detecting the cache set usage information. We increase the access to the underutilized cache sets by incorporating a replacement policy into the cache design using programmable decoders. On average, the proposed techniques achieve as low a miss rate as a traditional 4-way cache on all 26 SPEC2K benchmarks for the instruction and data caches, respectively. This translates into an average IPC improvement of 21.5 and 42.4\% for SPEC2K integer and floating-point benchmarks, respectively. The B-Cache consumes 10.5\% more power per access, but exhibits a 12\% total memory access-related energy savings as a result of the miss rate reductions, and, hence, the reduction to applications' execution time. Compared with previous techniques that aim at reducing the miss rate of direct-mapped caches, our technique requires only one cycle to access all cache hits and has the same access time of a direct-mapped cache.", acknowledgement = ack-nhfb, articleno = "5", keywords = "cache; dynamic optimization; low power", } @Article{Golander:2008:HMP, author = "Amit Golander and Shlomo Weiss", title = "Hiding the misprediction penalty of a resource-efficient high-performance processor", journal = j-TACO, volume = "4", number = "4", pages = "6:1--6:??", month = jan, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1328195.1328201", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:35 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Misprediction is a major obstacle for increasing speculative out-of-order processors performance. Performance degradation depends on both the number of misprediction events and the recovery time associated with each one of them. In recent years a few checkpoint based microarchitectures have been proposed. In comparison with ROB-based processors, checkpoint processors are scalable and highly resource efficient. Unfortunately, in these proposals the misprediction recovery time is proportional to the instruction queue size.\par In this paper we analyze methods to reduce the misprediction recovery time. We propose a new register file management scheme and techniques to selectively flush the instruction queue and the load store queue, and to isolate deeply pipelined execution units. The result is a novel checkpoint processor with Constant misprediction RollBack time (CRB). We further present a streamlined, cost-efficient solution, which saves complexity at the price of slightly lower performance.", acknowledgement = ack-nhfb, articleno = "6", keywords = "checkpoints; misprediction; out-of-order execution; rollback; scalable architecture", } @Article{Calder:2008:E, author = "Brad Calder and Dean Tullsen", title = "Editorial", journal = j-TACO, volume = "5", number = "1", pages = "1:1--1:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369397", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", acknowledgement = ack-nhfb, articleno = "1", } @Article{Mysore:2008:FIP, author = "Shashidhar Mysore and Banit Agrawal and Rodolfo Neuber and Timothy Sherwood and Nisheeth Shrivastava and Subhash Suri", title = "Formulating and implementing profiling over adaptive ranges", journal = j-TACO, volume = "5", number = "1", pages = "2:1--2:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369398", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Modern computer systems are called on to deal with billions of events every second, whether they are executed instructions, accessed memory locations, or forwarded packets. This presents a serious challenge to those who seek to quantify, analyze, or optimize such systems, because important trends and behaviors may easily be lost in a sea of data. We present range-adaptive profiling (RAP) as a new and general-purpose profiling method capable of hierarchically efficiently classifying streams of data in hardware. Through the use of RAP, events in an input stream are dynamically classified into increasingly precise categories, based on the frequency with which they occur. The more important a class, or range of events, the more precisely it is quantified. Despite the dynamic nature of our technique, we build upon tight theoretic bounds covering both worst-case error, as well as the required memory. In the limit, it is known that error and the memory bounds can be independent of the stream size and grow only linearly with the level of precision desired. Significantly, we expose the critical constants in these algorithms and through careful engineering, algorithm redesign, and use of heuristics, we show how a high-performance profile system can be implemented for range-adaptive profiling. RAP can be used on various profiles, such as PCs, load values, and memory addresses, and has a broad range of uses, from hot-region profiling to quantifying cache miss value locality. We propose two methods of implementation of RAP, one in software and the other with specialized hardware, for which we also describe our prototype FPGA implementation. We show that with just 8KB of memory, range profiles can be gathered with an average accuracy of 98\%.", acknowledgement = ack-nhfb, articleno = "2", keywords = "profiling hardware; range adaptive; value locality", } @Article{Zhai:2008:CHS, author = "Antonia Zhai and J. Gregory Steffan and Christopher B. Colohan and Todd C. Mowry", title = "Compiler and hardware support for reducing the synchronization of speculative threads", journal = j-TACO, volume = "5", number = "1", pages = "3:1--3:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369399", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Thread-level speculation (TLS) allows us to automatically parallelize general-purpose programs by supporting parallel execution of threads that might not actually be independent. In this article, we focus on one important limitation of program performance under TLS, which stalls as a result of synchronizing and forwarding scalar values between speculative threads that would otherwise cause frequent data dependences and, hence, failed speculation. Using SPECint benchmarks that have been automatically transformed by our compiler to exploit TLS, we present, evaluate in detail, and compare both compiler and hardware techniques for improving the communication of scalar values. We find that through our dataflow algorithms for three increasingly aggressive instruction scheduling techniques, the compiler can drastically reduce the critical forwarding path introduced by the synchronization and forwarding of scalar values. We also show that hardware techniques for reducing synchronization can be complementary to compiler scheduling, but that the additional performance benefits are minimal and are generally not worth the cost.", acknowledgement = ack-nhfb, articleno = "3", keywords = "automatic parallelization; chip-multiprocessing; instruction scheduling; thread-level speculation", } @Article{Winter:2008:ATN, author = "Jonathan A. Winter and David H. Albonesi", title = "Addressing thermal nonuniformity in {SMT} workloads", journal = j-TACO, volume = "5", number = "1", pages = "4:1--4:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369400", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "We explore DTM techniques within the context of uniform and nonuniform SMT workloads. While DVS is suitable for addressing workloads with uniformly high temperatures, for nonuniform workloads, performance loss occurs because of the slowdown of the cooler thread. To address this, we propose and evaluate DTM mechanisms that exploit the steering-based thread management mechanisms inherent in a clustered SMT architecture. We show that in contrast to DVS, which operates globally, our techniques are more effective at controlling temperature for nonuniform workloads. Furthermore, we devise a DTM technique that combines steering and DVS to achieve consistently good performance across all workloads.", acknowledgement = ack-nhfb, articleno = "4", keywords = "adaptive microarchitectures; clustered microarchitectures; dynamic thermal management; dynamic voltage scaling; simultaneous multithreading", } @Article{Shahbahrami:2008:VES, author = "Asadollah Shahbahrami and Ben Juurlink and Stamatis Vassiliadis", title = "Versatility of extended subwords and the matrix register file", journal = j-TACO, volume = "5", number = "1", pages = "5:1--5:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369401", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Extended subwords and the matrix register file (MRF) are two micro architectural techniques that address some of the limitations of existing SIMD architectures. Extended subwords are wider than the data stored in memory. Specifically, for every byte of data stored in memory, there are four extra bits in the media register file. This avoids the need for data-type conversion instructions. The MRF is a register file organization that provides both conventional row-wise, as well as column-wise, access to the register file. In other words, it allows to view the register file as a matrix in which corresponding subwords in different registers corresponds to a column of the matrix. It was introduced to accelerate matrix transposition which is a very common operation in multimedia applications. In this paper, we show that the MRF is very versatile, since it can also be used for other permutations than matrix transposition. Specifically, it is shown how it can be used to provide efficient access to strided data, as is needed in, e.g., color space conversion. Furthermore, it is shown that special-purpose instructions (SPIs), such as the sum-of-absolute differences (SAD) instruction, have limited usefulness when extended subwords and a few general SIMD instructions that we propose are supported, for the following reasons. First, when extended subwords are supported, the SAD instruction provides only a relatively small performance improvement. Second, the SAD instruction processes 8-bit subwords only, which is not sufficient for quarter-pixel resolution nor for cost functions used in image and video retrieval. Results obtained by extending the SimpleScalar toolset show that the proposed techniques provide a speedup of up to 3.00 over the MMX architecture. The results also show that using, at most, 13 extra media registers yields an additional performance improvement ranging from 1.38 to 1.57.", acknowledgement = ack-nhfb, articleno = "5", keywords = "multimedia standards; SIMD architectures; SIMD programming", } @Article{Guo:2008:EHC, author = "Zhi Guo and Walid Najjar and Betul Buyukkurt", title = "Efficient hardware code generation for {FPGAs}", journal = j-TACO, volume = "5", number = "1", pages = "6:1--6:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1369402", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "The wider acceptance of FPGAs as a computing device requires a higher level of programming abstraction. ROCCC is an optimizing C to HDL compiler. We describe the code generation approach in ROCCC. The smart buffer is a component that reuses input data between adjacent iterations. It significantly improves the performance of the circuit and simplifies loop control. The ROCCC-generated datapath can execute one loop iteration per clock cycle when there is no loop dependency or there is only scalar recurrence variable dependency. ROCCC's approach to supporting while-loops operating on scalars makes the compiler able to move scalar iterative computation into hardware.", acknowledgement = ack-nhfb, articleno = "6", keywords = "data reuse; FPGA; high-level synthesis; reconfigurable computing; VHDL", } @Article{Kotzmann:2008:DJH, author = "Thomas Kotzmann and Christian Wimmer and Hanspeter M{\"o}ssenb{\"o}ck and Thomas Rodriguez and Kenneth Russell and David Cox", title = "Design of the {Java HotSpot\TM} client compiler for {Java 6}", journal = j-TACO, volume = "5", number = "1", pages = "7:1--7:??", month = may, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1369396.1370017", ISSN = "1544-3566", bibdate = "Mon Jun 16 11:41:51 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Version 6 of Sun Microsystems' Java HotSpot{\TM} VM ships with a redesigned version of the client just-in-time compiler that includes several research results of the last years. The client compiler is at the heart of the VM configuration used by default for interactive desktop applications. For such applications, low startup and pause times are more important than peak performance. This paper outlines the new architecture of the client compiler and shows how it interacts with the VM. It presents the intermediate representation that now uses static single-assignment (SSA) form and the linear scan algorithm for global register allocation. Efficient support for exception handling and deoptimization fulfills the demands that are imposed by the dynamic features of the Java programming language. The evaluation shows that the new client compiler generates better code in less time. The popular SPECjvm98 benchmark suite is executed 45\% faster, while the compilation speed is also up to 40\% better. This indicates that a carefully selected set of global optimizations can also be integrated in just-in-time compilers that focus on compilation speed and not on peak performance. In addition, the paper presents the impact of several optimizations on execution and compilation speed. As the source code is freely available, the Java HotSpot{\TM} VM and the client compiler are the ideal basis for experiments with new feedback-directed optimizations in a production-level Java just-in-time compiler. The paper outlines research projects that add fast algorithms for escape analysis, automatic object inlining, and array bounds check elimination.", acknowledgement = ack-nhfb, articleno = "7", keywords = "compiler; deoptimization; intermediate representation; Java; just-in-time compilation; optimization; register allocation", } @Article{Rangan:2008:PSD, author = "Ram Rangan and Neil Vachharajani and Guilherme Ottoni and David I. August", title = "Performance scalability of decoupled software pipelining", journal = j-TACO, volume = "5", number = "2", pages = "8:1--8:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400113", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Any successful solution to using multicore processors to scale general-purpose program performance will have to contend with rising intercore communication costs while exposing coarse-grained parallelism. Recently proposed pipelined multithreading (PMT) techniques have been demonstrated to have general-purpose applicability and are also able to effectively tolerate inter-core latencies through pipelined interthread communication. These desirable properties make PMT techniques strong candidates for program parallelization on current and future multicore processors and understanding their performance characteristics is critical to their deployment. To that end, this paper evaluates the performance scalability of a general-purpose PMT technique called decoupled software pipelining (DSWP) and presents a thorough analysis of the communication bottlenecks that must be overcome for optimal DSWP scalability.", acknowledgement = ack-nhfb, articleno = "8", keywords = "decoupled software pipelining; performance analysis", } @Article{Long:2008:TMM, author = "Jieyi Long and Seda Ogrenci Memik and Gokhan Memik and Rajarshi Mukherjee", title = "Thermal monitoring mechanisms for chip multiprocessors", journal = j-TACO, volume = "5", number = "2", pages = "9:1--9:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400114", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "With large-scale integration and increasing power densities, thermal management has become an important tool to maintain performance and reliability in modern process technologies. In the core of dynamic thermal management schemes lies accurate reading of on-die temperatures. Therefore, careful planning and embedding of thermal monitoring mechanisms into high-performance systems becomes crucial. In this paper, we propose three techniques to create sensor infrastructures for monitoring the maximum temperature on a multicore system. Initially, we extend a nonuniform sensor placement methodology proposed in the literature to handle chip multiprocessors (CMPs) and show its limitations. We then analyze a grid-based approach where the sensors are placed on a static grid covering each core and show that the sensor readings can differ from the actual maximum core temperature by as much as 12.6^\circ C when using 16 sensors per core. Also, as large as 10.6\% of the thermal emergencies are not captured using the same number of sensors. Based on this observation, we first develop an interpolation scheme, which estimates the maximum core temperature through interpolation of the readings collected at the static grid points. We show that the interpolation scheme improves the measurement accuracy and emergency coverage compared to grid-based placement when using the same number of sensors. Second, we present a dynamic scheme where only a subset of the sensor readings is collected to predict the maximum temperature of each core. Our results indicate that, we can reduce the number of active sensors by as much as 50\%, while maintaining similar measurement accuracy and emergency coverage compared to the case where the entire sensor set on the grid is sampled at all times.", acknowledgement = ack-nhfb, articleno = "9", keywords = "nonuniform and uniform sensor placement; thermal sensor allocation", } @Article{Joshi:2008:DEP, author = "Ajay Joshi and Lieven Eeckhout and Robert H. {Bell, Jr.} and Lizy K. John", title = "Distilling the essence of proprietary workloads into miniature benchmarks", journal = j-TACO, volume = "5", number = "2", pages = "10:1--10:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400115", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Benchmarks set standards for innovation in computer architecture research and industry product development. Consequently, it is of paramount importance that these workloads are representative of real-world applications. However, composing such representative workloads poses practical challenges to application analysis teams and benchmark developers (1) real-world workloads are intellectual property and vendors hesitate to share these proprietary applications; and (2) porting and reducing these applications to benchmarks that can be simulated in a tractable amount of time is a nontrivial task. In this paper, we address this problem by proposing a technique that automatically distills key inherent behavioral attributes of a proprietary workload and captures them into a miniature synthetic benchmark clone. The advantage of the benchmark clone is that it hides the functional meaning of the code but exhibits similar performance characteristics as the target application. Moreover, the dynamic instruction count of the synthetic benchmark clone is substantially shorter than the proprietary application, greatly reducing overall simulation time for SPEC CPU, the simulation time reduction is over five orders of magnitude compared to entire benchmark execution. Using a set of benchmarks representative of general-purpose, scientific, and embedded applications, we demonstrate that the power and performance characteristics of the synthetic benchmark clone correlate well with those of the original application across a wide range of microarchitecture configurations.", acknowledgement = ack-nhfb, articleno = "10", keywords = "benchmark cloning; benchmarks; workload characterization", } @Article{Catania:2008:RCM, author = "Vincenzo Catania and Maurizio Palesi and Davide Patti", title = "Reducing complexity of multiobjective design space exploration in {VLIW}-based embedded systems", journal = j-TACO, volume = "5", number = "2", pages = "11:1--11:??", month = aug, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1400112.1400116", ISSN = "1544-3566", bibdate = "Thu Aug 28 13:25:00 MDT 2008", bibsource = "http://portal.acm.org/", abstract = "Architectures based on very-long instruction word (VLIW) have found fertile ground in multimedia electronic appliances thanks to their ability to exploit high degrees of instruction level parallelism (ILP) with a reasonable trade-off in complexity and silicon cost. Specialization of such architectures involves the configuration of both hardware-related aspects (e.g., register files, functional units, memory subsystem) and software-related issues (e.g., the compilation strategy). The complex interactions between the components of such systems will force a human designer to rely on judgment and experience in designing them, possibly eliminating interesting configurations, and making tuning of the system, for either power, energy, or performance, difficult. In this paper we propose tools and methodologies to efficiently cope with this complexity from a multiobjective perspective. We first analyze the impact of ILP-oriented code transformations using two alternative compilation profiles to quantitatively show the effect of such transformations on typical design objectives like performance, power dissipation, and energy consumption. Next, by means of statistical analysis, we collect useful data to predict the effectiveness of a given compilation profiles for a specific application. Information gathered from such analysis can be exploited to drastically reduce the computational effort needed to perform the design space exploration.", acknowledgement = ack-nhfb, articleno = "11", keywords = "design space exploration; energy; genetic algorithms; hyperblock formation; ILP; multiobjective optimization; performances; power; statistical analysis; VLIW architectures", } @Article{Leverich:2008:CEM, author = "Jacob Leverich and Hideho Arakida and Alex Solomatnikov and Amin Firoozshahian and Mark Horowitz and Christos Kozyrakis", title = "Comparative evaluation of memory models for chip multiprocessors", journal = j-TACO, volume = "5", number = "3", pages = "12:1--12:??", month = nov, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1455650.1455651", ISSN = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "http://portal.acm.org/", abstract = "There are two competing models for the on-chip memory in Chip Multiprocessor (CMP) systems: {\em hardware-managed coherent caches\/} and {\em software-managed streaming memory}. This paper performs a direct comparison of the two models under the same set of assumptions about technology, area, and computational capabilities. The goal is to quantify how and when they differ in terms of performance, energy consumption, bandwidth requirements, and latency tolerance for general-purpose CMPs. We demonstrate that for data-parallel applications on systems with up to 16 cores, the cache-based and streaming models perform and scale equally well. For certain applications with little data reuse, streaming scales better due to better bandwidth use and macroscopic software prefetching. However, the introduction of techniques such as hardware prefetching and nonallocating stores to the cache-based model eliminates the streaming advantage. Overall, our results indicate that there is not sufficient advantage in building streaming memory systems where all on-chip memory structures are explicitly managed. On the other hand, we show that streaming at the programming model level is particularly beneficial, even with the cache-based model, as it enhances locality and creates opportunities for bandwidth optimizations. Moreover, we observe that stream programming is actually easier with the cache-based model because the hardware guarantees correct, best-effort execution even when the programmer cannot fully regularize an application's code.", acknowledgement = ack-nhfb, articleno = "12", keywords = "cache coherence; Chip multiprocessors; locality optimizations; parallel programming; streaming memory", } @Article{Sharkey:2008:RRP, author = "Joseph J. Sharkey and Jason Loew and Dmitry V. Ponomarev", title = "Reducing register pressure in {SMT} processors through {L2}-miss-driven early register release", journal = j-TACO, volume = "5", number = "3", pages = "13:1--13:??", month = nov, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1455650.1455652", ISSN = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "http://portal.acm.org/", abstract = "The register file is one of the most critical datapath components limiting the number of threads that can be supported on a simultaneous multithreading (SMT) processor. To allow the use of smaller register files without degrading performance, techniques that maximize the efficiency of using registers through aggressive register allocation/deallocation can be considered. In this article, we propose a novel technique to early deallocate physical registers allocated to threads which experience L2 cache misses. This is accomplished by speculatively committing the load-independent instructions and deallocating the registers corresponding to the previous mappings of their destinations, without waiting for the cache miss request to be serviced. The early deallocated registers are then made immediately available for allocation to instructions within the same thread as well as within other threads, thus improving the overall processor throughput. On the average across the simulated mixes of multiprogrammed SPEC 2000 workloads, our technique results in 33\% improvement in throughput and 25\% improvement in terms of harmonic mean of weighted IPCs over the baseline SMT with the state-of-the-art DCRA policy. This is achieved without creating checkpoints, maintaining per-register counters of pending consumers, performing tag rebroadcasts, register remappings, and/or additional associative searches.", acknowledgement = ack-nhfb, articleno = "13", keywords = "register file; Simultaneous multithreading", } @Article{Mehrara:2008:ESP, author = "Mojtaba Mehrara and Todd Austin", title = "Exploiting selective placement for low-cost memory protection", journal = j-TACO, volume = "5", number = "3", pages = "14:1--14:??", month = nov, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1455650.1455653", ISSN = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "http://portal.acm.org/", abstract = "Many embedded processing applications, such as those found in the automotive or medical field, require hardware designs that are at the same time low cost and reliable. Traditionally, reliable memory systems have been implemented using coded storage techniques, such as ECC. While these designs can effectively detect and correct memory faults such as transient errors and single-bit defects, their use bears a significant cost overhead. In this article, we propose a novel partial memory protection scheme that provides high-coverage fault protection for program code and data, but with much lower cost than traditional approaches. Our approach profiles program code and data usage to assess which program elements are most critical to maintaining program correctness. Critical code and variables are then placed into a limited protected storage resources. To ensure high coverage of program elements, our placement technique considers all program components simultaneously, including code, global variables, stack frames, and heap variables. The fault coverage of our approach is gauged using Monte Carlo fault-injection experiments, which confirm that our technique provides high levels of fault protection (99\% coverage) with limited memory protection resources (36\% protected area).", acknowledgement = ack-nhfb, articleno = "14", keywords = "fault-tolerant design; memory system design; Partial memory protection; selective placement; transient faults", } @Article{Vandierendonck:2008:SRA, author = "Hans Vandierendonck and Andr{\'e} Seznec", title = "Speculative return address stack management revisited", journal = j-TACO, volume = "5", number = "3", pages = "15:1--15:??", month = nov, year = "2008", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1455650.1455654", ISSN = "1544-3566", bibdate = "Mon Dec 8 14:28:18 MST 2008", bibsource = "http://portal.acm.org/", abstract = "Branch prediction feeds a speculative execution processor core with instructions. Branch mispredictions are inevitable and have negative effects on performance and energy consumption. With the advent of highly accurate conditional branch predictors, nonconditional branch instructions are gaining importance.\par In this article, we address the prediction of procedure returns. On modern processors, procedure returns are predicted through a return address stack (RAS). The overwhelming majority of the return mispredictions are due to RAS overflows and/or overwriting the top entries of the RAS on a mispredicted path. These sources of misprediction were addressed by previously proposed speculative return address stacks [Jourdan et al. 1996; Skadron et al. 1998]. However, the remaining misprediction rate of these RAS designs is still significant when compared to state-of-the-art conditional predictors.\par We present two low-cost corruption detectors for RAS predictors. They detect RAS overflows and wrong path corruption with 100\% coverage. As a consequence, when such a corruption is detected, another source can be used for predicting the return. On processors featuring a branch target buffer (BTB), this BTB can be used as a free backup predictor for predicting returns when corruption is detected.\par Our experiments show that our proposal can be used to improve the behavior of all previously proposed speculative RASs. For instance, without any specific management of the speculative states on the RAS, an 8-entry BTB-backed up RAS achieves the same performance level as a state-of-the-art, but complex, 64-entry self-checkpointing RAS [Jourdan et al. 1996]. Therefore, our proposal can be used either to improve the performance of the processor or to reduce its hardware complexity.", acknowledgement = ack-nhfb, articleno = "15", keywords = "back-up predictor; corruption detection; Return address prediction", } @Article{Chhabra:2009:MSP, author = "Siddhartha Chhabra and Brian Rogers and Yan Solihin and Milos Prvulovic", title = "Making secure processors {OS}- and performance-friendly", journal = j-TACO, volume = "5", number = "4", pages = "16:1--16:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1498690.1498691", ISSN = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "In today's digital world, computer security issues have become increasingly important. In particular, researchers have proposed designs for secure processors that utilize hardware-based memory encryption and integrity verification to protect the privacy and integrity of computation even from sophisticated physical attacks. However, currently proposed schemes remain hampered by problems that make them impractical for use in today's computer systems: lack of virtual memory and Inter-Process Communication support as well as excessive storage and performance overheads. In this article, we propose (1) address independent seed encryption (AISE), a counter-mode-based memory encryption scheme using a novel seed composition, and (2) bonsai Merkle trees (BMT), a novel Merkle tree-based memory integrity verification technique, to eliminate these system and performance issues associated with prior counter-mode memory encryption and Merkle tree integrity verification schemes. We present both a qualitative discussion and a quantitative analysis to illustrate the advantages of our techniques over previously proposed approaches in terms of complexity, feasibility, performance, and storage. Our results show that AISE+BMT reduces the overhead of prior memory encryption and integrity verification schemes from 12\% to 2\% on average for single-threaded benchmarks on uniprocessor systems, and from 15\% to 4\% for coscheduled benchmarks on multicore systems while eliminating critical system-level problems.", acknowledgement = ack-nhfb, articleno = "16", keywords = "memory encryption; memory integrity verification; Secure processor architectures; virtualization", } @Article{Jimenez:2009:GNB, author = "Daniel A. Jim{\'e}nez", title = "Generalizing neural branch prediction", journal = j-TACO, volume = "5", number = "4", pages = "17:1--17:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1498690.1498692", ISSN = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Improved branch prediction accuracy is essential to sustaining instruction throughput with today's deep pipelines. Traditional branch predictors exploit correlations between pattern history and branch outcome to predict branches, but there is a stronger and more natural correlation between path history and branch outcome. We explore the potential for exploiting this correlation. We introduce {\em piecewise linear branch prediction}, an idealized branch predictor that develops a set of linear functions, one for each program path to the branch to be predicted, that separate predicted taken from predicted not taken branches. Taken together, all of these linear functions form a piecewise linear decision surface. We present a limit study of this predictor showing its potential to greatly improve predictor accuracy.\par We then introduce a practical implementable branch predictor based on piecewise linear branch prediction. In making our predictor practical, we show how a parameterized version of it unifies the previously distinct concepts of perceptron prediction and path-based neural prediction. Our new branch predictor has implementation costs comparable to current prominent predictors in the literature while significantly improving accuracy. For a deeply pipelined simulated microarchitecture our predictor with a 256-KB hardware budget improves the harmonic mean normalized instructions-per-cycle rate by 8\% over both the original path-based neural predictor and 2Bc-{\em gskew}. The average misprediction rate is decreased by 16\% over the path-based neural predictor and by 22\% over 2Bc-{\em gskew}.", acknowledgement = ack-nhfb, articleno = "17", keywords = "Branch prediction; machine learning", } @Article{Jeon:2009:AAP, author = "Jinseong Jeon and Keoncheol Shin and Hwansoo Han", title = "Abstracting access patterns of dynamic memory using regular expressions", journal = j-TACO, volume = "5", number = "4", pages = "18:1--18:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1498690.1498693", ISSN = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Unless the speed gap between CPU and memory disappears, efficient memory usage remains a decisive factor for performance. To optimize data usage of programs in the presence of the memory hierarchy, we are particularly interested in two compiler techniques: {\em pool allocation\/} and {\em field layout restructuring}. Since foreseeing runtime behaviors of programs at compile time is difficult, most of the previous work relied on profiling. On the contrary, our goal is to develop a fully automatic compiler that statically transforms input codes to use memory efficiently. Noticing that {\em regular expressions}, which denote repetition explicitly, are sufficient for memory access patterns, we describe how to extract memory access patterns as regular expressions in detail. Based on static patterns presented in regular expressions, we apply pool allocation to repeatedly accessed structures and exploit field layout restructuring according to field affinity relations of chosen structures. To make a scalable framework, we devise and apply new abstraction techniques, which build and interpret access patterns for the whole programs in a bottom-up fashion. We implement our analyses and transformations with the CIL compiler. To verify the effect and scalability of our scheme, we examine 17 benchmarks including 2 SPECINT 2000 benchmarks whose source lines of code are larger than 10,000. Our experiments demonstrate that the static layout transformations for dynamic memory can reduce L1D cache misses by 16\% and execution times by 14\% on average.", acknowledgement = ack-nhfb, articleno = "18", keywords = "Access patterns; field affinity; layout transformation; pool allocation; regular expressions", } @Article{Shobaki:2009:OTS, author = "Ghassan Shobaki and Kent Wilken and Mark Heffernan", title = "Optimal trace scheduling using enumeration", journal = j-TACO, volume = "5", number = "4", pages = "19:1--19:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1498690.1498694", ISSN = "1544-3566", bibdate = "Wed Mar 18 21:35:33 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "This article presents the first optimal algorithm for trace scheduling. The trace is a global scheduling region used by compilers to exploit instruction-level parallelism across basic block boundaries. Several heuristic techniques have been proposed for trace scheduling, but the precision of these techniques has not been studied relative to optimality. This article describes a technique for finding provably optimal trace schedules, where optimality is defined in terms of a weighted sum of schedule lengths across all code paths in a trace. The optimal algorithm uses branch-and-bound enumeration to efficiently explore the entire solution space. Experimental evaluation of the algorithm shows that, with a time limit of 1 s per problem, 91\% of the hard trace scheduling problems in the SPEC CPU 2006 Integer Benchmarks are solved optimally. For 58\% of these hard problems, the optimal schedule is improved compared to that produced by a heuristic scheduler with a geometric mean improvement of 3.2\% in weighted schedule length and 18\% in compensation code size.", acknowledgement = ack-nhfb, articleno = "19", keywords = "branch-and-bound enumeration; compiler optimizations; global instruction scheduling; instruction-level parallelism; Instruction scheduling; optimal instruction scheduling; trace scheduling", } @Article{Kulkarni:2009:PEO, author = "Prasad A. Kulkarni and David B. Whalley and Gary S. Tyson and Jack W. Davidson", title = "Practical exhaustive optimization phase order exploration and evaluation", journal = j-TACO, volume = "6", number = "1", pages = "1:1--1:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1509864.1509865", ISSN = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Choosing the most appropriate optimization phase ordering has been a long-standing problem in compiler optimizations. Exhaustive evaluation of all possible orderings of optimization phases for each function is generally dismissed as infeasible for production-quality compilers targeting accepted benchmarks. In this article, we show that it is possible to exhaustively evaluate the optimization phase order space for each function in a reasonable amount of time for most of the functions in our benchmark suite. To achieve this goal, we used various techniques to significantly prune the optimization phase order search space so that it can be inexpensively enumerated in most cases and reduce the number of program simulations required to evaluate program performance for each distinct phase ordering. The techniques described are applicable to other compilers in which it is desirable to find the best phase ordering for most functions in a reasonable amount of time. We also describe some interesting properties of the optimization phase order space, which will prove useful for further studies of related problems in compilers.", acknowledgement = ack-nhfb, articleno = "1", keywords = "exhaustive search; iterative compilation; Phase ordering", } @Article{Hohenauer:2009:SOF, author = "Manuel Hohenauer and Felix Engel and Rainer Leupers and Gerd Ascheid and Heinrich Meyr", title = "A {SIMD} optimization framework for retargetable compilers", journal = j-TACO, volume = "6", number = "1", pages = "2:1--2:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1509864.1509866", ISSN = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Retargetable C compilers are currently widely used to quickly obtain compiler support for new embedded processors and to perform early processor architecture exploration. A partially inherent problem of the retargetable compilation approach, though, is the limited code quality as compared to hand-written compilers or assembly code due to the lack of dedicated optimizations techniques. This problem can be circumvented by designing flexible, retargetable code optimization techniques that apply to a certain range of target architectures. This article focuses on target machines with SIMD instruction support, a common feature in embedded processors for multimedia applications. However, SIMD optimization is known to be a difficult task since SIMD architectures are largely nonuniform, support only a limited set of data types and impose several memory alignment constraints. Additionally, such techniques require complicated loop transformations, which are tailored to the SIMD architecture in order to exhibit the necessary amount of parallelism in the code. Thus, integrating the SIMD optimization {\em and\/} the required loop transformations together in a single retargeting formalism is an ambitious challenge. In this article, we present an efficient and quickly retargetable SIMD code optimization framework that is integrated into an industrial retargetable C compiler. Experimental results for different processors demonstrate that the proposed technique applies to real-life target machines and that it produces code quality improvements close to the theoretical limit.", acknowledgement = ack-nhfb, articleno = "2", keywords = "ASIP; retargetable compilers; SIMD; subword parallelism; vectorization", } @Article{Eyerman:2009:MLP, author = "Stijn Eyerman and Lieven Eeckhout", title = "Memory-level parallelism aware fetch policies for simultaneous multithreading processors", journal = j-TACO, volume = "6", number = "1", pages = "3:1--3:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1509864.1509867", ISSN = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "A thread executing on a simultaneous multithreading (SMT) processor that experiences a long-latency load will eventually stall while holding execution resources. Existing long-latency load aware SMT fetch policies limit the amount of resources allocated by a stalled thread by identifying long-latency loads and preventing the thread from fetching more instructions --- and in some implementations, instructions beyond the long-latency load are flushed to release allocated resources.\par This article proposes an SMT fetch policy that takes into account the available memory-level parallelism (MLP) in a thread. The key idea proposed in this article is that in case of an isolated long-latency load (i.e., there is no MLP), the thread should be prevented from allocating additional resources. However, in case multiple independent long-latency loads overlap (i.e., there is MLP), the thread should allocate as many resources as needed in order to fully expose the available MLP. MLP-aware fetch policies achieve better performance for MLP-intensive threads on SMT processors, leading to higher overall system throughput and shorter average turnaround time than previously proposed fetch policies.", acknowledgement = ack-nhfb, articleno = "3", keywords = "Fetch Policy; Memory-Level Parallelism (MLP); Simultaneous Multithreading (SMT)", } @Article{Strozek:2009:EAE, author = "Lukasz Strozek and David Brooks", title = "Energy- and area-efficient architectures through application clustering and architectural heterogeneity", journal = j-TACO, volume = "6", number = "1", pages = "4:1--4:??", month = mar, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1509864.1509868", ISSN = "1544-3566", bibdate = "Thu May 7 14:55:25 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Customizing architectures for particular applications is a promising approach to yield highly energy-efficient designs for embedded systems. This work explores the benefits of architectural customization for a class of embedded architectures typically used in energy- and area-constrained application domains, such as sensor nodes and multimedia processing. We implement a process flow that performs an automatic synthesis and evaluation of the different architectures based on runtime profiles of applications and determines an efficient architecture, with consideration for both energy and area constraints. An expressive architectural model, used by our engine, is introduced that takes advantage of efficient opcode allocation, several memory addressing modes, and operand types. By profiling embedded benchmarks from a variety of sensor and multimedia applications, we show that the energy savings resulting from various architectural optimizations relative to the base architectures (e.g., MIPS and MSP430) are significant and can reach 50\%, depending on the application. We then identify the set of architectures that achieves near-optimal savings for a group of applications. Finally, we propose the use of heterogeneous ISA processors implementing those architectures as a solution to capitalize on energy savings provided by application customization while executing a range of applications efficiently.", acknowledgement = ack-nhfb, articleno = "4", keywords = "Efficient custom architectures; heterogeneous ISA processors", } @Article{Venkataramani:2009:MAM, author = "Guru Venkataramani and Ioannis Doudalis and Yan Solihin and Milos Prvulovic", title = "{MemTracker}: {An} accelerator for memory debugging and monitoring", journal = j-TACO, volume = "6", number = "2", pages = "5:1--5:??", month = jun, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1543753.1543754", ISSN = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Memory bugs are a broad class of bugs that is becoming increasingly common with increasing software complexity, and many of these bugs are also security vulnerabilities. Existing software and hardware approaches for finding and identifying memory bugs have a number of drawbacks including considerable performance overheads, target only a specific type of bug, implementation cost, and inefficient use of computational resources.\par This article describes MemTracker, a new hardware support mechanism that can be configured to perform different kinds of memory access monitoring tasks. MemTracker associates each word of data in memory with a few bits of state, and uses a programmable state transition table to react to different events that can affect this state. The number of state bits per word, the events to which MemTracker reacts, and the transition table are all fully programmable. MemTracker's rich set of states, events, and transitions can be used to implement different monitoring and debugging checkers with minimal performance overheads, even when frequent state updates are needed. To evaluate MemTracker, we map three different checkers onto it, as well as a checker that combines all three. For the most demanding (combined) checker with 8 bits state per memory word, we observe performance overheads of only around 3\%, on average, and 14.5\% worst-case across different benchmark suites. Such low overheads allow continuous (always-on) use of MemTracker-enabled checkers, even in production runs.", acknowledgement = ack-nhfb, articleno = "5", keywords = "Accelerator; debugging; memory access monitoring", } @Article{Gabor:2009:SLA, author = "Ron Gabor and Avi Mendelson and Shlomo Weiss", title = "Service level agreement for multithreaded processors", journal = j-TACO, volume = "6", number = "2", pages = "6:1--6:??", month = jun, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1543753.1543755", ISSN = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Multithreading is widely used to increase processor throughput. As the number of shared resources increase, managing them while guaranteeing predicted performance becomes a major problem. Attempts have been made in previous work to ease this via different fairness mechanisms. In this article, we present a new approach to control the resource allocation and sharing via a service level agreement (SLA)-based mechanism; that is, via an agreement in which multithreaded processors guarantee a minimal level of service to the running threads. We introduce a new metric, {\em C\/}$_{SLA}$, for conformance to SLA in multithreaded processors and show that controlling resources using with SLA allows for higher gains than are achievable by previously suggested fairness techniques. It also permits improving one metric (e.g., power) while maintaining SLA in another (e.g., performance). We compare SLA enforcement to schemes based on other fairness metrics, which are mostly targeted at equalizing execution parameters. We show that using SLA rather than fairness based algorithms provides a range of acceptable execution points from which we can select the point that best fits our optimization target, such as maximizing the weighted speedup (sum of the speedups of the individual threads) or reducing power. We demonstrate the effectiveness of the new SLA approach using switch-on-event (coarse-grained) multithreading. Our weighted speedup improvement scheme successfully enforces SLA while improving the weighted speedup by an average of 10\% for unbalanced threads. This result is significant when compared with performance losses that may be incurred by fairness enforcement methods. When optimizing for power reduction in unbalanced threads SLA enforcement reduces the power by an average of 15\%. SLA may be complemented by other power reduction methods to achieve further power savings {\em and\/} maintain the same service level for the threads. We also demonstrate differentiated SLA, where weighted speedup is maximized while each thread may have a different throughput constraint.", acknowledgement = ack-nhfb, articleno = "6", keywords = "fairness; performance; power; Service level agreement; throughput", } @Article{Fung:2009:DWF, author = "Wilson W. L. Fung and Ivan Sham and George Yuan and Tor M. Aamodt", title = "Dynamic warp formation: {Efficient MIMD} control flow on {SIMD} graphics hardware", journal = j-TACO, volume = "6", number = "2", pages = "7:1--7:??", month = jun, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1543753.1543756", ISSN = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Recent advances in graphics processing units (GPUs) have resulted in massively parallel hardware that is easily programmable and widely available in today's desktop and notebook computer systems. GPUs typically use single-instruction, multiple-data (SIMD) pipelines to achieve high performance with minimal overhead for control hardware. Scalar threads running the same computing kernel are grouped together into SIMD batches, sometimes referred to as warps. While SIMD is ideally suited for simple programs, recent GPUs include control flow instructions in the GPU instruction set architecture and programs using these instructions may experience reduced performance due to the way branch execution is supported in hardware. One solution is to add a stack to allow different SIMD processing elements to execute distinct program paths after a branch instruction. The occurrence of diverging branch outcomes for different processing elements significantly degrades performance using this approach. In this article, we propose dynamic warp formation and scheduling, a mechanism for more efficient SIMD branch execution on GPUs. It dynamically regroups threads into new warps on the fly following the occurrence of diverging branch outcomes. We show that a realistic hardware implementation of this mechanism improves performance by 13\%, on average, with 256 threads per core, 24\% with 512 threads, and 47\% with 768 threads for an estimated area increase of 8\%.", acknowledgement = ack-nhfb, articleno = "7", keywords = "control flow; fine-grained multithreading; GPU; SIMD", } @Article{Koh:2009:TPV, author = "Cheng-Kok Koh and Weng-Fai Wong and Yiran Chen and Hai Li", title = "Tolerating process variations in large, set-associative caches: {The} buddy cache", journal = j-TACO, volume = "6", number = "2", pages = "8:1--8:??", month = jun, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1543753.1543757", ISSN = "1544-3566", bibdate = "Thu Jul 2 12:32:04 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "One important trend in today's microprocessor architectures is the increase in size of the processor caches. These caches also tend to be set associative. As technology scales, process variations are expected to increase the fault rates of the SRAM cells that compose such caches. As an important component of the processor, the parametric yield of SRAM cells is crucial to the overall performance and yield of the microchip. In this article, we propose a microarchitectural solution, called the buddy cache that permits large, set-associative caches to tolerate faults in SRAM cells due to process variations. In essence, instead of disabling a faulty cache block in a set (as is the current practice), it is paired with another faulty cache block in the same set --- the buddy. Although both cache blocks are faulty, if the faults of the two blocks do not overlap, then instead of losing two blocks, buddying will yield a functional block from the nonfaulty portions of the two blocks. We found that with buddying, caches can better mitigate the negative impacts of process variations on performance and yield, gracefully downgrading performance as opposed to catastrophic failure. We will describe the details of the buddy cache and give insights as to why it is both more performance and yield resilient to faults.", acknowledgement = ack-nhfb, articleno = "8", keywords = "caches; fault recovery; memory structures; Processor architectures", } @Article{Li:2009:CDS, author = "Lian Li and Hui Feng and Jingling Xue", title = "Compiler-directed scratchpad memory management via graph coloring", journal = j-TACO, volume = "6", number = "3", pages = "9:1--9:??", month = sep, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1582710.1582711", ISSN = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Scratchpad memory (SPM), a fast on-chip SRAM managed by software, is widely used in embedded systems. This article introduces a general-purpose compiler approach, called memory coloring, to assign static data aggregates, such as arrays and structs, in a program to an SPM. The novelty of this approach lies in partitioning the SPM into a pseudo--register file (with interchangeable and aliased registers), splitting the live ranges of data aggregates to create potential data transfer statements between SPM and off-chip memory, and finally, adapting an existing graph coloring algorithm for register allocation to assign the data aggregates to the pseudo--register file. Our experimental results using a set of 10 C benchmarks from MediaBench and MiBench show that our methodology is capable of managing SPMs efficiently and effectively for large embedded applications. In addition, our SPM allocator can obtain close to optimal solutions when evaluated and compared against an existing heuristics-based SPM allocator and an ILP-based SPM allocator.", acknowledgement = ack-nhfb, articleno = "9", keywords = "graph coloring; live range splitting; memory allocation; memory coloring; register coalescing; Scratchpad memory; software-managed cache", } @Article{Golander:2009:CAR, author = "Amit Golander and Shlomo Weiss", title = "Checkpoint allocation and release", journal = j-TACO, volume = "6", number = "3", pages = "10:1--10:??", month = sep, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1582710.1582712", ISSN = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Out-of-order speculative processors need a bookkeeping method to recover from incorrect speculation. In recent years, several microarchitectures that employ checkpoints have been proposed, either extending the reorder buffer or entirely replacing it. This work presents an in-dept-study of checkpointing in checkpoint-based microarchitectures, from the desired content of a checkpoint, via implementation trade-offs, and to checkpoint allocation and release policies. A major contribution of the article is a novel adaptive checkpoint allocation policy that outperforms known policies. The adaptive policy controls checkpoint allocation according to dynamic events, such as second-level cache misses and rollback history. It achieves 6.8\% and 2.2\% speedup for the integer and floating point benchmarks, respectively, and does not require a branch confidence estimator. The results show that the proposed adaptive policy achieves most of the potential of an oracle policy whose performance improvement is 9.8\% and 3.9\% for the integer and floating point benchmarks, respectively. We exploit known techniques for saving leakage power by adapting and applying them to checkpoint-based microarchitectures. The proposed applications combine to reduce the leakage power of the register file to about one half of its original value.", acknowledgement = ack-nhfb, articleno = "10", keywords = "Checkpoint; early register release; leakage; misprediction; out-of-order execution; rollback", } @Article{Xu:2009:TXP, author = "Weifeng Xu and Russell Tessier", title = "{Tetris-XL}: {A} performance-driven spill reduction technique for embedded {VLIW} processors", journal = j-TACO, volume = "6", number = "3", pages = "11:1--11:??", month = sep, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1582710.1582713", ISSN = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "As technology has advanced, the application space of Very Long Instruction Word (VLIW) processors has grown to include a variety of embedded platforms. Due to cost and power consumption constraints, many embedded VLIW processors contain limited resources, including registers. As a result, a VLIW compiler that maximizes instruction level parallelism (ILP) without considering register constraints may generate excessive register spills, leading to reduced overall system performance. To address this issue, this article presents a new spill reduction technique that improves VLIW runtime performance by reordering operations prior to register allocation and instruction scheduling. Unlike earlier algorithms, our approach explicitly considers both register reduction and data dependency in performing operation reordering. Data dependency control limits unexpected schedule length increases during subsequent instruction scheduling. Our technique has been evaluated using Trimaran, an academic VLIW compiler, and evaluated using a set of embedded systems benchmarks. Experimental results show that, on average, this technique improves VLIW performance by 10\% for VLIW processors with 32 registers and 8 functional units compared with previous spill reduction techniques. Limited improvement is seen versus prior approaches for VLIW processors with 64 registers and 8 functional units.", acknowledgement = ack-nhfb, articleno = "11", keywords = "instruction level parallelism; Register pressure; Very Long Instruction Word (VLIW) processor", } @Article{Jones:2009:ELE, author = "Timothy M. Jones and Michael F. P. O'Boyle and Jaume Abella and Antonio Gonz{\'a}lez and O{\u{g}}uz Ergin", title = "Exploring the limits of early register release: {Exploiting} compiler analysis", journal = j-TACO, volume = "6", number = "3", pages = "12:1--12:??", month = sep, year = "2009", CODEN = "????", DOI = "http://doi.acm.org/10.1145/1582710.1582714", ISSN = "1544-3566", bibdate = "Thu Oct 1 09:20:47 MDT 2009", bibsource = "http://portal.acm.org/", abstract = "Register pressure in modern superscalar processors can be reduced by releasing registers early and by copying their contents to cheap back-up storage. This article quantifies the potential benefits of register occupancy reduction and shows that existing hardware-based schemes typically achieve only a small fraction of this potential. This is because they are unable to accurately determine the last use of a register and must wait until the redefining instruction enters the pipeline. On the other hand, compilers have a global view of the program and, using simple dataflow analysis, can determine the last use. This article evaluates the extent to which compiler analysis can aid early releasing, explores the design space, and introduces commit and issue-based early releasing schemes, quantifying their benefits. Using simple compiler analysis and microarchitecture changes, we achieve 70\% of the potential register file occupancy reduction. By adding more hardware support, we can increase this to 94\%. Our schemes are compared to state-of-the-art approaches for varying register file sizes and are shown to outperform these existing techniques.", acknowledgement = ack-nhfb, articleno = "12", keywords = "compiler; energy efficiency; Low-power design; microarchitecture; register file", }